From c2ab706524e6a319ed94e4e75f9049e1e7db17db Mon Sep 17 00:00:00 2001
From: jrossthomson
Date: Thu, 12 Sep 2024 17:53:25 -0400
Subject: [PATCH 001/102] Added documentation on cloud-ops-agent installation and stackdriver removal

---
 modules/scripts/startup-script/README.md | 29 ++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md
index dbfb3f8e4c..b9cae7fdee 100644
--- a/modules/scripts/startup-script/README.md
+++ b/modules/scripts/startup-script/README.md
@@ -141,6 +141,8 @@ better performance under some HPC workloads.
While official documentation recommends using the _Cloud Ops Agent_, it is
recommended to use `install_stackdriver_agent` when performance is important.
+#### Stackdriver Agent Installation
+
If an image or machine already has Cloud Ops Agent installed and you would
like to instead use the Stackdriver Agent, the following script will remove the
Cloud Ops Agent and install the Stackdriver Agent.
@@ -160,6 +162,33 @@ curl -sSO https://dl.google.com/cloudagents/add-logging-agent-repo.sh
sudo bash add-logging-agent-repo.sh --also-install
sudo service stackdriver-agent start
```
+#### Cloud Ops Agent Installation
+
+If an image or machine already has the Stackdriver Agent installed and you would
+like to instead use the Cloud Ops Agent, the following script will remove the
+Stackdriver Agent and install the Cloud Ops Agent.
+
+```bash
+# Uninstall Stackdriver Agent
+
+sudo systemctl stop stackdriver-agent.service
+sudo systemctl disable stackdriver-agent.service
+curl -sSO https://dl.google.com/cloudagents/add-monitoring-agent-repo.sh
+sudo dpkg --configure -a
+sudo bash add-monitoring-agent-repo.sh --uninstall
+sudo bash add-monitoring-agent-repo.sh --remove-repo
+
+# Install ops-agent
+
+curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
+sudo bash add-google-cloud-ops-agent-repo.sh --also-install
+sudo service google-cloud-ops-agent start
+```
+
+As a reminder, this script should be part of a startup script that runs on all
+compute nodes via the `compute_startup_script` setting on the controller.
+
+#### Testing Installation

You can test if one of the agents is running using the following commands:

From 71c5b497af0dfc4f2594eb638e219d62b1a63a5e Mon Sep 17 00:00:00 2001
From: Ivan Orlov
Date: Thu, 12 Sep 2024 23:20:49 +0000
Subject: [PATCH 002/102] SlurmGCP.
Refactor reservation fetching logic --- .../modules/slurm_files/scripts/resume.py | 31 ++++------- .../modules/slurm_files/scripts/util.py | 51 ++++++++++++------- 2 files changed, 42 insertions(+), 40 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 7856a5cada..4426d402a4 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -73,43 +73,30 @@ def instance_properties(nodeset, model, placement_group, labels=None): props.disks = template_info.disks if placement_group: - props.scheduling = { - "onHostMaintenance": "TERMINATE", - } + props.scheduling.onHostMaintenance = "TERMINATE" props.resourcePolicies = [placement_group] - if nodeset.reservation_name: - reservation_name = nodeset.reservation_name - - zones = list(nodeset.zone_policy_allow or []) - assert len(zones) == 1, "Only single zone is supported if using a reservation" - - reservation = lookup().reservation(reservation_name, zones[0]) - + if reservation := lookup().nodeset_reservation(nodeset): props.reservationAffinity = { "consumeReservationType": "SPECIFIC_RESERVATION", "key": f"compute.{util.universe_domain()}/reservation-name", - "values": [reservation_name], + "values": [reservation.bulk_insert_name], } - policies = util.reservation_resource_policies(reservation) - if policies: - props.scheduling = { - "onHostMaintenance": "TERMINATE", - } - props.resourcePolicies = policies + if reservation.policies: + props.scheduling.onHostMaintenance = "TERMINATE" + props.resourcePolicies = reservation.policies log.info( - f"reservation {reservation_name} is being used with policies {props.resourcePolicies}" + f"reservation {reservation.bulk_insert_name} is being used with policies {props.resourcePolicies}" ) else: props.resourcePolicies = [] log.info( - f"reservation {reservation_name} is being used without any policies" + f"reservation {reservation.bulk_insert_name} is being used without any policies" ) if nodeset.maintenance_interval: - props.scheduling = props.scheduling or {} - props.scheduling["maintenanceInterval"] = nodeset.maintenance_interval + props.scheduling.maintenanceInterval = nodeset.maintenance_interval # Override with properties explicit specified in the nodeset props.update(nodeset.get("instance_properties") or {}) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index cb17500d90..eaf455e8dd 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -18,6 +18,7 @@ import argparse import base64 import collections +from dataclasses import dataclass import hashlib import inspect import json @@ -346,16 +347,6 @@ def install_custom_scripts(check_hash=False): blob.download_to_file(f) chown_slurm(fullpath, mode=0o755) - -def reservation_resource_policies(reservation): - """ - Inspects reservation object, returns list of resource policies names. 
- Converts policy URLs to names, e.g.: - projects/111111/regions/us-central1/resourcePolicies/zebra -> zebra - """ - return [u.split("/")[-1] for u in reservation.get("resourcePolicies", {}).values()] - - def compute_service(version="beta"): """Make thread-safe compute service handle creates a new Http for each request @@ -1452,6 +1443,13 @@ def delete_node(self, nodename): return True +@dataclass(frozen=True) +class ReservationDetails: + project: str + zone: str + policies: List[str] # names (not URLs) of resource policies + bulk_insert_name: str # name in format suitable for bulk insert (currently identical to user supplied name) + class Lookup: """Wrapper class for cached data access""" @@ -1743,20 +1741,37 @@ def instance(self, instance_name: str) -> Optional[object]: return self.instances().get(instance_name) @lru_cache() - def reservation(self, name: str, zone: str) -> object: + def _get_reservation(self, project: str, zone: str, name: str) -> object: """See https://cloud.google.com/compute/docs/reference/rest/v1/reservations""" + return self.compute.reservations().get( + project=project, zone=zone, reservation=name).execute() + + def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]: + if not nodeset.reservation_name: + return None + + zones = list(nodeset.zone_policy_allow or []) + assert len(zones) == 1, "Only single zone is supported if using a reservation" + zone = zones[0] + try: - _, project, _, short_name = name.split("/") + _, project, _, name = nodeset.reservation_name.split("/") except ValueError: raise ValueError( - f"Invalid reservation name: '{name}', expected format is 'projects/PROJECT/reservations/NAME'" + f"Invalid reservation name: '{nodeset.reservation_name}', expected format is 'projects/PROJECT/reservations/NAME'" ) + + reservation = self._get_reservation(project, zone, name) - return ( - self.compute.reservations() - .get(project=project, zone=zone, reservation=short_name) - .execute() - ) + # Converts policy URLs to names, e.g.: + # projects/111111/regions/us-central1/resourcePolicies/zebra -> zebra + policies = [u.split("/")[-1] for u in reservation.get("resourcePolicies", {}).values()] + + return ReservationDetails( + project=project, + zone=zone, + policies=policies, + bulk_insert_name=nodeset.reservation_name) @lru_cache(maxsize=1) def machine_types(self): From 2f9667f939efcee183a98608d73413e450432936 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 21 Sep 2024 04:53:14 +0000 Subject: [PATCH 003/102] Extract `cty.Value` YAML logic from `Dict`. --- pkg/config/yaml.go | 59 +++++++++++++++++++++++++---------------- pkg/config/yaml_test.go | 19 ++++++++++--- 2 files changed, 52 insertions(+), 26 deletions(-) diff --git a/pkg/config/yaml.go b/pkg/config/yaml.go index c23a7bd6ce..855aa8f3db 100644 --- a/pkg/config/yaml.go +++ b/pkg/config/yaml.go @@ -313,29 +313,9 @@ func (y *YamlValue) unmarshalTuple(n *yaml.Node) error { return nil } -// UnmarshalYAML implements custom YAML unmarshaling. -func (d *Dict) UnmarshalYAML(n *yaml.Node) error { - var v YamlValue - if err := n.Decode(&v); err != nil { - return err - } - ty := v.Unwrap().Type() - if !ty.IsObjectType() { - return nodeToPosErr(n, fmt.Errorf("must be a mapping, got %s", ty.FriendlyName())) - } - - for k, w := range v.Unwrap().AsValueMap() { - if d.m == nil { - d.m = map[string]cty.Value{} - } - d.m[k] = w - } - return nil -} - // MarshalYAML implements custom YAML marshaling. 
-func (d Dict) MarshalYAML() (interface{}, error) { - o, _ := cty.Transform(d.AsObject(), func(p cty.Path, v cty.Value) (cty.Value, error) { +func (y YamlValue) MarshalYAML() (interface{}, error) { + m, err := cty.Transform(y.Unwrap(), func(p cty.Path, v cty.Value) (cty.Value, error) { if v.IsNull() { return v, nil } @@ -358,7 +338,11 @@ func (d Dict) MarshalYAML() (interface{}, error) { return v, nil }) - j := ctyJson.SimpleJSONValue{Value: o} + if err != nil { + return nil, err + } + + j := ctyJson.SimpleJSONValue{Value: m} b, err := j.MarshalJSON() if err != nil { return nil, fmt.Errorf("failed to marshal JSON: %v", err) @@ -371,6 +355,35 @@ func (d Dict) MarshalYAML() (interface{}, error) { return g, nil } +// UnmarshalYAML implements custom YAML unmarshaling. +func (d *Dict) UnmarshalYAML(n *yaml.Node) error { + var vm map[string]YamlValue + if err := n.Decode(&vm); err != nil { + return err + } + + for k, v := range vm { + if d.m == nil { + d.m = map[string]cty.Value{} + } + d.m[k] = v.Unwrap() + } + return nil +} + +// MarshalYAML implements custom YAML marshaling. +func (d Dict) MarshalYAML() (interface{}, error) { + m := map[string]interface{}{} + for k, v := range d.m { + y, err := YamlValue{v}.MarshalYAML() + if err != nil { + return nil, err + } + m[k] = y + } + return m, nil +} + // yaml.v3 errors are either TypeError - collection of error message or single error message. // Parse error messages to extract short error message and position. func parseYamlV3Error(err error) error { diff --git a/pkg/config/yaml_test.go b/pkg/config/yaml_test.go index a6ae3cc6d7..fabad8ca24 100644 --- a/pkg/config/yaml_test.go +++ b/pkg/config/yaml_test.go @@ -218,6 +218,7 @@ func TestDictUnmarshalYAML(t *testing.T) { yml := ` s1: "red" s2: pink +nl: m1: {} m2: m2f1: green @@ -229,6 +230,7 @@ m2: want := Dict{}. With("s1", cty.StringVal("red")). With("s2", cty.StringVal("pink")). + With("nl", cty.NullVal(cty.DynamicPseudoType)). With("m1", cty.EmptyObjectVal). With("m2", cty.ObjectVal(map[string]cty.Value{ "m2f1": cty.StringVal("green"), @@ -259,14 +261,12 @@ func TestDictWrongTypeUnmarshalYAML(t *testing.T) { if err == nil { t.Errorf("expected error, got nil") } - if diff := cmp.Diff(err.Error(), "line 2 column 1: must be a mapping, got number"); diff != "" { - t.Errorf("diff (-want +got):\n%s", diff) - } } func TestDictMarshalYAML(t *testing.T) { d := Dict{}. With("s1", cty.StringVal("red")). + With("nl", cty.NullVal(cty.DynamicPseudoType)). With("m1", cty.EmptyObjectVal). 
With("m2", cty.ObjectVal(map[string]cty.Value{ "m2f1": cty.StringVal("green"), @@ -280,6 +280,7 @@ func TestDictMarshalYAML(t *testing.T) { })) want := map[string]interface{}{ "s1": "red", + "nl": nil, "m1": map[string]interface{}{}, "m2": map[string]interface{}{ "m2f1": "green", @@ -295,6 +296,18 @@ func TestDictMarshalYAML(t *testing.T) { } } +func TestEmptyDictMarshalYAML(t *testing.T) { + d := Dict{} + want := map[string]interface{}{} + got, err := d.MarshalYAML() + if err != nil { + t.Fatalf("failed to marshal: %v", err) + } + if diff := cmp.Diff(want, got); diff != "" { + t.Errorf("diff (-want +got):\n%s", diff) + } +} + func TestYAMLValueMarshalIntAsInt(t *testing.T) { d := Dict{}.With("zebra", cty.NumberIntVal(5)) want := "zebra: 5\n" From 2514f670bddb6a9a940a210e8911d5e6f7866abf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Sat, 21 Sep 2024 12:44:15 +0000 Subject: [PATCH 004/102] Add explicit project references --- .../modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf | 1 + community/modules/network/private-service-access/README.md | 1 + community/modules/network/private-service-access/main.tf | 1 + .../modules/network/private-service-access/variables.tf | 5 +++++ modules/file-system/parallelstore/main.tf | 1 + 5 files changed, 9 insertions(+) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index e9ef538f67..224ca76f80 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -174,6 +174,7 @@ data "google_compute_reservation" "reservation" { data "google_compute_machine_types" "machine_types_by_zone" { for_each = local.zones + project = var.project_id filter = format("name = \"%s\"", var.machine_type) zone = each.value } diff --git a/community/modules/network/private-service-access/README.md b/community/modules/network/private-service-access/README.md index 50fbd42235..52ef2205c5 100644 --- a/community/modules/network/private-service-access/README.md +++ b/community/modules/network/private-service-access/README.md @@ -82,6 +82,7 @@ No modules. | [labels](#input\_labels) | Labels to add to supporting resources. Key-value pairs. | `map(string)` | n/a | yes | | [network\_id](#input\_network\_id) | The ID of the GCE VPC network to configure private service Access.:
`projects//global/networks/`" | `string` | n/a | yes | | [prefix\_length](#input\_prefix\_length) | The prefix length of the IP range allocated for the private service access. | `number` | `16` | no | +| [project\_id](#input\_project\_id) | ID of project in which Private Service Access will be created. | `string` | n/a | yes | ## Outputs diff --git a/community/modules/network/private-service-access/main.tf b/community/modules/network/private-service-access/main.tf index 706fe1cdf7..4bb54821c3 100644 --- a/community/modules/network/private-service-access/main.tf +++ b/community/modules/network/private-service-access/main.tf @@ -26,6 +26,7 @@ resource "random_id" "resource_name_suffix" { resource "google_compute_global_address" "private_ip_alloc" { provider = google-beta name = "global-psconnect-ip-${random_id.resource_name_suffix.hex}" + project = var.project_id purpose = "VPC_PEERING" address_type = "INTERNAL" network = var.network_id diff --git a/community/modules/network/private-service-access/variables.tf b/community/modules/network/private-service-access/variables.tf index e600463e3e..18b73ac2d9 100644 --- a/community/modules/network/private-service-access/variables.tf +++ b/community/modules/network/private-service-access/variables.tf @@ -40,3 +40,8 @@ variable "prefix_length" { type = number default = 16 } + +variable "project_id" { + description = "ID of project in which Private Service Access will be created." + type = string +} diff --git a/modules/file-system/parallelstore/main.tf b/modules/file-system/parallelstore/main.tf index 56a4069342..3de3b94f3a 100644 --- a/modules/file-system/parallelstore/main.tf +++ b/modules/file-system/parallelstore/main.tf @@ -46,6 +46,7 @@ resource "random_id" "resource_name_suffix" { } resource "google_parallelstore_instance" "instance" { + project = var.project_id instance_id = local.id location = var.zone capacity_gib = var.size_gb From 6fd3bc3698d6f13f26959a8c95598ef6ddc31ac2 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 25 Sep 2024 18:26:00 +0000 Subject: [PATCH 005/102] Update reservation for maintenance document for API support details --- .../schedmd-slurm-gcp-v6-controller/README.md | 11 +++++++++++ modules/compute/gke-node-pool/README.md | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index f9f3edc6b2..b9cb9d6d95 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -105,6 +105,17 @@ run the job outside of the maintenance window. srun -n1 -pcompute -t 10:00 ``` +Currently upcoming maintenance notification is supported in ALPHA version of +compute API. You can update the API version from your blueprint, + +```yaml + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller + settings: + endpoint_versions: + compute: "alpha" +``` + ## Placement Max Distance When using diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 1ba0757eee..fcf7414af6 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -29,7 +29,7 @@ can be overridden using the `taints` setting. See more info. ### Local SSD Storage -GKE offers two options for managing locally attached SSDs. +GKE offers two options for managing locally attached SSDs. 
The first, and recommended, option is for GKE to manage the ephemeral storage space on the node, which will then be automatically attached to pods which From c8b0c00b9a7020de768006415196a1f49ccc3f0c Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Mon, 23 Sep 2024 21:57:32 +0000 Subject: [PATCH 006/102] Changed exact number to minimum for additional vpcs in gpu_direct --- modules/compute/gke-node-pool/README.md | 20 ++++++++++---------- modules/compute/gke-node-pool/gpu_direct.tf | 20 ++++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index fcf7414af6..38815c06dd 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -284,7 +284,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | @@ -294,26 +294,26 @@ limitations under the License. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | +| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | -| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | -| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | +| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | +| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | | [total\_max\_nodes](#input\_total\_max\_nodes) | DEPRECATED: Use autoscaling\_total\_max\_nodes. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index e72d85fd3c..774f1ad12b 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -30,8 +30,9 @@ locals { "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/gpudirect-tcpx/nccl-config.yaml", # nccl_configmap "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/nri_device_injector/nri-device-injector.yaml", # nri_plugin ] - updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml") - rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9 + updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml") + rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9 + required_additional_networks = 4 } "a3-megagpu-8g" = { # Manifest to be installed for enabling TCPXO on a3-megagpu-8g machines @@ -39,19 +40,18 @@ locals { "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/gpudirect-tcpxo/nccl-tcpxo-installer.yaml", # nccl_plugin v1.0.4 for tcpxo "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/nri_device_injector/nri-device-injector.yaml", # nri_plugin ] - updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") - rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 + updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") + rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 + required_additional_networks = 8 } } + + min_additional_networks = try(local.gpu_direct_setting[var.machine_type].min_additional_networks, 0) } check "gpu_direct_check_multi_vpc" { assert { - condition = !(var.machine_type == "a3-highgpu-8g" && length(var.additional_networks) != 4) - error_message = "To achieve optimal performance for ${var.machine_type} machine, 4 additional vpc is recommended. You could configure it in the blueprint through modules/network/multivpc with network_count set as 4" - } - assert { - condition = !(var.machine_type == "a3-megagpu-8g" && length(var.additional_networks) != 8) - error_message = "To achieve optimal performance for ${var.machine_type} machine, 8 additional vpc is recommended. You could configure it in the blueprint through modules/network/multivpc with network_count set as 8" + condition = length(var.additional_networks) >= local.min_additional_networks + error_message = "To achieve optimal performance for ${var.machine_type} machine, at least ${local.min_additional_networks} additional vpc is recommended. 
You could configure it in the blueprint through modules/network/multivpc with network_count set as ${local.min_additional_networks}" } } From 9b3a60518b07a84fb4ce8cb6a6423683d3eb3d90 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Mon, 23 Sep 2024 23:59:34 +0000 Subject: [PATCH 007/102] Revert auto-updated doc --- modules/compute/gke-node-pool/README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 38815c06dd..fcf7414af6 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -284,7 +284,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | @@ -294,26 +294,26 @@ limitations under the License. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | +| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | -| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | -| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | +| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | +| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | | [total\_max\_nodes](#input\_total\_max\_nodes) | DEPRECATED: Use autoscaling\_total\_max\_nodes. | `number` | `null` | no | From 7688f49671c066832b9382478eea63048e829cf0 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Thu, 26 Sep 2024 13:17:12 +0000 Subject: [PATCH 008/102] property name fixed --- modules/compute/gke-node-pool/gpu_direct.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index 774f1ad12b..45ab72e6e0 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -46,7 +46,7 @@ locals { } } - min_additional_networks = try(local.gpu_direct_setting[var.machine_type].min_additional_networks, 0) + min_additional_networks = try(local.gpu_direct_setting[var.machine_type].required_additional_networks, 0) } check "gpu_direct_check_multi_vpc" { From c627c559a606b3684df37cc4132643f3eeba484b Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Thu, 26 Sep 2024 13:18:50 +0000 Subject: [PATCH 009/102] object name fixed --- modules/compute/gke-node-pool/gpu_direct.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index 45ab72e6e0..fd266c7754 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -46,7 +46,7 @@ locals { } } - min_additional_networks = try(local.gpu_direct_setting[var.machine_type].required_additional_networks, 0) + min_additional_networks = try(local.gpu_direct_settings[var.machine_type].required_additional_networks, 0) } check "gpu_direct_check_multi_vpc" { From 62b5c803a8f78403e2e4c7b93d21c87ed3060731 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 27 Sep 2024 08:18:15 +0000 Subject: [PATCH 010/102] variable name updated --- modules/compute/gke-node-pool/gpu_direct.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index fd266c7754..23c370edf2 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -32,7 +32,7 @@ locals { ] updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml") rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9 - required_additional_networks = 4 + min_additional_networks = 4 } "a3-megagpu-8g" = { # Manifest to be installed for enabling TCPXO on a3-megagpu-8g machines @@ -42,11 +42,11 @@ locals { ] updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 - required_additional_networks = 8 + min_additional_networks = 8 } } - min_additional_networks = try(local.gpu_direct_settings[var.machine_type].required_additional_networks, 0) + min_additional_networks = try(local.gpu_direct_settings[var.machine_type].min_additional_networks, 0) } check "gpu_direct_check_multi_vpc" { From 362dbb939bbe50a02cfb6565639a1df02570b201 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 27 Sep 2024 10:18:29 +0000 Subject: 
[PATCH 011/102] style modification --- modules/compute/gke-node-pool/gpu_direct.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index 23c370edf2..b22c353f69 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -30,8 +30,8 @@ locals { "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/gpudirect-tcpx/nccl-config.yaml", # nccl_configmap "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/nri_device_injector/nri-device-injector.yaml", # nri_plugin ] - updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml") - rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9 + updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml") + rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9 min_additional_networks = 4 } "a3-megagpu-8g" = { @@ -40,8 +40,8 @@ locals { "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/gpudirect-tcpxo/nccl-tcpxo-installer.yaml", # nccl_plugin v1.0.4 for tcpxo "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/nri_device_injector/nri-device-injector.yaml", # nri_plugin ] - updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") - rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 + updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") + rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 min_additional_networks = 8 } } From 07d04bafa778174bc126352bfea7873006ca4d0d Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Mon, 23 Sep 2024 22:16:13 +0000 Subject: [PATCH 012/102] resource-policy module implemented --- modules/README.md | 2 + .../resource-policy/resource-policy/README.md | 59 +++++++++++++++++++ .../resource-policy/resource-policy/main.tf | 31 ++++++++++ .../resource-policy/metadata.yaml | 19 ++++++ .../resource-policy/outputs.tf | 28 +++++++++ .../resource-policy/variables.tf | 40 +++++++++++++ .../resource-policy/versions.tf | 30 ++++++++++ 7 files changed, 209 insertions(+) create mode 100644 modules/compute/resource-policy/resource-policy/README.md create mode 100644 modules/compute/resource-policy/resource-policy/main.tf create mode 100644 modules/compute/resource-policy/resource-policy/metadata.yaml create mode 100644 modules/compute/resource-policy/resource-policy/outputs.tf create mode 100644 modules/compute/resource-policy/resource-policy/variables.tf create mode 100644 modules/compute/resource-policy/resource-policy/versions.tf diff --git a/modules/README.md b/modules/README.md index d9ba636393..defba11446 100644 --- a/modules/README.md +++ b/modules/README.md @@ -49,6 +49,7 @@ Modules that are still in development and less stable are labeled with the Creates a dynamic nodeset to be used by the [schedmd-slurm-gcp-v6-partition] module and instance template. * **[gke-node-pool]** ![core-badge] ![experimental-badge] : Creates a Kubernetes node pool using GKE. +* **[resource-policy]** ![core-badge] ![experimental-badge] : Create a resource policy for compute engines that can be applied to gke-node-pool's nodes. 
* **[gke-job-template]** ![core-badge] ![experimental-badge] : Creates a Kubernetes job file to be used with a [gke-node-pool]. * **[htcondor-execute-point]** ![community-badge] ![experimental-badge] : @@ -62,6 +63,7 @@ Modules that are still in development and less stable are labeled with the [vm-instance]: compute/vm-instance/README.md [gke-node-pool]: ../modules/compute/gke-node-pool/README.md +[resource-policy]: ../modules/compute/resource-policy/README.md [gke-job-template]: ../modules/compute/gke-job-template/README.md [schedmd-slurm-gcp-v5-partition]: ../community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md [schedmd-slurm-gcp-v5-node-group]: ../community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md diff --git a/modules/compute/resource-policy/resource-policy/README.md b/modules/compute/resource-policy/resource-policy/README.md new file mode 100644 index 0000000000..7f558f9696 --- /dev/null +++ b/modules/compute/resource-policy/resource-policy/README.md @@ -0,0 +1,59 @@ +## Description + +This modules create a resource policy for compute engines. This policy can be passed to a gke-node-pool module to apply the policy on the node-pool's nodes. + +Note: By default, you can't apply compact placement policies with a max distance value to A3 VMs. To request access to this feature, contact your Technical Account Manager (TAM) or the Sales team. + + +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [google-beta](#requirement\_google-beta) | ~> 5.0 | + +## Providers + +| Name | Version | +|------|---------| +| [google-beta](#provider\_google-beta) | ~> 5.0 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [google-beta_google_compute_resource_policy.policy](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_compute_resource_policy) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [group\_placement\_max\_distance](#input\_group\_placement\_max\_distance) | The max distance for group placement policy to use for the node pool's nodes. If set it will add a compact group placement policy.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. | `number` | `0` | no |
+| [name](#input\_name) | The resource policy's name. | `string` | n/a | yes |
+| [project\_id](#input\_project\_id) | The project ID for the resource policy. | `string` | n/a | yes |
+| [region](#input\_region) | The region for the resource policy. | `string` | n/a | yes |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| [placement\_policy](#output\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. | + diff --git a/modules/compute/resource-policy/resource-policy/main.tf b/modules/compute/resource-policy/resource-policy/main.tf new file mode 100644 index 0000000000..5adce37f0f --- /dev/null +++ b/modules/compute/resource-policy/resource-policy/main.tf @@ -0,0 +1,31 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +resource "google_compute_resource_policy" "policy" { + name = var.name + region = var.region + project = var.project_id + provider = google-beta + + dynamic "group_placement_policy" { + for_each = var.group_placement_max_distance > 0 ? [1] : [] + + content { + collocation = "COLLOCATED" + max_distance = var.group_placement_max_distance + } + } +} diff --git a/modules/compute/resource-policy/resource-policy/metadata.yaml b/modules/compute/resource-policy/resource-policy/metadata.yaml new file mode 100644 index 0000000000..4c2f23a8d7 --- /dev/null +++ b/modules/compute/resource-policy/resource-policy/metadata.yaml @@ -0,0 +1,19 @@ +# Copyright 2023 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +spec: + requirements: + services: + - compute.googleapis.com diff --git a/modules/compute/resource-policy/resource-policy/outputs.tf b/modules/compute/resource-policy/resource-policy/outputs.tf new file mode 100644 index 0000000000..78872433d8 --- /dev/null +++ b/modules/compute/resource-policy/resource-policy/outputs.tf @@ -0,0 +1,28 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +output "placement_policy" { + description = <<-EOT + Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy. + It is assumed that the specified policy exists. 
To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement. + Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. + EOT + + value = { + type = var.group_placement_max_distance > 0 ? "COMPACT" : null + name = var.group_placement_max_distance > 0 ? var.name : null + } +} diff --git a/modules/compute/resource-policy/resource-policy/variables.tf b/modules/compute/resource-policy/resource-policy/variables.tf new file mode 100644 index 0000000000..b2841394d1 --- /dev/null +++ b/modules/compute/resource-policy/resource-policy/variables.tf @@ -0,0 +1,40 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + description = "The project ID for the resource policy." + type = string +} + +variable "region" { + description = "The region for the the resource policy." + type = string +} + +variable "name" { + description = "The resource policy's name." + type = string +} + +variable "group_placement_max_distance" { + description = <<-EOT + The max distance for group placement policy to use for the node pool's nodes. If set it will add a compact group placement policy. + Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. + EOT + + type = number + default = 0 +} diff --git a/modules/compute/resource-policy/resource-policy/versions.tf b/modules/compute/resource-policy/resource-policy/versions.tf new file mode 100644 index 0000000000..4b7b6158c9 --- /dev/null +++ b/modules/compute/resource-policy/resource-policy/versions.tf @@ -0,0 +1,30 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +terraform { + required_providers { + google-beta = { + source = "hashicorp/google-beta" + version = "~> 5.0" + } + } + + provider_meta "google" { + module_name = "blueprints/terraform/hpc-toolkit:resource-policy/v1.37.2" + } + + required_version = ">= 1.3" +} From 688e8ee697afdb21189cf5fdcab7175d6596e8a0 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 24 Sep 2024 00:22:33 +0000 Subject: [PATCH 013/102] fix doc br tags --- modules/compute/resource-policy/resource-policy/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/compute/resource-policy/resource-policy/README.md b/modules/compute/resource-policy/resource-policy/README.md index 7f558f9696..1a4bf79823 100644 --- a/modules/compute/resource-policy/resource-policy/README.md +++ b/modules/compute/resource-policy/resource-policy/README.md @@ -46,7 +46,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [group\_placement\_max\_distance](#input\_group\_placement\_max\_distance) | The max distance for group placement policy to use for the node pool's nodes. If set it will add a compact group placement policy.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. | `number` | `0` | no | +| [group\_placement\_max\_distance](#input\_group\_placement\_max\_distance) | The max distance for group placement policy to use for the node pool's nodes. If set it will add a compact group placement policy.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. | `number` | `0` | no | | [name](#input\_name) | The resource policy's name. | `string` | n/a | yes | | [project\_id](#input\_project\_id) | The project ID for the resource policy. | `string` | n/a | yes | | [region](#input\_region) | The region for the the resource policy. | `string` | n/a | yes | @@ -55,5 +55,5 @@ No modules. | Name | Description | |------|-------------| -| [placement\_policy](#output\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. | +| [placement\_policy](#output\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. | From 7759b51e97171c6d555466ac5116056b857f873b Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Thu, 26 Sep 2024 14:05:55 +0000 Subject: [PATCH 014/102] module doc updated --- .../resource-policy/resource-policy/README.md | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/modules/compute/resource-policy/resource-policy/README.md b/modules/compute/resource-policy/resource-policy/README.md index 1a4bf79823..e1c5222adb 100644 --- a/modules/compute/resource-policy/resource-policy/README.md +++ b/modules/compute/resource-policy/resource-policy/README.md @@ -1,8 +1,28 @@ ## Description -This modules create a resource policy for compute engines. This policy can be passed to a gke-node-pool module to apply the policy on the node-pool's nodes. +This modules create a [resource policy for compute engines](https://cloud.google.com/compute/docs/instances/placement-policies-overview). This policy can be passed to a gke-node-pool module to apply the policy on the node-pool's nodes. + +Note: By default, you can't apply compact placement policies with a max distance value to A3 VMs. To request access to this feature, contact your [Technical Account Manager (TAM)](https://cloud.google.com/tam) or the [Sales team](https://cloud.google.com/contact). + +### Example + +The following example creates a group placement resource policy and applies it to a gke-node-pool. + +```yaml + - id: group_placement_1 + source: modules/compute/resource-policy + settings: + name: gp-np-1 + group_placement_max_distance: 2 + + - id: node_pool_1 + source: modules/compute/gke-node-pool + use: [group_placement_1] + settings: + machine_type: e2-standard-8 + outputs: [instructions] +``` -Note: By default, you can't apply compact placement policies with a max distance value to A3 VMs. To request access to this feature, contact your Technical Account Manager (TAM) or the Sales team. Copyright 2024 Google LLC From a6701560a7efac5797c933714ffd3ee63c5aca69 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Thu, 26 Sep 2024 14:08:15 +0000 Subject: [PATCH 015/102] minor doc updated --- modules/compute/resource-policy/resource-policy/README.md | 2 +- modules/compute/resource-policy/resource-policy/outputs.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/compute/resource-policy/resource-policy/README.md b/modules/compute/resource-policy/resource-policy/README.md index e1c5222adb..d01fc6f944 100644 --- a/modules/compute/resource-policy/resource-policy/README.md +++ b/modules/compute/resource-policy/resource-policy/README.md @@ -75,5 +75,5 @@ No modules. | Name | Description | |------|-------------| -| [placement\_policy](#output\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
+| [placement\_policy](#output\_placement\_policy) | Group placement policy to use for placing VMs or GKE nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. | diff --git a/modules/compute/resource-policy/resource-policy/outputs.tf b/modules/compute/resource-policy/resource-policy/outputs.tf index 78872433d8..64e4275c91 100644 --- a/modules/compute/resource-policy/resource-policy/outputs.tf +++ b/modules/compute/resource-policy/resource-policy/outputs.tf @@ -16,7 +16,7 @@ output "placement_policy" { description = <<-EOT - Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy. + Group placement policy to use for placing VMs or GKE nodes placement. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy. It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement. Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. EOT From a1836bbb957880ee48daadab2f2f4084489bd881 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 27 Sep 2024 11:21:11 +0000 Subject: [PATCH 016/102] style fix --- modules/compute/resource-policy/resource-policy/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/compute/resource-policy/resource-policy/README.md b/modules/compute/resource-policy/resource-policy/README.md index d01fc6f944..f3f00e3437 100644 --- a/modules/compute/resource-policy/resource-policy/README.md +++ b/modules/compute/resource-policy/resource-policy/README.md @@ -23,7 +23,6 @@ The following example creates a group placement resource policy and applies it t outputs: [instructions] ``` - Copyright 2024 Google LLC From 85e3ce834297860abf0faeba0231e3a27e4cd2d0 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 27 Sep 2024 11:52:11 +0000 Subject: [PATCH 017/102] Added compatibility check for GPUDirect and GKE version --- modules/compute/gke-node-pool/README.md | 1 + modules/compute/gke-node-pool/gpu_direct.tf | 28 +++++++++++++++++++ modules/compute/gke-node-pool/variables.tf | 5 ++++ modules/scheduler/gke-cluster/README.md | 1 + modules/scheduler/gke-cluster/outputs.tf | 5 ++++ .../pre-existing-gke-cluster/README.md | 1 + .../pre-existing-gke-cluster/outputs.tf | 5 ++++ 7 files changed, 46 insertions(+) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index fcf7414af6..7b1cffbf68 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -294,6 +294,7 @@ limitations under the License. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. 
| `bool` | `true` | no | +| [gke\_master\_version](#input\_gke\_master\_version) | GKE master version | `string` | n/a | yes | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index b22c353f69..4fef57e914 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -33,6 +33,12 @@ locals { updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml") rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9 min_additional_networks = 4 + min_gke_versions = { + "1.27" = "1.27.7-gke.1121000" + "1.28" = "1.28.8-gke.1095000" + "1.29" = "1.29.3-gke.1093000" + "1.30" = "1.30.2-gke.1023000" + } } "a3-megagpu-8g" = { # Manifest to be installed for enabling TCPXO on a3-megagpu-8g machines @@ -43,10 +49,25 @@ locals { updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 min_additional_networks = 8 + min_gke_versions = { + "1.28" = "1.28.9-gke.1250000" + "1.29" = "1.29.4-gke.1542000" + "1.30" = "1.30.4-gke.1129000" + } } } min_additional_networks = try(local.gpu_direct_settings[var.machine_type].min_additional_networks, 0) + + gke_version_regex = "(\\d+\\.\\d+)\\.(\\d+)-gke\\.(\\d+)" # GKE version format: 1.X.Y-gke.Z , regex output: ["1.X" , "Y", "Z"] + + gke_version_parts = regex(local.gke_version_regex, var.gke_master_version) + gke_version_major = local.gke_version_parts[0] + + min_gke_versions = try(local.gpu_direct_setting[var.machine_type].min_gke_versions, null) + min_version = try(contains(keys(local.min_gke_versions), local.gke_version_major), false) ? local.min_gke_versions[local.gke_version_major] : "1.0.0-gke.0" + min_version_parts = regex(local.gke_version_regex, local.min_version) + gke_gpudirect_compatible = local.gke_version_parts[1] > local.min_version_parts[1] || (local.gke_version_parts[1] == local.min_version_parts[1] && local.gke_version_parts[2] >= local.min_version_parts[2]) } check "gpu_direct_check_multi_vpc" { @@ -55,3 +76,10 @@ check "gpu_direct_check_multi_vpc" { error_message = "To achieve optimal performance for ${var.machine_type} machine, at least ${local.min_additional_networks} additional vpc is recommended. You could configure it in the blueprint through modules/network/multivpc with network_count set as ${local.min_additional_networks}" } } + +check "gke_master_version_requirements" { + assert { + condition = local.gke_gpudirect_compatible + error_message = "GPUDirect is not supported on GKE master version ${var.gke_master_version} for ${var.machine_type} machine. 
For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements" + } +} diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index ef1277744f..62160a2448 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -360,3 +360,8 @@ variable "initial_node_count" { type = number default = null } + +variable "gke_master_version" { + description = "GKE master version" + type = string +} diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 3a72e1149b..4548db2fc9 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -194,6 +194,7 @@ limitations under the License. |------|-------------| | [cluster\_id](#output\_cluster\_id) | An identifier for the resource with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster has been created. Needed by community/modules/scripts/kubernetes-operations. | +| [gke\_master\_version](#output\_gke\_master\_version) | GKE cluster's master version. | | [instructions](#output\_instructions) | Instructions on how to connect to the created cluster. | | [k8s\_service\_account\_name](#output\_k8s\_service\_account\_name) | Name of k8s service account. | diff --git a/modules/scheduler/gke-cluster/outputs.tf b/modules/scheduler/gke-cluster/outputs.tf index 53ee068ca2..4daed8ee25 100644 --- a/modules/scheduler/gke-cluster/outputs.tf +++ b/modules/scheduler/gke-cluster/outputs.tf @@ -74,3 +74,8 @@ output "k8s_service_account_name" { description = "Name of k8s service account." value = one(module.workload_identity[*].k8s_service_account_name) } + +output "gke_master_version" { + description = "GKE cluster's master version." + value = google_container_cluster.gke_cluster.master_version +} diff --git a/modules/scheduler/pre-existing-gke-cluster/README.md b/modules/scheduler/pre-existing-gke-cluster/README.md index 519715480d..1f2904d889 100644 --- a/modules/scheduler/pre-existing-gke-cluster/README.md +++ b/modules/scheduler/pre-existing-gke-cluster/README.md @@ -111,4 +111,5 @@ limitations under the License. |------|-------------| | [cluster\_id](#output\_cluster\_id) | An identifier for the gke cluster with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster exists. | +| [gke\_master\_version](#output\_gke\_master\_version) | GKE cluster's master version. | diff --git a/modules/scheduler/pre-existing-gke-cluster/outputs.tf b/modules/scheduler/pre-existing-gke-cluster/outputs.tf index 9bfd571b61..90772d3dae 100644 --- a/modules/scheduler/pre-existing-gke-cluster/outputs.tf +++ b/modules/scheduler/pre-existing-gke-cluster/outputs.tf @@ -26,3 +26,8 @@ output "gke_cluster_exists" { data.google_container_cluster.existing_gke_cluster ] } + +output "gke_master_version" { + description = "GKE cluster's master version." 
+ value = data.google_container_cluster.existing_gke_cluster.master_version +} From 2709bde8b252959f1768e6074c9f1aca2fd68a97 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 27 Sep 2024 13:29:52 +0000 Subject: [PATCH 018/102] gke-topology-scheduler module implemented --- .../compute/gke-topology-scheduler/README.md | 54 ++ .../compute/gke-topology-scheduler/main.tf | 27 + .../manifests/label-nodes-daemon.yaml | 49 ++ .../manifests/schedule-daemon.yaml | 48 ++ .../manifests/service-account.yaml | 47 ++ .../manifests/topology-scheduler-scripts.yaml | 546 ++++++++++++++++++ .../gke-topology-scheduler/variables.tf | 23 + .../gke-topology-scheduler/versions.tf | 21 + 8 files changed, 815 insertions(+) create mode 100644 community/modules/compute/gke-topology-scheduler/README.md create mode 100644 community/modules/compute/gke-topology-scheduler/main.tf create mode 100644 community/modules/compute/gke-topology-scheduler/manifests/label-nodes-daemon.yaml create mode 100644 community/modules/compute/gke-topology-scheduler/manifests/schedule-daemon.yaml create mode 100644 community/modules/compute/gke-topology-scheduler/manifests/service-account.yaml create mode 100644 community/modules/compute/gke-topology-scheduler/manifests/topology-scheduler-scripts.yaml create mode 100644 community/modules/compute/gke-topology-scheduler/variables.tf create mode 100644 community/modules/compute/gke-topology-scheduler/versions.tf diff --git a/community/modules/compute/gke-topology-scheduler/README.md b/community/modules/compute/gke-topology-scheduler/README.md new file mode 100644 index 0000000000..ad4ea32cbd --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/README.md @@ -0,0 +1,54 @@ +## Description + +This module enables topology on a Google Kubernetes Engine cluster. +This is implemented based on sources and instructions explained [here](https://github.com/GoogleCloudPlatform/container-engine-accelerators/tree/master/gpudirect-tcpxo/topology-scheduler). + +## Prerequisites + +For topology awareness to be enabled, a GKE node pool has to be created with +compact placement. Specifically, the `physical_host` attribute +[ref](https://cloud.google.com/compute/docs/instances/use-compact-placement-policies#verify-vm-location) +should be present for each GPU node in the cluster. + +### Example + +The following example installs topology scheduler on a GKE cluster. + +```yaml + - id: topology_aware_scheduler_install + source: community/modules/compute/gke-topology-scheduler + use: [gke_cluster] +``` + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | + +## Providers + +No providers. + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [kubectl\_apply](#module\_kubectl\_apply) | ../../../../modules/management/kubectl-apply | n/a | + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes | +| [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | + +## Outputs + +No outputs. 
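+
+### Example workload (illustrative)
+
+Pods opt in to topology-aware placement by carrying a scheduling gate whose
+name starts with the `gke.io/topology-aware-auto-` prefix that the bundled
+`schedule-daemon.py` watches for. The sketch below is a minimal, assumed
+example and not part of this module: the job name, container image, and GPU
+count are placeholders that should be adapted to the actual workload.
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: topology-test # placeholder name
+spec:
+  completionMode: Indexed # gives each pod the job-completion-index label used for ordering
+  completions: 2
+  parallelism: 2
+  template:
+    spec:
+      schedulingGates:
+      - name: gke.io/topology-aware-auto-topology-test # prefix watched by schedule-daemon.py
+      restartPolicy: Never
+      containers:
+      - name: worker
+        image: busybox # placeholder image
+        command: ["sh", "-c", "echo hello && sleep 30"]
+        resources:
+          limits:
+            nvidia.com/gpu: 8 # placeholder; match the GPUs per node in the pool
+```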
+ diff --git a/community/modules/compute/gke-topology-scheduler/main.tf b/community/modules/compute/gke-topology-scheduler/main.tf new file mode 100644 index 0000000000..677595632b --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/main.tf @@ -0,0 +1,27 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +module "kubectl_apply" { + source = "../../../../modules/management/kubectl-apply" + + cluster_id = var.cluster_id + project_id = var.project_id + + apply_manifests = [ + { source = "${path.module}/manifests/topology-scheduler-scripts.yaml" }, + { source = "${path.module}/manifests/service-account.yaml" }, + { source = "${path.module}/manifests/label-nodes-daemon.yaml" }, + { source = "${path.module}/manifests/schedule-daemon.yaml" } + ] +} diff --git a/community/modules/compute/gke-topology-scheduler/manifests/label-nodes-daemon.yaml b/community/modules/compute/gke-topology-scheduler/manifests/label-nodes-daemon.yaml new file mode 100644 index 0000000000..fe49c607a6 --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/manifests/label-nodes-daemon.yaml @@ -0,0 +1,49 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: label-nodes-daemon + namespace: kube-system +spec: + selector: + matchLabels: + name: label-nodes-daemon + template: + metadata: + labels: + name: label-nodes-daemon + spec: + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + hostNetwork: true + containers: + - name: label-nodes-daemon + image: python:3.9 + command: + - bash + - -c + - | + pip install kubernetes + python -u /scripts/label-nodes-daemon.py + volumeMounts: + - name: scripts-volume + mountPath: /scripts + volumes: + - name: scripts-volume + configMap: + name: topology-scheduler-scripts + serviceAccount: topology-scheduler diff --git a/community/modules/compute/gke-topology-scheduler/manifests/schedule-daemon.yaml b/community/modules/compute/gke-topology-scheduler/manifests/schedule-daemon.yaml new file mode 100644 index 0000000000..b412f936e9 --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/manifests/schedule-daemon.yaml @@ -0,0 +1,48 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: topology-scheduler + labels: + app: topology-scheduler + namespace: kube-system +spec: + replicas: 1 + selector: + matchLabels: + app: topology-scheduler + template: + metadata: + labels: + app: topology-scheduler + spec: + tolerations: + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + effect: "NoSchedule" + containers: + - name: topology-scheduler-container + image: python:3.9 + command: ["/bin/sh", "-c", "pip install google-auth google-api-python-client kubernetes; python /scripts/schedule-daemon.py --ignored-namespace kube-system gmp-public gmp-system"] + volumeMounts: + - name: scripts-volume + mountPath: /scripts + volumes: + - name: scripts-volume + configMap: + name: topology-scheduler-scripts + serviceAccount: topology-scheduler + restartPolicy: Always diff --git a/community/modules/compute/gke-topology-scheduler/manifests/service-account.yaml b/community/modules/compute/gke-topology-scheduler/manifests/service-account.yaml new file mode 100644 index 0000000000..61834ced8f --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/manifests/service-account.yaml @@ -0,0 +1,47 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: topology-scheduler + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: topology-scheduler +rules: +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "watch", "list", "update", "patch"] +- apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "watch", "list"] +- apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch", "update", "patch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: topology-scheduler +subjects: +- kind: ServiceAccount + name: topology-scheduler + namespace: kube-system +roleRef: + kind: ClusterRole + name: topology-scheduler + apiGroup: rbac.authorization.k8s.io diff --git a/community/modules/compute/gke-topology-scheduler/manifests/topology-scheduler-scripts.yaml b/community/modules/compute/gke-topology-scheduler/manifests/topology-scheduler-scripts.yaml new file mode 100644 index 0000000000..96b4a89b34 --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/manifests/topology-scheduler-scripts.yaml @@ -0,0 +1,546 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: topology-scheduler-scripts + namespace: kube-system +data: + schedule-daemon.py: | + #!/usr/bin/env python + + # Copyright 2024 Google Inc. 
All Rights Reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + + import argparse + from itertools import groupby + import time + import kubernetes + import kubernetes.client + from kubernetes.client.rest import ApiException + from kubernetes.utils.quantity import parse_quantity + + + def split_pods_based_on_jobs(pods): + """Splits pending pods into groups based on jobs.""" + return [ + list(job_group) + for _, job_group in groupby(pods, lambda pod: pod.get('job_name')) + ] + + + def sort_jobs_by_time(job): + """Return the key to be used for sorting jobs which is by creation time.""" + # All the pods in the job should have the same creation time. + return job[0].get('creation_time') + + + def pod_sorting_key(pod): + """Returns key to be used for sorting pods. + Given that numbers is often suffixed for multi-node deployments, + here we use a (prefix, number) tuple for the sorting key. + This means "xxx-pod2" should appear before "xxx-pod10" + """ + + if pod['index'] is not None: + return int(pod['index']) + + # if the suffix is a number, extract it + idx = 0 + suffix = "" + name = pod['name'] + while name[-1 - len(suffix)].isdigit(): + suffix = name[-1 - len(suffix)] + suffix + + if suffix != "": + idx = int(suffix) + + return (name[:len(name) - len(suffix)], idx) + + + def node_topology_distance(node1, node2): + node1_key = node_topology_key(node1) + node2_key = node_topology_key(node2) + result = 1000000 + for i in range(len(node1_key)): + if node1_key[i] != node2_key[i]: + return result + result /= 100 + return 0 + + + def node_topology_key(node): + """Builds a key to be used to sort nodes.""" + node_labels = node['node_labels'] + + if ( + 'cloud.google.com/gke-placement-group' in node_labels + and 'topology.gke.io/cluster' in node_labels + and 'topology.gke.io/rack' in node_labels + and 'topology.gke.io/host' in node_labels + ): + return ( + node_labels['cloud.google.com/gke-placement-group'], + node_labels['topology.gke.io/cluster'], + node_labels['topology.gke.io/rack'], + node_labels['topology.gke.io/host'], + ) + + return () + + + def get_pod_used_resources(pod): + """Get the resources used by this pod""" + used_cpu = 0 + used_memory = 0 + used_gpu = 0 + if pod.status is None or pod.status.container_statuses is None: + return used_cpu, used_memory, used_gpu + for container, container_status in zip(pod.spec.containers, pod.status.container_statuses): + if container_status.state.terminated is not None: + # terminated pods don't use resources + continue + requests = container.resources.requests or {} + used_cpu += parse_quantity(requests.get('cpu', 0)) + used_memory += parse_quantity(requests.get('memory', 0)) + used_gpu += int(requests.get('nvidia.com/gpu', 0)) + return used_cpu, used_memory, used_gpu + + + def get_pods_taint_toleration(pods): + """Get the taint tolerations of the pods. + For simplicity, we assume that the pods are homogeneous and + all have the same tolerations. 
+ """ + ts = None + for pod in pods: + tolerations = pod['spec'].tolerations + if ts is None: + ts = tolerations + else: + assert(ts == tolerations) + return ts if ts is not None else [] + + + def find_schedulable_nodes(nodes, pods, tolerated_taints): + """Finds nodes that can be scheduled.""" + nodes_info = {} + + if tolerated_taints is not None: + tolerated_taint_dict = {t.key: t for t in tolerated_taints} + else: + tolerated_taint_dict = {} + + for node in nodes: + node_name = node.metadata.name + node_labels = node.metadata.labels + + if 'cloud.google.com/gke-placement-group' not in node_labels: + print( + f'Skipping node {node_name} because it does not have topology' + ' metadata' + ) + continue + + skip_node = False + if node.spec.taints is not None: + for t in node.spec.taints: + if t.key not in tolerated_taint_dict: + print(f'Skipping node {node_name} because it is tainted with key {t.key}') + skip_node = True + break + else: + tol = tolerated_taint_dict[t.key] + if tol.operator == "Equal" and tol.value != t.value: + skip_node = True + break + + if skip_node: + continue + + allocatable = node.status.allocatable + + used_cpu = 0 + used_memory = 0 + used_gpu = 0 + + for pod in pods: + if pod.spec.node_name == node_name: + cpu, mem, gpu = get_pod_used_resources(pod) + used_cpu += cpu + used_memory += mem + used_gpu += gpu + + free_cpu = parse_quantity(allocatable['cpu']) - used_cpu + free_memory = parse_quantity(allocatable['memory']) - used_memory + free_gpu = int(allocatable.get('nvidia.com/gpu', 0)) - used_gpu + + node_info = { + 'name': node_name, + 'cpu': free_cpu, + 'memory': free_memory, + 'gpu': free_gpu, + 'node_labels': node_labels, + } + nodes_info[node_name] = node_info + + print( + f'Node: {node_name}, CPU: {free_cpu}, Memory: {free_memory}, GPU:' + f' {free_gpu}, Topology: {node_topology_key(node_info)}' + ) + + return nodes_info + + + def find_pod_gates(pods, prefix): + """Finds pods with scheduling gates that starts with the prefix""" + s = set() + for pod in pods: + if pod.spec.scheduling_gates: + for g in pod.spec.scheduling_gates: + if g.name.startswith(prefix): + s.add(g.name) + return s + + + def find_schedulable_pods(pods, gate_name): + """Finds pods that can be scheduled.""" + pods_to_schedule = {} + + for pod in pods: + if pod.spec.scheduling_gates: + gates = pod.spec.scheduling_gates + for gate in gates: + if gate.name == gate_name: + pod_name = pod.metadata.name + pod_namespace = pod.metadata.namespace + + pod_index = None + job_name = None + if pod.metadata.labels is not None: + if ( + 'batch.kubernetes.io/job-completion-index' + in pod.metadata.labels + ): + pod_index = pod.metadata.labels[ + 'batch.kubernetes.io/job-completion-index' + ] + else: + print('Unable to find index in metadata. Can not queue jobs') + + if 'job-name' in pod.metadata.labels: + job_name = pod.metadata.labels['job-name'] + else: + print('Unable to find job_name in metadata. Can not queue jobs') + else: + print('No labels on pod to extract job metadata from.') + + creation_time = None + if pod.metadata.creation_timestamp is not None: + creation_time = pod.metadata.creation_timestamp + else: + print( + 'Unable to find creation_time in metadata. 
Can not queue jobs' + ) + + used_cpu = 0 + used_memory = 0 + used_gpu = 0 + + for container in pod.spec.containers: + requests = container.resources.requests or {} + used_cpu += parse_quantity(requests.get('cpu', 0)) + used_memory += parse_quantity(requests.get('memory', 0)) + used_gpu += int(requests.get('nvidia.com/gpu', 0)) + + pods_to_schedule[pod_name] = { + 'name': pod_name, + 'namespace': pod_namespace, + 'index': pod_index, + 'cpu': used_cpu, + 'memory': used_memory, + 'gpu': used_gpu, + 'node_selector': pod.spec.node_selector, + 'spec': pod.spec, + 'metadata': pod.metadata, + 'job_name': job_name, + 'creation_time': creation_time + } + + print( + f'Found schedulable pod: {pod_namespace}/{pod_name}, CPU:' + f' {used_cpu}, Memory: {used_memory}, GPU: {used_gpu}' + f' Index: {pod_index}' + ) + + return pods_to_schedule + + + def can_schedule(node, pod): + """Checks if a given pod can be scheduled on a given node.""" + node_selector = pod['node_selector'] + node_labels = node['node_labels'] + + if node_selector: + for key, value in node_selector.items(): + if key not in node_labels or node_labels[key] != value: + return False + + return ( + node['cpu'] >= pod['cpu'] + and node['memory'] >= pod['memory'] + and node['gpu'] >= pod['gpu'] + ) + + + def schedule_pod_on_node(v1, pod_name, pod_namespace, node_name, gate_name): + """Schedules a pod on a given node.""" + try: + pod = v1.read_namespaced_pod(pod_name, pod_namespace) + + if any(gate.name == gate_name for gate in pod.spec.scheduling_gates): + new_gates = [ + gate for gate in pod.spec.scheduling_gates if gate.name != gate_name + ] + pod.spec.affinity = { + 'nodeAffinity': { + 'requiredDuringSchedulingIgnoredDuringExecution': { + 'nodeSelectorTerms': [{ + 'matchExpressions': [{ + 'key': 'kubernetes.io/hostname', + 'operator': 'In', + 'values': [node_name], + }] + }] + } + } + } + pod.spec.scheduling_gates = new_gates + + v1.replace_namespaced_pod(pod_name, pod_namespace, pod) + + print(f'Pod {pod_namespace}/{pod_name} scheduled on {node_name}') + except ApiException as e: + print(f'Exception when removing scheduling gate: {e}') + + + def calculate_pods_assignment(sorted_nodes, sorted_pods): + """Calculates the best assignment for pods.""" + assignment = [-i for i in reversed(range(1, len(sorted_pods) + 1))] + best_assignment = [] + minimum_distance = 1000000000 + + while True: + all_ok = True + i = len(assignment) - 1 + while i >= 0 and all_ok: + assignment[i] += 1 + if assignment[i] == len(sorted_nodes): + break + if assignment[i] >= 0 and can_schedule( + sorted_nodes[assignment[i]], sorted_pods[i] + ): + i -= 1 + elif i < len(assignment) - 1 and assignment[i] == assignment[i + 1] - 1: + all_ok = False + if assignment[-1] == len(sorted_nodes): + break + if all_ok: + new_distance = 0 + for i in range(1, len(sorted_pods)): + new_distance += node_topology_distance( + sorted_nodes[assignment[i]], sorted_nodes[assignment[i - 1]] + ) + if new_distance < minimum_distance: + best_assignment = assignment.copy() + minimum_distance = new_distance + + return best_assignment + + + def schedule_pod_with_gate(v1, pods, gate): + pods_to_schedule = find_schedulable_pods(pods, gate) + + nodes = v1.list_node().items + print(f'Pods to schedule: {len(pods_to_schedule)}') + jobs = split_pods_based_on_jobs(pods_to_schedule.values()) + sorted_jobs = sorted(jobs, key=sort_jobs_by_time) + for job in sorted_jobs: + job_name = job[0].get('job_name') + creation_time = job[0].get('creation_time') + print(f'Attempting to schedule job: {job_name} created: 
{creation_time}') + + tolerated_taints = get_pods_taint_toleration(job) + nodes_to_schedule = find_schedulable_nodes(nodes, pods, tolerated_taints) + + sorted_pods = sorted(job, key=pod_sorting_key) + sorted_nodes = sorted(nodes_to_schedule.values(), key=node_topology_key) + + print(f'Nodes to schedule: {len(nodes_to_schedule)}') + + best_assignment = calculate_pods_assignment(sorted_nodes, sorted_pods) + + if not best_assignment: + print( + f'No scheduling for job: {job_name} with gate {gate} has been found.' + ' Skipping job.' + ) + continue + else: + print(f'Assignment found, scheduling {job_name} with {len(jobs)} pods.') + + for i in range(0, len(sorted_pods)): + pod = sorted_pods[i] + node = sorted_nodes[best_assignment[i]] + schedule_pod_on_node( + v1, pod['name'], pod['namespace'], node['name'], gate + ) + + + def run_scheduling_loop(): + """Runs scheduling.""" + parser = argparse.ArgumentParser( + prog='schedule-workload.py') + + parser.add_argument( + '-g', '--gate', + default='gke.io/topology-aware-auto-') # prefix of the schedule gate + parser.add_argument( + '-i', '--interval', + default=1.0) # intervals (in seconds) between scheduling + parser.add_argument( + '--ignored-namespace', + nargs='*', + default=[]) # namespace to search for pods + args = parser.parse_args() + + try: + kubernetes.config.load_incluster_config() + except kubernetes.config.ConfigException: + kubernetes.config.load_kube_config() + v1 = kubernetes.client.CoreV1Api() + + def list_pods(): + # filtering of namespace is not cached as namespaces could be + # created and deleted + namespaces = v1.list_namespace().items + filtered_namespace_names = [] + for n in namespaces: + if n.metadata.name not in args.ignored_namespace: + filtered_namespace_names.append(n.metadata.name) + pods = [] + for n in filtered_namespace_names: + pods += v1.list_namespaced_pod(n).items + return pods + + try: + t0 = time.time() + while True: + interval = time.time() - t0 + if interval < args.interval: + time.sleep(args.interval - interval) + t0 = time.time() + + pods = list_pods() + + gates = find_pod_gates(pods, args.gate) + print(f"Found {len(pods)} pods and {len(gates)} gates") + + if len(gates) == 0: + # No pods to be scheduled + continue + + # sleep for one seconds, assuming that all pods within one group would be + # all visible by then + time.sleep(5.0) + + for g in gates: + print(f"scheduling pods with gate {g}") + # query the pods again after the sleep, just in case not all gated pods + # are returned from previous query + pods = list_pods() + schedule_pod_with_gate(v1, pods, g) + + except ApiException as e: + print(f'Exception when listing Kubernetes nodes or pods: {e}') + + + if __name__ == '__main__': + run_scheduling_loop() + label-nodes-daemon.py: | + #!/usr/bin/env python + + # Copyright 2024 Google Inc. All Rights Reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. 
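+
+    # Summary of this script: each pod of the label-nodes-daemon DaemonSet reads
+    # the node name and the `physical_host` attribute from the GCE metadata
+    # server, splits the attribute into cluster/rack/host, and patches the
+    # topology.gke.io/cluster, topology.gke.io/rack and topology.gke.io/host
+    # labels onto its node every 600 seconds. schedule-daemon.py uses these
+    # labels to sort nodes by physical proximity when placing gated pods.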
+ + import time + + from kubernetes import client + from kubernetes import config + import requests + + + def update_node_labels(kube): + """Updates Kubernetes node labels based on GCE VM metadata.""" + node_name_url = "http://metadata.google.internal/computeMetadata/v1/instance/name" + metadata_url = "http://metadata.google.internal/computeMetadata/v1/instance/attributes/physical_host" + headers = {"Metadata-Flavor": "Google"} + + response = requests.get(node_name_url, headers=headers) + + if response.status_code == 200: + node_name = response.text + else: + print("Node name not found") + return + + response = requests.get(metadata_url, headers=headers) + + if response.status_code == 200: + physical_host = response.text + else: + print("physical host not found") + return + + cluster, rack, host = physical_host.split("/")[1:] + + node_labels = { + "topology.gke.io/cluster": cluster, + "topology.gke.io/rack": rack, + "topology.gke.io/host": host, + } + + kube.patch_node(node_name, {"metadata": {"labels": node_labels}}) + print(f"Updated labels on node {node_name}: {node_labels}") + + + if __name__ == "__main__": + # Kubernetes configuration + config.load_incluster_config() + kube = client.CoreV1Api() + + while True: + print("Starting node update") + # Update node labels + update_node_labels(kube) + time.sleep(600) diff --git a/community/modules/compute/gke-topology-scheduler/variables.tf b/community/modules/compute/gke-topology-scheduler/variables.tf new file mode 100644 index 0000000000..0766091223 --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/variables.tf @@ -0,0 +1,23 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "project_id" { + description = "The project ID to host the cluster in." + type = string +} + +variable "cluster_id" { + description = "projects/{{project}}/locations/{{location}}/clusters/{{cluster}}" + type = string +} diff --git a/community/modules/compute/gke-topology-scheduler/versions.tf b/community/modules/compute/gke-topology-scheduler/versions.tf new file mode 100644 index 0000000000..6c94438518 --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/versions.tf @@ -0,0 +1,21 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +terraform { + required_version = ">= 1.3" + + provider_meta "google" { + module_name = "blueprints/terraform/hpc-toolkit:gke-topology-scheduler/v1.39.0" + } +} From 7595cb0232c75461df2416f58e3ffa9ec07d7dc4 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 27 Sep 2024 14:08:26 +0000 Subject: [PATCH 019/102] metadata file added --- .../compute/gke-topology-scheduler/README.md | 2 +- .../gke-topology-scheduler/metadata.yaml | 19 +++++++++++++++++++ .../gke-topology-scheduler/versions.tf | 4 ---- 3 files changed, 20 insertions(+), 5 deletions(-) create mode 100644 community/modules/compute/gke-topology-scheduler/metadata.yaml diff --git a/community/modules/compute/gke-topology-scheduler/README.md b/community/modules/compute/gke-topology-scheduler/README.md index ad4ea32cbd..5aaa4fca98 100644 --- a/community/modules/compute/gke-topology-scheduler/README.md +++ b/community/modules/compute/gke-topology-scheduler/README.md @@ -15,7 +15,7 @@ should be present for each GPU node in the cluster. The following example installs topology scheduler on a GKE cluster. ```yaml - - id: topology_aware_scheduler_install +- id: topology_aware_scheduler_install source: community/modules/compute/gke-topology-scheduler use: [gke_cluster] ``` diff --git a/community/modules/compute/gke-topology-scheduler/metadata.yaml b/community/modules/compute/gke-topology-scheduler/metadata.yaml new file mode 100644 index 0000000000..17bedb471b --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/metadata.yaml @@ -0,0 +1,19 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +spec: + requirements: + services: + - container.googleapis.com diff --git a/community/modules/compute/gke-topology-scheduler/versions.tf b/community/modules/compute/gke-topology-scheduler/versions.tf index 6c94438518..adcbea8ca2 100644 --- a/community/modules/compute/gke-topology-scheduler/versions.tf +++ b/community/modules/compute/gke-topology-scheduler/versions.tf @@ -14,8 +14,4 @@ terraform { required_version = ">= 1.3" - - provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-topology-scheduler/v1.39.0" - } } From 6ee205bf48ec5a23c62af2349ee15818e2b22046 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Mon, 30 Sep 2024 17:16:28 +0000 Subject: [PATCH 020/102] Ensure enough open files limit for Parallelstore --- modules/file-system/parallelstore/scripts/mount-daos.sh | 3 +++ .../pre-existing-network-storage/scripts/mount-daos.sh | 3 +++ 2 files changed, 6 insertions(+) diff --git a/modules/file-system/parallelstore/scripts/mount-daos.sh b/modules/file-system/parallelstore/scripts/mount-daos.sh index e2500c93a5..2b09f2e6d4 100644 --- a/modules/file-system/parallelstore/scripts/mount-daos.sh +++ b/modules/file-system/parallelstore/scripts/mount-daos.sh @@ -65,6 +65,9 @@ chmod 777 "$local_mount" fuse_config=/etc/fuse.conf sed -i "s/#.*user_allow_other/user_allow_other/g" $fuse_config +# make sure limit of open files is high enough for dfuse (1M of open files) +ulimit -n 1048576 + for i in {1..10}; do # To parse mount_options as --disable-wb-cache --eq-count=8. # shellcheck disable=SC2086 diff --git a/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh b/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh index e2500c93a5..2b09f2e6d4 100644 --- a/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh +++ b/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh @@ -65,6 +65,9 @@ chmod 777 "$local_mount" fuse_config=/etc/fuse.conf sed -i "s/#.*user_allow_other/user_allow_other/g" $fuse_config +# make sure limit of open files is high enough for dfuse (1M of open files) +ulimit -n 1048576 + for i in {1..10}; do # To parse mount_options as --disable-wb-cache --eq-count=8. # shellcheck disable=SC2086 From ed78494d00bfd1561ef25c45a07ae8bdb0b6442a Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Mon, 30 Sep 2024 19:13:42 +0000 Subject: [PATCH 021/102] inlude nccl test instruction in output for sample workload --- modules/compute/gke-node-pool/README.md | 22 +++++++++++----------- modules/compute/gke-node-pool/outputs.tf | 8 ++++++++ 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index fcf7414af6..78c4dd1dd7 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -284,7 +284,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | @@ -294,26 +294,26 @@ limitations under the License. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | -| [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | +| [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | +| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | -| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | -| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | +| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | +| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | | [total\_max\_nodes](#input\_total\_max\_nodes) | DEPRECATED: Use autoscaling\_total\_max\_nodes. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/outputs.tf b/modules/compute/gke-node-pool/outputs.tf index 8be6a2772a..58216e957f 100644 --- a/modules/compute/gke-node-pool/outputs.tf +++ b/modules/compute/gke-node-pool/outputs.tf @@ -80,6 +80,14 @@ locals { You can use the following commands to submit the sample job: kubectl create -f ${abspath(local.gpu_direct_setting.updated_workload_path)} + After submitting the sample job, you can validate the GPU performance by initiating NCCL test included in the sample workload: + NCCL test can be initiated from any one of the sample job Pods and coordinate with the peer Pods: + export POD_NAME=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}' | head -n 1) + export PEER_POD_IPS=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.status.podIP}}{{" "}}{{end}}') + kubectl exec --stdin --tty --container=nccl-test $POD_NAME -- /scripts/allgather.sh $PEER_POD_IPS + Depends on the Msg size used for transmission in the test, the busbw would different a bit. + For a3-highgpu machines, the expected busbw for MsgSize of 8G data should be around 80 GB/s + For a3-megagpu machines, the expected busbw for MsgSize of 8G data should be around 160 GB/s If you would like to enable GPUDirect for your own workload, please follow the below steps: export WORKLOAD_PATH=<> From fcd6eb3348fb6dde411dd3a080eff9bf5e28eb54 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 30 Sep 2024 17:29:58 -0500 Subject: [PATCH 022/102] Move a3-megagpu-8g tests to us-west4-a due to available capacity for testing --- .../machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml | 1 + tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm.yaml | 4 ++-- .../daily-tests/tests/ml-a3-megagpu-slurm-cluster.yml | 3 +-- .../daily-tests/tests/ml-a3-megagpu-slurm-image.yml | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml index 5b39c04792..8d46b10c40 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml @@ -20,6 +20,7 @@ blueprint_name: a3mega-cluster vars: deployment_name: a3mega-cluster a3mega_partition_name: a3mega + a3mega_maintenance_interval: "" enable_placement: false remote_mount_homefs: /nfsshare local_mount_homefs: /home diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm.yaml index e066cbff27..f24fb0ffd5 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm.yaml @@ -70,8 +70,8 @@ steps: cat /persistent_volume/image_name | xargs -L1 gcloud compute images delete --project "${PROJECT_ID}" --quiet } - REGION=australia-southeast1 - ZONE=australia-southeast1-c + REGION=us-west4 + ZONE=us-west4-a trap 'destroy_on_exit' EXIT ./gcluster deploy \ diff --git 
a/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-cluster.yml b/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-cluster.yml index 00cfcf0c76..ef60518e9f 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-cluster.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-cluster.yml @@ -46,6 +46,5 @@ cli_deployment_vars: a3mega_cluster_size: 2 enable_ops_agent: "true" enable_nvidia_dcgm: "true" - a3mega_reservation_name: a3mega-reservation-australia-southeast1-c - a3mega_maintenance_interval: PERIODIC + a3mega_reservation_name: a3mega-reservation-0 final_image_family: "{{ final_image_family }}" diff --git a/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-image.yml b/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-image.yml index 4c5c9175e5..d4c4e31c18 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-image.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-image.yml @@ -24,8 +24,8 @@ delete_image: false cli_deployment_vars: network_name_system: default subnetwork_name_system: default - region: us-west1 - zone: us-west1-a + region: us-west4 + zone: us-west4-a enable_ops_agent: true enable_nvidia_dcgm: true slurm_cluster_name: a3mc{{ build[0:4] }} From ab41e09cba54d3055a29a354629a3a594d582e4b Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 28 Sep 2024 00:58:09 +0000 Subject: [PATCH 023/102] Add clean up for TPUs --- .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/cleanup_tpu/README.md | 79 +++++++++++++++++++ .../modules/cleanup_tpu/main.tf | 32 ++++++++ .../cleanup_tpu/scripts/cleanup_tpu.sh | 63 +++++++++++++++ .../modules/cleanup_tpu/variables.tf | 60 ++++++++++++++ .../modules/cleanup_tpu/versions.tf | 27 +++++++ .../partition.tf | 15 ++-- 7 files changed, 272 insertions(+), 6 deletions(-) create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/README.md create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/main.tf create mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/scripts/cleanup_tpu.sh create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/variables.tf create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/versions.tf diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index b9cb9d6d95..40d24732a6 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -237,7 +237,7 @@ limitations under the License. 
| [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | | [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | -| [nodeset\_tpu\_cleanup](#module\_nodeset\_tpu\_cleanup) | ./modules/cleanup_compute | n/a | +| [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | | [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.7.0 | | [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.7.0 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/README.md new file mode 100644 index 0000000000..61a08c700f --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/README.md @@ -0,0 +1,79 @@ +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [null](#requirement\_null) | >= 3.0 | + +## Providers + +| Name | Version | +|------|---------| +| [null](#provider\_null) | 3.2.3 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [null_resource.script](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of TPU nodes managed by this module, when cluster is destroyed.

*WARNING*: Toggling this off will impact the running workload.
Deployed TPU nodes will be destroyed. | `bool` | n/a | yes | +| [endpoint\_versions](#input\_endpoint\_versions) | Version of the API to use (The compute service is the only API currently supported) |
object({
compute = string
})
| n/a | yes | +| [gcloud\_path\_override](#input\_gcloud\_path\_override) | Directory of the gcloud executable to be used during cleanup | `string` | n/a | yes | +| [nodeset](#input\_nodeset) | Nodeset to cleanup |
object({
nodeset_name = string
zone = string
})
| n/a | yes | +| [project\_id](#input\_project\_id) | Project ID | `string` | n/a | yes | +| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Name of the Slurm cluster | `string` | n/a | yes | +| [universe\_domain](#input\_universe\_domain) | Domain address for alternate API universe | `string` | n/a | yes | + +## Outputs + +No outputs. + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [null](#requirement\_null) | >= 3.0 | + +## Providers + +| Name | Version | +|------|---------| +| [null](#provider\_null) | >= 3.0 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [null_resource.script](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of TPU nodes managed by this module, when cluster is destroyed.

*WARNING*: Toggling this off will impact the running workload.
Deployed TPU nodes will be destroyed. | `bool` | n/a | yes | +| [endpoint\_versions](#input\_endpoint\_versions) | Version of the API to use (The compute service is the only API currently supported) |
object({
compute = string
})
| n/a | yes | +| [gcloud\_path\_override](#input\_gcloud\_path\_override) | Directory of the gcloud executable to be used during cleanup | `string` | n/a | yes | +| [nodeset](#input\_nodeset) | Nodeset to cleanup |
object({
nodeset_name = string
zone = string
})
| n/a | yes | +| [project\_id](#input\_project\_id) | Project ID | `string` | n/a | yes | +| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Name of the Slurm cluster | `string` | n/a | yes | +| [universe\_domain](#input\_universe\_domain) | Domain address for alternate API universe | `string` | n/a | yes | + +## Outputs + +No outputs. + diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/main.tf new file mode 100644 index 0000000000..ec86a03a24 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/main.tf @@ -0,0 +1,32 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +resource "null_resource" "script" { + count = var.enable_cleanup_compute ? 1 : 0 + + triggers = { + project_id = var.project_id + cluster_name = var.slurm_cluster_name + nodeset_name = var.nodeset.nodeset_name + zone = var.nodeset.zone + universe_domain = var.universe_domain + compute_endpoint_version = var.endpoint_versions.compute + gcloud_path_override = var.gcloud_path_override + } + + provisioner "local-exec" { + command = "/bin/bash ${path.module}/scripts/cleanup_tpu.sh ${self.triggers.project_id} ${self.triggers.cluster_name} ${self.triggers.nodeset_name} ${self.triggers.zone} ${self.triggers.universe_domain} ${self.triggers.compute_endpoint_version} ${self.triggers.gcloud_path_override}" + when = destroy + } +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/scripts/cleanup_tpu.sh b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/scripts/cleanup_tpu.sh new file mode 100755 index 0000000000..c724e342c3 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/scripts/cleanup_tpu.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e -o pipefail + +project="$1" +cluster_name="$2" +nodeset_name="$3" +zone="$4" +universe_domain="$5" +compute_endpoint_version="$6" +gcloud_dir="$7" + +if [[ $# -ne 6 ]] && [[ $# -ne 7 ]]; then + echo "Usage: $0 []" + exit 1 +fi + +if [[ -n "${gcloud_dir}" ]]; then + export PATH="$gcloud_dir:$PATH" +fi + +export CLOUDSDK_API_ENDPOINT_OVERRIDES_COMPUTE="https://www.${universe_domain}/compute/${compute_endpoint_version}/" +export CLOUDSDK_CORE_PROJECT="${project}" + +if ! 
type -P gcloud 1>/dev/null; then + echo "gcloud is not available and your compute resources are not being cleaned up" + echo "https://console.cloud.google.com/compute/instances?project=${project}" + exit 1 +fi + +echo "Deleting TPU nodes" +node_filter="name~${cluster_name}-${nodeset_name}" +running_nodes_filter="${node_filter} AND state!=DELETING" + +# List all currently running nodes and attempt to delete them +gcloud compute tpus tpu-vm list --zone="${zone}" --format="value(name)" --filter="${running_nodes_filter}" | while read -r name; do + echo "Deleting TPU node: $name" + gcloud compute tpus tpu-vm delete --async --zone="${zone}" --quiet "${name}" || echo "Failed to delete $name" +done + +# Wait until nodes in DELETING state are deleted, before deleting the resource policies +deleting_nodes_filter="${node_filter} AND state=DELETING" +while true; do + node=$(gcloud compute tpus tpu-vm list --zone="${zone}" --format="value(name)" --filter="${deleting_nodes_filter}" --limit=1) + if [[ -z "${node}" ]]; then + break + fi + echo "Waiting for nodes to be deleted: ${node}" + sleep 5 +done diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/variables.tf new file mode 100644 index 0000000000..1ac6f64b75 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/variables.tf @@ -0,0 +1,60 @@ +/** + * Copyright (C) Google LLC. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +variable "project_id" { + type = string + description = "Project ID" +} + +variable "slurm_cluster_name" { + type = string + description = "Name of the Slurm cluster" +} + +variable "enable_cleanup_compute" { + description = < Date: Thu, 19 Sep 2024 20:14:43 +0000 Subject: [PATCH 024/102] Move to SlurmGCP image 6.7 --- community/examples/AMD/hpc-amd-slurm.yaml | 2 +- community/examples/hpc-slurm-ubuntu2004.yaml | 2 +- community/examples/hpc-slurm6-apptainer.yaml | 2 +- .../schedmd-slurm-gcp-v6-nodeset-dynamic/README.md | 4 ++-- .../schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf | 2 +- .../source_image_logic.tf | 8 ++++---- .../variables.tf | 2 +- .../schedmd-slurm-gcp-v6-nodeset-tpu/README.md | 2 +- .../schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset/README.md | 2 +- .../source_image_logic.tf | 8 ++++---- .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 14 +++++++------- .../schedmd-slurm-gcp-v6-controller/controller.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/partition.tf | 4 ++-- .../source_image_logic.tf | 8 ++++---- .../variables_controller_instance.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v6-login/README.md | 2 +- .../source_image_logic.tf | 8 ++++---- .../schedmd-slurm-gcp-v6-login/variables.tf | 2 +- examples/cae/cae-slurm.yaml | 2 +- examples/hpc-enterprise-slurm.yaml | 2 +- examples/hpc-slurm-static.yaml | 2 +- examples/image-builder.yaml | 2 +- .../a3-highgpu-8g/ml-slurm-a3-1-image.yaml | 2 +- .../a3-megagpu-8g/slurm-a3mega-image.yaml | 2 +- examples/ml-slurm.yaml | 2 +- .../daily-tests/blueprints/lustre-slurm.yaml | 4 ++-- .../daily-tests/tests/slurm-v6-debian.yml | 2 +- .../golden_copies/configs/versioned_blueprint.yaml | 2 +- .../.ghpc/artifacts/expanded_blueprint.yaml | 2 +- .../versioned_blueprint/primary/terraform.tfvars | 2 +- .../validate_configs/test_configs/node-groups.yaml | 6 +++--- 34 files changed, 59 insertions(+), 59 deletions(-) diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml index 5decf96a2d..282d5b7816 100644 --- a/community/examples/AMD/hpc-amd-slurm.yaml +++ b/community/examples/AMD/hpc-amd-slurm.yaml @@ -168,7 +168,7 @@ deployment_groups: # these images must match the images used by Slurm modules below because # we are building OpenMPI with PMI support in libraries contained in # Slurm installation - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public - id: low_cost_nodeset diff --git a/community/examples/hpc-slurm-ubuntu2004.yaml b/community/examples/hpc-slurm-ubuntu2004.yaml index ed3a587fb9..7e89520c05 100644 --- a/community/examples/hpc-slurm-ubuntu2004.yaml +++ b/community/examples/hpc-slurm-ubuntu2004.yaml @@ -24,7 +24,7 @@ vars: slurm_image: # Please refer to the following link for the latest images: # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - family: slurm-gcp-6-6-ubuntu-2004-lts + family: slurm-gcp-6-7-ubuntu-2004-lts project: schedmd-slurm-public instance_image_custom: true diff --git a/community/examples/hpc-slurm6-apptainer.yaml b/community/examples/hpc-slurm6-apptainer.yaml index 6848b1b4f0..47e9c267aa 100644 --- a/community/examples/hpc-slurm6-apptainer.yaml +++ b/community/examples/hpc-slurm6-apptainer.yaml @@ -60,7 +60,7 @@ deployment_groups: settings: source_image_project_id: [schedmd-slurm-public] # see 
latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-6-6-hpc-rocky-linux-8 + source_image_family: slurm-gcp-6-7-hpc-rocky-linux-8 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index 86fcc2e9e7..72f4fccb9f 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -74,7 +74,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Source | Version | |------|--------|---------| -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.7.0 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.0 | ## Resources @@ -104,7 +104,7 @@ modules. For support with the underlying modules, see the instructions in the | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [feature](#input\_feature) | The node feature, used to bind nodes to the nodeset. If not set, the nodeset name will be used. | `string` | `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-7-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index 2314471ac9..7e547c3d5f 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -56,7 +56,7 @@ locals { } module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.7.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.0" project_id = var.project_id region = var.region diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf index 57e909b9a5..a86c28ffc2 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf @@ -18,10 +18,10 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-6-debian-11", - "slurm-gcp-6-6-hpc-rocky-linux-8", - "slurm-gcp-6-6-ubuntu-2004-lts", - "slurm-gcp-6-6-ubuntu-2204-lts-arm64" + "slurm-gcp-6-7-debian-11", + "slurm-gcp-6-7-hpc-rocky-linux-8", + "slurm-gcp-6-7-ubuntu-2004-lts", + "slurm-gcp-6-7-ubuntu-2204-lts-arm64" ] } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf index 3c8b0743dd..5d5f71c9c0 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf @@ -68,7 +68,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-6-hpc-rocky-linux-8" + family = "slurm-gcp-6-7-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md index 14d945c9a5..fac8a63d44 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md @@ -59,7 +59,7 @@ No resources. | [accelerator\_config](#input\_accelerator\_config) | Nodeset accelerator config, see https://cloud.google.com/tpu/docs/supported-tpu-configurations for details. |
object({
topology = string
version = string
})
|
{
"topology": "",
"version": ""
}
| no | | [data\_disks](#input\_data\_disks) | The data disks to include in the TPU node | `list(string)` | `[]` | no | | [disable\_public\_ips](#input\_disable\_public\_ips) | DEPRECATED: Use `enable_public_ips` instead. | `bool` | `null` | no | -| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-6-tf- | `string` | `null` | no | +| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-7-tf- | `string` | `null` | no | | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | | [name](#input\_name) | Name of the nodeset. Automatically populated by the module id if not set.
If setting manually, ensure a unique value across all nodesets. | `string` | n/a | yes | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf index 3761707b3e..30e8d5c177 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf @@ -103,7 +103,7 @@ variable "data_disks" { } variable "docker_image" { - description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-6-tf-" + description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-7-tf-" type = string default = null } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 5eb8ba6665..117e0ca0e5 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -178,7 +178,7 @@ No modules. | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-7-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_properties](#input\_instance\_properties) | Override the instance properties. Used to test features not supported by Slurm GCP,
recommended for advanced usage only.
See https://cloud.google.com/compute/docs/reference/rest/v1/regionInstances/bulkInsert
If any sub-field (e.g. scheduling) is set, it will override the values computed by
SlurmGCP and ignoring values of provided vars. | `any` | `null` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for compute nodes. | `string` | `null` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf index 57e909b9a5..a86c28ffc2 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf @@ -18,10 +18,10 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-6-debian-11", - "slurm-gcp-6-6-hpc-rocky-linux-8", - "slurm-gcp-6-6-ubuntu-2004-lts", - "slurm-gcp-6-6-ubuntu-2204-lts-arm64" + "slurm-gcp-6-7-debian-11", + "slurm-gcp-6-7-hpc-rocky-linux-8", + "slurm-gcp-6-7-ubuntu-2004-lts", + "slurm-gcp-6-7-ubuntu-2204-lts-arm64" ] } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 9609725952..aeb2435bd0 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -88,7 +88,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-6-hpc-rocky-linux-8" + family = "slurm-gcp-6-7-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 40d24732a6..30f002d68f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -238,13 +238,13 @@ limitations under the License. 
| [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | | [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.7.0 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.7.0 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.0 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.0 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.7.0 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.7.0 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.7.0 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.7.0 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.0 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.0 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.0 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.8.0 | ## Resources @@ -301,7 +301,7 @@ limitations under the License. | [extra\_logging\_flags](#input\_extra\_logging\_flags) | The only available flag is `trace_api` | `map(bool)` | `{}` | no | | [gcloud\_path\_override](#input\_gcloud\_path\_override) | Directory of the gcloud executable to be used during cleanup | `string` | `""` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-7-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for controller. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 49bc366f21..0148323597 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -43,7 +43,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.7.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.0" project_id = var.project_id region = var.region @@ -99,7 +99,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.7.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.0" access_config = var.enable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index ab1123ad19..d9cb38ff07 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.7.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.0" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -56,7 +56,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.7.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.0" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 753fe6512c..9be62f82f7 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local template module module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.7.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.0" for_each = local.nodeset_map project_id = var.project_id @@ -101,7 +101,7 @@ locals { # NODESET TPU module "slurm_nodeset_tpu" { - source = 
"github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.7.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.8.0" for_each = local.nodeset_tpu_map project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf index 57e909b9a5..a86c28ffc2 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf @@ -18,10 +18,10 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-6-debian-11", - "slurm-gcp-6-6-hpc-rocky-linux-8", - "slurm-gcp-6-6-ubuntu-2004-lts", - "slurm-gcp-6-6-ubuntu-2204-lts-arm64" + "slurm-gcp-6-7-debian-11", + "slurm-gcp-6-7-hpc-rocky-linux-8", + "slurm-gcp-6-7-ubuntu-2004-lts", + "slurm-gcp-6-7-ubuntu-2204-lts-arm64" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf index 69eea81844..0df835e322 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf @@ -267,7 +267,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-6-hpc-rocky-linux-8" + family = "slurm-gcp-6-7-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index ee6fd367c6..0afd0bfee7 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -100,7 +100,7 @@ No modules. | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-7-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for login nodes. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf index 57e909b9a5..a86c28ffc2 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf @@ -18,10 +18,10 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-6-debian-11", - "slurm-gcp-6-6-hpc-rocky-linux-8", - "slurm-gcp-6-6-ubuntu-2004-lts", - "slurm-gcp-6-6-ubuntu-2204-lts-arm64" + "slurm-gcp-6-7-debian-11", + "slurm-gcp-6-7-hpc-rocky-linux-8", + "slurm-gcp-6-7-ubuntu-2004-lts", + "slurm-gcp-6-7-ubuntu-2204-lts-arm64" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf index f7d4cacd85..2b53c8f9e5 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf @@ -325,7 +325,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-6-hpc-rocky-linux-8" + family = "slurm-gcp-6-7-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/examples/cae/cae-slurm.yaml b/examples/cae/cae-slurm.yaml index a3e9820ab9..34096a7080 100644 --- a/examples/cae/cae-slurm.yaml +++ b/examples/cae/cae-slurm.yaml @@ -40,7 +40,7 @@ vars: # for a list of valid family options with Slurm; note: the image types for the compute nodes # and the Chrome Remote Desktop (CRD) need to have the same Slurm base. 
instance_image: - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public # Documentation for each of the modules used below can be found at diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index 3ef0ba990f..d7520d3b85 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -25,7 +25,7 @@ vars: slurm_image: # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family # for a list of valid family options with Slurm - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public # If image above is changed to use custom image, then setting below must be set to true instance_image_custom: false diff --git a/examples/hpc-slurm-static.yaml b/examples/hpc-slurm-static.yaml index fff15e07dc..07ed2a4690 100644 --- a/examples/hpc-slurm-static.yaml +++ b/examples/hpc-slurm-static.yaml @@ -29,7 +29,7 @@ vars: static_node_count: 2 ## Must be <= number of reserved machines ## slurm_instance_image: - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public instance_image_custom: false # true if using custom image in lines above bandwidth_tier: gvnic_enabled diff --git a/examples/image-builder.yaml b/examples/image-builder.yaml index 63f5d89fbd..715948b0dd 100644 --- a/examples/image-builder.yaml +++ b/examples/image-builder.yaml @@ -59,7 +59,7 @@ deployment_groups: settings: source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-6-6-hpc-rocky-linux-8 + source_image_family: slurm-gcp-6-7-hpc-rocky-linux-8 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size) diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml index 6540c18954..c50454739e 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -94,7 +94,7 @@ deployment_groups: set -e -o pipefail ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.7.0 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.0 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index b3c44273fa..6ba58f0308 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -108,7 +108,7 @@ deployment_groups: apt-get install -y git ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.7.0 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.0 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/examples/ml-slurm.yaml b/examples/ml-slurm.yaml index 81a78b59a1..4baaaf07ce 100644 --- a/examples/ml-slurm.yaml +++ 
b/examples/ml-slurm.yaml @@ -139,7 +139,7 @@ deployment_groups: omit_external_ip: false source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-6-6-debian-11 + source_image_family: slurm-gcp-6-7-debian-11 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size_gb) diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml index 2ee69cf821..44900430a7 100644 --- a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml +++ b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml @@ -27,7 +27,7 @@ vars: # on_host_maintenance: MIGRATE num_nodes: 1 rocky_image: - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public deployment_groups: @@ -79,7 +79,7 @@ deployment_groups: # settings: # node_count_dynamic_max: $(vars.num_nodes) # instance_image: - # family: slurm-gcp-6-6-ubuntu-2004-lts + # family: slurm-gcp-6-7-ubuntu-2004-lts # project: schedmd-slurm-public # - id: ubuntu_partition diff --git a/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml b/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml index 77bbea5edc..8d5e724b0b 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml @@ -22,7 +22,7 @@ slurm_cluster_name: "debiv6{{ build[0:4] }}" cli_deployment_vars: network_name: "{{ network }}" - slurm_image: "{family: slurm-gcp-6-6-debian-11, project: schedmd-slurm-public}" + slurm_image: "{family: slurm-gcp-6-7-debian-11, project: schedmd-slurm-public}" region: us-west4 zone: us-west4-c diff --git a/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml b/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml index db6c920704..6344dd8d76 100644 --- a/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml +++ b/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml @@ -27,7 +27,7 @@ vars: slurm_image: # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family # for a list of valid family options with Slurm - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public # If image above is changed to use custom image, then setting below must be set to true instance_image_custom: false diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml index cff3ce442f..ba7ec541b3 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml @@ -39,7 +39,7 @@ vars: project_id: invalid-project region: us-central1 slurm_image: - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public zone: us-central1-a deployment_groups: diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/terraform.tfvars 
b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/terraform.tfvars index 0e31c36a07..39fad882b4 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/terraform.tfvars +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/terraform.tfvars @@ -30,7 +30,7 @@ project_id = "invalid-project" region = "us-central1" slurm_image = { - family = "slurm-gcp-6-6-hpc-rocky-linux-8" + family = "slurm-gcp-6-7-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/tools/validate_configs/test_configs/node-groups.yaml b/tools/validate_configs/test_configs/node-groups.yaml index ca6a7b9c89..cfb166cbb5 100644 --- a/tools/validate_configs/test_configs/node-groups.yaml +++ b/tools/validate_configs/test_configs/node-groups.yaml @@ -64,7 +64,7 @@ deployment_groups: name: c30 machine_type: c2-standard-30 instance_image: - family: slurm-gcp-6-6-debian-11 + family: slurm-gcp-6-7-debian-11 project: schedmd-slurm-public instance_image_custom: true @@ -75,7 +75,7 @@ deployment_groups: name: c60 machine_type: c2-standard-60 instance_image: - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public - id: nodeset_3 @@ -85,7 +85,7 @@ deployment_groups: name: cd112 machine_type: c2d-standard-112 instance_image: - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public instance_image_custom: true enable_smt: true From f06758ab02732d022bfe57dea81b930dd88b38e4 Mon Sep 17 00:00:00 2001 From: ChengcongDu Date: Tue, 1 Oct 2024 16:25:54 +0000 Subject: [PATCH 025/102] remove expected performanc note --- modules/compute/gke-node-pool/outputs.tf | 3 --- 1 file changed, 3 deletions(-) diff --git a/modules/compute/gke-node-pool/outputs.tf b/modules/compute/gke-node-pool/outputs.tf index 58216e957f..7bcd0c6361 100644 --- a/modules/compute/gke-node-pool/outputs.tf +++ b/modules/compute/gke-node-pool/outputs.tf @@ -85,9 +85,6 @@ locals { export POD_NAME=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}' | head -n 1) export PEER_POD_IPS=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.status.podIP}}{{" "}}{{end}}') kubectl exec --stdin --tty --container=nccl-test $POD_NAME -- /scripts/allgather.sh $PEER_POD_IPS - Depends on the Msg size used for transmission in the test, the busbw would different a bit. 
- For a3-highgpu machines, the expected busbw for MsgSize of 8G data should be around 80 GB/s - For a3-megagpu machines, the expected busbw for MsgSize of 8G data should be around 160 GB/s If you would like to enable GPUDirect for your own workload, please follow the below steps: export WORKLOAD_PATH=<> From eaf1af7a5d4712e2ba9e712eb97729de39c65b38 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 1 Oct 2024 22:01:33 -0500 Subject: [PATCH 026/102] Modify triggers that run pre-commit validation - do not run when user labels the PR (does not change code) - do not run when PR title/description or base branch is edited - run when PR is re-opened after having been closed --- .github/workflows/pr-precommit.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/pr-precommit.yml b/.github/workflows/pr-precommit.yml index 37234d2a0e..5b1b5091cf 100644 --- a/.github/workflows/pr-precommit.yml +++ b/.github/workflows/pr-precommit.yml @@ -19,9 +19,8 @@ name: 'Use pre-commit to validate Pull Request' on: pull_request: types: - - edited - opened - - labeled + - reopened - synchronize branches: - main From 35cdab35ca4f690172b7f5dbd9377a19bcb26c68 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 1 Oct 2024 22:06:40 -0500 Subject: [PATCH 027/102] Require labels on pull requests directly to main and release-candidate branches --- .github/workflows/pr-label-validation.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/pr-label-validation.yml b/.github/workflows/pr-label-validation.yml index 9fe508fadf..df54a6e150 100644 --- a/.github/workflows/pr-label-validation.yml +++ b/.github/workflows/pr-label-validation.yml @@ -28,7 +28,9 @@ on: - ready_for_review - unlocked branches: + - main - develop + - release-candidate jobs: pr-label-validation: From 69521206fce98b306b64e93adc2853c36add8ec1 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 2 Oct 2024 09:33:56 -0500 Subject: [PATCH 028/102] Chrome Remote Desktop: update apt cache only if stale Update the behavior of Ansible to update the apt cache only if it is stale (more than 1 hour old). In practice, the apt cache is unlikely to be stale because Ansible was just installed by pip, which requires several packages to be installed. --- .../scripts/configure-chrome-desktop.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-chrome-desktop.yml b/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-chrome-desktop.yml index 41928f9294..2daea9cd28 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-chrome-desktop.yml +++ b/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-chrome-desktop.yml @@ -16,6 +16,10 @@ - name: Ensure Desktop OS and Chrome Remote Desktop is installed hosts: localhost become: true + module_defaults: + ansible.builtin.apt: + update_cache: true + cache_valid_time: 3600 tasks: - name: Install desktop packages ansible.builtin.apt: @@ -23,7 +27,6 @@ - xfce4 - xfce4-goodies state: present - update_cache: true register: apt_result retries: 6 delay: 10 From 4bb630fef34adb38444b1c8386067827ce0dc01b Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 2 Oct 2024 09:35:07 -0500 Subject: [PATCH 029/102] Chrome Remote Desktop: increase retry time for apt We have observed failures of this module when unattended-upgrades is running simultaneously to the installation of xfce4. 
This increases the retry duration from 1 minute to 5 minutes and the number of retries from 6 to 10 (a total of 11 attempts). When we adopt ansible-core 2.12 or later, we should use the lock_timeout feature more directly: https://docs.ansible.com/ansible/latest/collections/ansible/builtin/apt_module.html#parameter-lock_timeout --- .../scripts/configure-chrome-desktop.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-chrome-desktop.yml b/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-chrome-desktop.yml index 2daea9cd28..391aa86433 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-chrome-desktop.yml +++ b/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-chrome-desktop.yml @@ -28,8 +28,8 @@ - xfce4-goodies state: present register: apt_result - retries: 6 - delay: 10 + retries: 10 + delay: 30 until: apt_result is success - name: Download and configure CRD @@ -45,8 +45,8 @@ environment: DEBIAN_FRONTEND: noninteractive register: apt_result - retries: 6 - delay: 10 + retries: 10 + delay: 30 until: apt_result is success - name: Configure CRD to use Xfce by default From 7780f46e729c237e197947b5c6324257734bee20 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Wed, 2 Oct 2024 19:53:33 +0000 Subject: [PATCH 030/102] wordings updated --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/gpu_direct.tf | 18 +++++++++--------- modules/compute/gke-node-pool/variables.tf | 4 ++-- modules/scheduler/gke-cluster/README.md | 2 +- modules/scheduler/gke-cluster/outputs.tf | 4 ++-- .../pre-existing-gke-cluster/README.md | 2 +- .../pre-existing-gke-cluster/outputs.tf | 4 ++-- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 7b1cffbf68..03652cf29e 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -294,7 +294,7 @@ limitations under the License. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | -| [gke\_master\_version](#input\_gke\_master\_version) | GKE master version | `string` | n/a | yes | +| [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index 4fef57e914..27c61f0256 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -33,7 +33,7 @@ locals { updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml") rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9 min_additional_networks = 4 - min_gke_versions = { + major_minor_version_acceptable_map = { "1.27" = "1.27.7-gke.1121000" "1.28" = "1.28.8-gke.1095000" "1.29" = "1.29.3-gke.1093000" @@ -49,7 +49,7 @@ locals { updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 min_additional_networks = 8 - min_gke_versions = { + major_minor_version_acceptable_map = { "1.28" = "1.28.9-gke.1250000" "1.29" = "1.29.4-gke.1542000" "1.30" = "1.30.4-gke.1129000" @@ -61,13 +61,13 @@ locals { gke_version_regex = "(\\d+\\.\\d+)\\.(\\d+)-gke\\.(\\d+)" # GKE version format: 1.X.Y-gke.Z , regex output: ["1.X" , "Y", "Z"] - gke_version_parts = regex(local.gke_version_regex, var.gke_master_version) + gke_version_parts = regex(local.gke_version_regex, var.gke_version) gke_version_major = local.gke_version_parts[0] - min_gke_versions = try(local.gpu_direct_setting[var.machine_type].min_gke_versions, null) - min_version = try(contains(keys(local.min_gke_versions), local.gke_version_major), false) ? local.min_gke_versions[local.gke_version_major] : "1.0.0-gke.0" - min_version_parts = regex(local.gke_version_regex, local.min_version) - gke_gpudirect_compatible = local.gke_version_parts[1] > local.min_version_parts[1] || (local.gke_version_parts[1] == local.min_version_parts[1] && local.gke_version_parts[2] >= local.min_version_parts[2]) + major_minor_version_acceptable_map = try(local.gpu_direct_setting[var.machine_type].major_minor_version_acceptable_map, null) + minor_version_acceptable = try(contains(keys(local.major_minor_version_acceptable_map), local.gke_version_major), false) ? local.major_minor_version_acceptable_map[local.gke_version_major] : "1.0.0-gke.0" + minor_version_acceptable_parts = regex(local.gke_version_regex, local.minor_version_acceptable) + gke_gpudirect_compatible = local.gke_version_parts[1] > local.minor_version_acceptable_parts[1] || (local.gke_version_parts[1] == local.minor_version_acceptable_parts[1] && local.gke_version_parts[2] >= local.minor_version_acceptable_parts[2]) } check "gpu_direct_check_multi_vpc" { @@ -77,9 +77,9 @@ check "gpu_direct_check_multi_vpc" { } } -check "gke_master_version_requirements" { +check "gke_version_requirements" { assert { condition = local.gke_gpudirect_compatible - error_message = "GPUDirect is not supported on GKE master version ${var.gke_master_version} for ${var.machine_type} machine. For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements" + error_message = "GPUDirect is not supported on GKE master version ${var.gke_version} for ${var.machine_type} machine. 
For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements" } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 62160a2448..b24aef91df 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -361,7 +361,7 @@ variable "initial_node_count" { default = null } -variable "gke_master_version" { - description = "GKE master version" +variable "gke_version" { + description = "GKE version" type = string } diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 4548db2fc9..583af203da 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -194,7 +194,7 @@ limitations under the License. |------|-------------| | [cluster\_id](#output\_cluster\_id) | An identifier for the resource with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster has been created. Needed by community/modules/scripts/kubernetes-operations. | -| [gke\_master\_version](#output\_gke\_master\_version) | GKE cluster's master version. | +| [gke\_version](#output\_gke\_version) | GKE cluster's version. | | [instructions](#output\_instructions) | Instructions on how to connect to the created cluster. | | [k8s\_service\_account\_name](#output\_k8s\_service\_account\_name) | Name of k8s service account. | diff --git a/modules/scheduler/gke-cluster/outputs.tf b/modules/scheduler/gke-cluster/outputs.tf index 4daed8ee25..28e00171ff 100644 --- a/modules/scheduler/gke-cluster/outputs.tf +++ b/modules/scheduler/gke-cluster/outputs.tf @@ -75,7 +75,7 @@ output "k8s_service_account_name" { value = one(module.workload_identity[*].k8s_service_account_name) } -output "gke_master_version" { - description = "GKE cluster's master version." +output "gke_version" { + description = "GKE cluster's version." value = google_container_cluster.gke_cluster.master_version } diff --git a/modules/scheduler/pre-existing-gke-cluster/README.md b/modules/scheduler/pre-existing-gke-cluster/README.md index 1f2904d889..4caf7ff258 100644 --- a/modules/scheduler/pre-existing-gke-cluster/README.md +++ b/modules/scheduler/pre-existing-gke-cluster/README.md @@ -111,5 +111,5 @@ limitations under the License. |------|-------------| | [cluster\_id](#output\_cluster\_id) | An identifier for the gke cluster with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster exists. | -| [gke\_master\_version](#output\_gke\_master\_version) | GKE cluster's master version. | +| [gke\_version](#output\_gke\_version) | GKE cluster's version. | diff --git a/modules/scheduler/pre-existing-gke-cluster/outputs.tf b/modules/scheduler/pre-existing-gke-cluster/outputs.tf index 90772d3dae..8884ee30b0 100644 --- a/modules/scheduler/pre-existing-gke-cluster/outputs.tf +++ b/modules/scheduler/pre-existing-gke-cluster/outputs.tf @@ -27,7 +27,7 @@ output "gke_cluster_exists" { ] } -output "gke_master_version" { - description = "GKE cluster's master version." +output "gke_version" { + description = "GKE cluster's version." 
value = data.google_container_cluster.existing_gke_cluster.master_version } From 14864db2a7e9c8587cd8caeb5d288d0ba0266a34 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Wed, 2 Oct 2024 21:45:48 +0000 Subject: [PATCH 031/102] minor wording update --- modules/compute/gke-node-pool/gpu_direct.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index 27c61f0256..00dd298971 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -80,6 +80,6 @@ check "gpu_direct_check_multi_vpc" { check "gke_version_requirements" { assert { condition = local.gke_gpudirect_compatible - error_message = "GPUDirect is not supported on GKE master version ${var.gke_version} for ${var.machine_type} machine. For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements" + error_message = "GPUDirect is not supported on GKE version ${var.gke_version} for ${var.machine_type} machine. For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements" } } From 1e9caf1644b0a7bea4e2f44fe1d0f9d0c4ee44af Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 3 Oct 2024 16:26:09 -0500 Subject: [PATCH 032/102] Adopt Google terraform provider plugin 5.44.x --- pkg/config/expand.go | 4 ++-- pkg/config/expand_test.go | 4 ++-- .../igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../golden_copies/expectations/igc_pkr/zero/versions.tf | 4 ++-- .../igc_tf/.ghpc/artifacts/expanded_blueprint.yaml | 8 ++++---- .../golden_copies/expectations/igc_tf/one/versions.tf | 4 ++-- .../golden_copies/expectations/igc_tf/zero/versions.tf | 4 ++-- .../merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/merge_flatten/zero/versions.tf | 4 ++-- .../.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/versioned_blueprint/primary/versions.tf | 4 ++-- 11 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 795343a3b1..3a8898306d 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -199,11 +199,11 @@ func getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { return map[string]TerraformProvider{ "google": { Source: "hashicorp/google", - Version: ">= 4.84.0, < 5.39.0", + Version: ">= 4.84.0, < 5.45.0", Configuration: gglConf}, "google-beta": { Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 5.39.0", + Version: ">= 4.84.0, < 5.45.0", Configuration: gglConf}} } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 6c347aceb9..40fc192175 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -93,10 +93,10 @@ func (s *zeroSuite) TestExpandProviders(c *C) { c.Check(g.TerraformProviders, DeepEquals, map[string]PR{ "google": TerraformProvider{ Source: "hashicorp/google", - Version: ">= 4.84.0, < 5.39.0"}, + Version: ">= 4.84.0, < 5.45.0"}, "google-beta": TerraformProvider{ Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 5.39.0"}}) + Version: ">= 4.84.0, < 5.45.0"}}) } { // no def PR, group PR diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml index efa8f25bfb..32d7d818a8 100644 --- 
a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml @@ -38,14 +38,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf index 792917c317..6630b9b8c6 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml index b25ddd135b..8a160967a2 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml @@ -44,14 +44,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) @@ -79,14 +79,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index 792917c317..6630b9b8c6 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index 792917c317..6630b9b8c6 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ 
b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index 71103dd046..9c97a650eb 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -39,14 +39,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf index 792917c317..6630b9b8c6 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml index ba7ec541b3..4e74f8d305 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml @@ -47,14 +47,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf index 792917c317..6630b9b8c6 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" 
} } } From 9cb2ebe84267ae75fd09a267f860b81085b1842a Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 04:54:44 +0000 Subject: [PATCH 033/102] Chunk BigQuery sacct row inserts --- .../modules/slurm_files/scripts/load_bq.py | 18 ++++++--- .../slurm_files/scripts/tests/test_load_bq.py | 39 +++++++++++++++++++ 2 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_load_bq.py diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index 800202d2ea..9967069212 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -19,23 +19,23 @@ import shelve import uuid from collections import namedtuple -from datetime import datetime, timezone, timedelta +from datetime import datetime, timedelta, timezone from pathlib import Path from pprint import pprint -from google.cloud.bigquery import SchemaField +from google.api_core import exceptions, retry from google.cloud import bigquery as bq -from google.api_core import retry, exceptions +from google.cloud.bigquery import SchemaField import util from util import lookup, run - SACCT = "sacct" script = Path(__file__).resolve() DEFAULT_TIMESTAMP_FILE = script.parent / "bq_timestamp" timestamp_file = Path(os.environ.get("TIMESTAMP_FILE", DEFAULT_TIMESTAMP_FILE)) +BQ_MAX_ROW_LOAD_SIZE = 10000 # cluster_id_file = script.parent / 'cluster_uuid' # try: @@ -321,8 +321,16 @@ def main(): # on failure, an exception will cause the timestamp not to be rewritten. So # it will try again next time. If some writes succeed, we don't currently # have a way to not submit duplicates next time. + print(f"loading BigQuery data in batches of size : {BQ_MAX_ROW_LOAD_SIZE}") + num_batches = (len(jobs) // BQ_MAX_ROW_LOAD_SIZE) + 1 + print(f"Number of batches: {num_batches}") if jobs: - bq_submit(jobs) + start_job_idx = 0 + end_job_idx = BQ_MAX_ROW_LOAD_SIZE + for _ in range(num_batches): + bq_submit(jobs[start_job_idx:end_job_idx]) + start_job_idx = end_job_idx + end_job_idx += BQ_MAX_ROW_LOAD_SIZE write_timestamp(end) update_job_idx_cache(jobs, end) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_load_bq.py new file mode 100644 index 0000000000..ebe45008a0 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_load_bq.py @@ -0,0 +1,39 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + + +# test the chunking logic if not the BigQuery loads themselves +@pytest.mark.parametrize("num_jobs_to_load", (0, 11, 10001, 51131, 104321)) +def test_chunked_bq_load(num_jobs_to_load: int): + BQ_MAX_ROW_LOAD_SIZE = 10000 + jobs = [i + 1 for i in range(num_jobs_to_load)] + num_batches = (len(jobs) // BQ_MAX_ROW_LOAD_SIZE) + 1 + print(num_batches) + load_cache = [] + if jobs: + start_job_idx = 0 + end_job_idx = BQ_MAX_ROW_LOAD_SIZE + for _ in range(num_batches): + load_cache.append(jobs[start_job_idx:end_job_idx]) + start_job_idx = end_job_idx + end_job_idx += BQ_MAX_ROW_LOAD_SIZE + if jobs: + assert ( + sum([sum(x) for x in load_cache]) + == num_jobs_to_load * (num_jobs_to_load + 1) // 2 + ) + else: + assert sum([sum(x) for x in load_cache]) == 0 From 902ffd0ae29247e877f442f2b19273bab5c3c54b Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 7 Oct 2024 05:34:01 +0000 Subject: [PATCH 034/102] SlurmGCP. Do not put job_submit script into config. * Make `job_submit` a part of "devel" zip; * Move from `etc` to `scripts`; * Remove `slurm_files` variable; * Apply auto-formating, no other changes to the `job_submit.lua.tpl`. --- .../modules/slurm_files/README.md | 1 - .../slurm_files/etc/job_submit.lua.tpl | 102 ----------------- .../modules/slurm_files/main.tf | 1 - .../modules/slurm_files/scripts/conf.py | 19 ++-- .../slurm_files/scripts/job_submit.lua.tpl | 103 ++++++++++++++++++ .../modules/slurm_files/variables.tf | 6 - 6 files changed, 113 insertions(+), 119 deletions(-) delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/job_submit.lua.tpl create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/job_submit.lua.tpl diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md index 8cf7f3ade5..3033d59f43 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -83,7 +83,6 @@ No modules. | [extra\_logging\_flags](#input\_extra\_logging\_flags) | The only available flag is `trace_api` | `map(bool)` | `{}` | no | | [google\_app\_cred\_path](#input\_google\_app\_cred\_path) | Path to Google Application Credentials. | `string` | `null` | no | | [install\_dir](#input\_install\_dir) | Directory where the hybrid configuration directory will be installed on the
on-premise controller (e.g. /etc/slurm/hybrid). This updates the prefix path
for the resume and suspend scripts in the generated `cloud.conf` file.

This variable should be used when the TerraformHost and the SlurmctldHost
are different.

This will default to var.output\_dir if null. | `string` | `null` | no | -| [job\_submit\_lua\_tpl](#input\_job\_submit\_lua\_tpl) | Slurm job\_submit.lua template file path. | `string` | `null` | no | | [login\_network\_storage](#input\_login\_network\_storage) | Storage to mounted on login and controller instances
- server\_ip : Address of the storage server.
- remote\_mount : The location in the remote instance filesystem to mount from.
- local\_mount : The location on the instance filesystem to mount to.
- fs\_type : Filesystem type (e.g. "nfs").
- mount\_options : Options to mount with. |
list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
| `[]` | no | | [login\_startup\_scripts](#input\_login\_startup\_scripts) | List of scripts to be ran on login VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/job_submit.lua.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/job_submit.lua.tpl deleted file mode 100644 index f3c9b0750e..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/job_submit.lua.tpl +++ /dev/null @@ -1,102 +0,0 @@ -SCRIPTS_DIR = "{scripts_dir}" -NO_VAL = 4294967294 ---get_tpu_vmcount.py exit code -PART_INVALID = -1 --partition does not exists in config.yaml, thus do not exist in slurm -DIFF_VMCOUNTS_SAME_PART = -2 --in the same partition there are nodesets with different vmcounts -DIFF_PART_DIFFERENT_VMCOUNTS = -3 --partition is a list of partitions in which at least two of them have different vmcount -UNKWOWN_ERROR = -4 --get_tpu_vmcount.py did not return a valid response - -function get_part(job_desc,part_list) - if job_desc.partition then - return job_desc.partition - end - for name,val in pairs(part_list) do - if val.flag_default == 1 then - return name - end - end - return nil -end - -function os.capture(cmd, raw) - local handle = assert(io.popen(cmd, 'r')) - local output = assert(handle:read('*a')) - handle:close() - return output -end - -function get_vmcount(part) - local cmd = SCRIPTS_DIR .. "/get_tpu_vmcount.py -p " .. part - local out = os.capture(cmd,true) - for line in out:gmatch("(.-)\r?\n") do - local tag, val = line:match("([^:]+):([^:]+)") - if tag == "VMCOUNT" then - return tonumber(val) - end - end - return UNKWOWN_ERROR -end - - -function slurm_job_submit(job_desc, part_list, submit_uid) - local part = get_part(job_desc,part_list) - local vmcount = get_vmcount(part) - --Only do something if the job is in a TPU partition, if vmcount is 0, it implies that the partition(s) specified are not TPU ones - if vmcount == 0 then - return slurm.SUCCESS - end - --This is a TPU job, but as the vmcount is 1 it can he handled the same way - if vmcount == 1 then - return slurm.SUCCESS - end - --Check for errors - if vmcount == PART_INVALID then - slurm.log_user("Invalid partition specified " .. part) - return slurm.FAILURE - end - if vmcount == DIFF_VMCOUNTS_SAME_PART then - slurm.log_user("In partition(s) " .. part .. " there are more than one tpu nodeset vmcount, this should not happen.") - return slurm.ERROR - end - if vmcount == DIFF_PART_DIFFERENT_VMCOUNTS then - slurm.log_user("In partition list " .. part .. 
" there are more than one TPU types, cannot determine which is the correct vmcount to use, please retry with only one partition.") - return slurm.FAILURE - end - if vmcount == UNKWOWN_ERROR then - slurm.log_user("Something went wrong while executing get_tpu_vmcount.py.") - return slurm.ERROR - end - --This is surely a TPU node - if vmcount > 1 then - local min_nodes = job_desc.min_nodes - local max_nodes = job_desc.max_nodes - --if not specified assume it is one, this should be improved taking into account the cpus, mem, and other factors - if min_nodes == NO_VAL then - min_nodes = 1 - max_nodes = 1 - end - --as max_nodes can be higher than the nodes in the partition, we are not able to calculate with certainty the nodes that this job will have if this value is set to something - --different than min_nodes - if min_nodes ~= max_nodes then - slurm.log_user("Max nodes cannot be set different than min nodes for the TPU partitions.") - return slurm.ERROR - end - --Set the number of switches to the number of nodes originally requested by the job, as the job requests "TPU groups" - job_desc.req_switch = min_nodes - - --Apply the node increase into the job description. - job_desc.min_nodes = min_nodes * vmcount - job_desc.max_nodes = max_nodes * vmcount - --if job_desc.features then - --slurm.log_user("Features: %s",job_desc.features) - --end - end - - return slurm.SUCCESS -end - -function slurm_job_modify(job_desc, job_rec, part_list, modify_uid) - return slurm.SUCCESS -end - -return slurm.SUCCESS diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index 0cf9981f5a..959d928176 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -84,7 +84,6 @@ locals { slurmdbd_conf_tpl = file(coalesce(var.slurmdbd_conf_tpl, "${local.etc_dir}/slurmdbd.conf.tpl")) slurm_conf_tpl = file(coalesce(var.slurm_conf_tpl, "${local.etc_dir}/slurm.conf.tpl")) cgroup_conf_tpl = file(coalesce(var.cgroup_conf_tpl, "${local.etc_dir}/cgroup.conf.tpl")) - jobsubmit_lua_tpl = file(coalesce(var.job_submit_lua_tpl, "${local.etc_dir}/job_submit.lua.tpl")) # Providers endpoint_versions = var.endpoint_versions diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index c3b31f20a2..29b4076056 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -360,19 +360,20 @@ def install_cgroup_conf(lkp: util.Lookup) -> None: def install_jobsubmit_lua(lkp: util.Lookup) -> None: """install job_submit.lua if there are tpu nodes in the cluster""" - if any( + if not any( tpu_nodeset is not None for part in lkp.cfg.partitions.values() for tpu_nodeset in part.partition_nodeset_tpu ): - conf_options = { - "scripts_dir": lkp.cfg.slurm_scripts_dir or dirs.scripts, - } - conf = lkp.cfg.jobsubmit_lua_tpl.format(**conf_options) - - conf_file = lkp.etc_dir / "job_submit.lua" - conf_file.write_text(conf) - util.chown_slurm(conf_file, 0o600) + return # No TPU partitions, no need for job_submit.lua + + scripts_dir = lkp.cfg.slurm_scripts_dir or dirs.scripts + tpl = (scripts_dir 
/ "job_submit.lua.tpl").read_text() + conf = tpl.format(scripts_dir=scripts_dir) + + conf_file = lkp.etc_dir / "job_submit.lua" + conf_file.write_text(conf) + util.chown_slurm(conf_file, 0o600) def gen_cloud_gres_conf(lkp: util.Lookup) -> None: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/job_submit.lua.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/job_submit.lua.tpl new file mode 100644 index 0000000000..810a0742b0 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/job_submit.lua.tpl @@ -0,0 +1,103 @@ +SCRIPTS_DIR = "{scripts_dir}" +NO_VAL = 4294967294 +-- get_tpu_vmcount.py exit code +PART_INVALID = -1 -- partition does not exists in config.yaml, thus do not exist in slurm +DIFF_VMCOUNTS_SAME_PART = -2 -- in the same partition there are nodesets with different vmcounts +DIFF_PART_DIFFERENT_VMCOUNTS = -3 -- partition is a list of partitions in which at least two of them have different vmcount +UNKWOWN_ERROR = -4 -- get_tpu_vmcount.py did not return a valid response + +function get_part(job_desc, part_list) + if job_desc.partition then + return job_desc.partition + end + for name, val in pairs(part_list) do + if val.flag_default == 1 then + return name + end + end + return nil +end + +function os.capture(cmd, raw) + local handle = assert(io.popen(cmd, 'r')) + local output = assert(handle:read('*a')) + handle:close() + return output +end + +function get_vmcount(part) + local cmd = SCRIPTS_DIR .. "/get_tpu_vmcount.py -p " .. part + local out = os.capture(cmd, true) + for line in out:gmatch("(.-)\r?\n") do + local tag, val = line:match("([^:]+):([^:]+)") + if tag == "VMCOUNT" then + return tonumber(val) + end + end + return UNKWOWN_ERROR +end + +function slurm_job_submit(job_desc, part_list, submit_uid) + local part = get_part(job_desc, part_list) + local vmcount = get_vmcount(part) + -- Only do something if the job is in a TPU partition, if vmcount is 0, it implies that the partition(s) specified are not TPU ones + if vmcount == 0 then + return slurm.SUCCESS + end + -- This is a TPU job, but as the vmcount is 1 it can he handled the same way + if vmcount == 1 then + return slurm.SUCCESS + end + -- Check for errors + if vmcount == PART_INVALID then + slurm.log_user("Invalid partition specified " .. part) + return slurm.FAILURE + end + if vmcount == DIFF_VMCOUNTS_SAME_PART then + slurm.log_user("In partition(s) " .. part .. + " there are more than one tpu nodeset vmcount, this should not happen.") + return slurm.ERROR + end + if vmcount == DIFF_PART_DIFFERENT_VMCOUNTS then + slurm.log_user("In partition list " .. part .. 
+ " there are more than one TPU types, cannot determine which is the correct vmcount to use, please retry with only one partition.") + return slurm.FAILURE + end + if vmcount == UNKWOWN_ERROR then + slurm.log_user("Something went wrong while executing get_tpu_vmcount.py.") + return slurm.ERROR + end + -- This is surely a TPU node + if vmcount > 1 then + local min_nodes = job_desc.min_nodes + local max_nodes = job_desc.max_nodes + -- if not specified assume it is one, this should be improved taking into account the cpus, mem, and other factors + if min_nodes == NO_VAL then + min_nodes = 1 + max_nodes = 1 + end + -- as max_nodes can be higher than the nodes in the partition, we are not able to calculate with certainty the nodes that this job will have if this value is set to something + -- different than min_nodes + if min_nodes ~= max_nodes then + slurm.log_user("Max nodes cannot be set different than min nodes for the TPU partitions.") + return slurm.ERROR + end + -- Set the number of switches to the number of nodes originally requested by the job, as the job requests "TPU groups" + job_desc.req_switch = min_nodes + + -- Apply the node increase into the job description. + job_desc.min_nodes = min_nodes * vmcount + job_desc.max_nodes = max_nodes * vmcount + -- if job_desc.features then + -- slurm.log_user("Features: %s",job_desc.features) + -- end + end + + return slurm.SUCCESS +end + +function slurm_job_modify(job_desc, job_rec, part_list, modify_uid) + return slurm.SUCCESS +end + +return slurm.SUCCESS diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf index 2c01b6b579..91026fc267 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf @@ -94,12 +94,6 @@ variable "cgroup_conf_tpl" { default = null } -variable "job_submit_lua_tpl" { - type = string - description = "Slurm job_submit.lua template file path." - default = null -} - variable "cloudsql_secret" { description = "Secret URI to cloudsql secret." type = string From 9f39ad96161490ecc12c271fc33d795948101f44 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 7 Oct 2024 11:59:32 -0500 Subject: [PATCH 035/102] Do not trigger label validation on draft pull requests --- .github/workflows/pr-label-validation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-label-validation.yml b/.github/workflows/pr-label-validation.yml index df54a6e150..a4c23aa998 100644 --- a/.github/workflows/pr-label-validation.yml +++ b/.github/workflows/pr-label-validation.yml @@ -34,7 +34,7 @@ on: jobs: pr-label-validation: - if: github.repository == 'GoogleCloudPlatform/cluster-toolkit' + if: github.repository == 'GoogleCloudPlatform/cluster-toolkit' && github.event.pull_request.draft == false runs-on: ubuntu-latest permissions: pull-requests: read From 4fd6f8a3bb934fbd9bc23eb9f0c2c211e46aeebf Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 18:10:29 +0000 Subject: [PATCH 036/102] Clean up big query load. 
--- .../modules/slurm_files/scripts/load_bq.py | 39 +++++++++++++------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index 9967069212..7bb8dc440b 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -15,6 +15,7 @@ import argparse +import math import os import shelve import uuid @@ -23,11 +24,10 @@ from pathlib import Path from pprint import pprint +import util from google.api_core import exceptions, retry from google.cloud import bigquery as bq from google.cloud.bigquery import SchemaField - -import util from util import lookup, run SACCT = "sacct" @@ -35,7 +35,9 @@ DEFAULT_TIMESTAMP_FILE = script.parent / "bq_timestamp" timestamp_file = Path(os.environ.get("TIMESTAMP_FILE", DEFAULT_TIMESTAMP_FILE)) -BQ_MAX_ROW_LOAD_SIZE = 10000 +# The maximum request to insert_rows is 10MB, each sacct row is about 1200 KB or ~ 8000 rows. +# Set to 5000 for a little wiggle room. +BQ_ROW_BATCH_SIZE = 5000 # cluster_id_file = script.parent / 'cluster_uuid' # try: @@ -282,6 +284,26 @@ def bq_submit(jobs): print(f"successfully loaded {len(jobs)} jobs") +def batched_bq_submit( + client, table, jobs, submit_function=bq_submit, bq_row_batch_size=BQ_ROW_BATCH_SIZE +): + """Submit sacct data in batches of size bq_row_batch_size + + Args: + jobs: A list of dictionaries of sacct accounting data. + submit_function: The method to submit the jobs to BigQuery with. Defaults to bq_submit. + bq_row_batch_size: The accounting data will be submitted to BigQuery in + batches of this size. + """ + num_batches = int(math.ceil(len(jobs) / bq_row_batch_size)) + print( + f"loading {num_batches} batches of BigQuery data in batches of size : {bq_row_batch_size}" + ) + for indx in range(0, len(jobs), bq_row_batch_size): + print(f"loading BigQuery data batch {indx} of {num_batches}") + submit_function(client, jobs[indx : indx + bq_row_batch_size]) + + def get_time_window(): if not timestamp_file.is_file(): timestamp_file.touch() @@ -321,16 +343,9 @@ def main(): # on failure, an exception will cause the timestamp not to be rewritten. So # it will try again next time. If some writes succeed, we don't currently # have a way to not submit duplicates next time. 
- print(f"loading BigQuery data in batches of size : {BQ_MAX_ROW_LOAD_SIZE}") - num_batches = (len(jobs) // BQ_MAX_ROW_LOAD_SIZE) + 1 - print(f"Number of batches: {num_batches}") if jobs: - start_job_idx = 0 - end_job_idx = BQ_MAX_ROW_LOAD_SIZE - for _ in range(num_batches): - bq_submit(jobs[start_job_idx:end_job_idx]) - start_job_idx = end_job_idx - end_job_idx += BQ_MAX_ROW_LOAD_SIZE + batched_bq_submit(client, table, jobs) + write_timestamp(end) update_job_idx_cache(jobs, end) From b235b474744a56bc7b27a4495b0bc82a4294c392 Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 18:12:08 +0000 Subject: [PATCH 037/102] Clean up big query load --- .../modules/slurm_files/scripts/load_bq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index 7bb8dc440b..ba10905b40 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -285,7 +285,7 @@ def bq_submit(jobs): def batched_bq_submit( - client, table, jobs, submit_function=bq_submit, bq_row_batch_size=BQ_ROW_BATCH_SIZE + jobs, submit_function=bq_submit, bq_row_batch_size=BQ_ROW_BATCH_SIZE ): """Submit sacct data in batches of size bq_row_batch_size From c5c55048c307142bb11100af7d638217867cb146 Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 18:50:26 +0000 Subject: [PATCH 038/102] Don't add a new method. --- .../modules/slurm_files/scripts/load_bq.py | 29 +++++-------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index ba10905b40..4540d99a4e 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -284,26 +284,6 @@ def bq_submit(jobs): print(f"successfully loaded {len(jobs)} jobs") -def batched_bq_submit( - jobs, submit_function=bq_submit, bq_row_batch_size=BQ_ROW_BATCH_SIZE -): - """Submit sacct data in batches of size bq_row_batch_size - - Args: - jobs: A list of dictionaries of sacct accounting data. - submit_function: The method to submit the jobs to BigQuery with. Defaults to bq_submit. - bq_row_batch_size: The accounting data will be submitted to BigQuery in - batches of this size. - """ - num_batches = int(math.ceil(len(jobs) / bq_row_batch_size)) - print( - f"loading {num_batches} batches of BigQuery data in batches of size : {bq_row_batch_size}" - ) - for indx in range(0, len(jobs), bq_row_batch_size): - print(f"loading BigQuery data batch {indx} of {num_batches}") - submit_function(client, jobs[indx : indx + bq_row_batch_size]) - - def get_time_window(): if not timestamp_file.is_file(): timestamp_file.touch() @@ -344,8 +324,13 @@ def main(): # it will try again next time. If some writes succeed, we don't currently # have a way to not submit duplicates next time. 
if jobs: - batched_bq_submit(client, table, jobs) - + num_batches = math.ceil(len(jobs) / BQ_ROW_BATCH_SIZE) + print( + f"loading {num_batches} batches of BigQuery data in batches of size : {BQ_ROW_BATCH_SIZE}" + ) + for indx in range(0, len(jobs), BQ_ROW_BATCH_SIZE): + print(f"loading BigQuery data batch {indx} of {num_batches}") + bq_submit(jobs[indx : indx + BQ_ROW_BATCH_SIZE]) write_timestamp(end) update_job_idx_cache(jobs, end) From 6699d000ce2d33893daa35484f1d8c07fbef44d6 Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 18:55:56 +0000 Subject: [PATCH 039/102] Remove test --- .../slurm_files/scripts/tests/test_load_bq.py | 39 ------------------- 1 file changed, 39 deletions(-) delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_load_bq.py diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_load_bq.py deleted file mode 100644 index ebe45008a0..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_load_bq.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2024 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - - -# test the chunking logic if not the BigQuery loads themselves -@pytest.mark.parametrize("num_jobs_to_load", (0, 11, 10001, 51131, 104321)) -def test_chunked_bq_load(num_jobs_to_load: int): - BQ_MAX_ROW_LOAD_SIZE = 10000 - jobs = [i + 1 for i in range(num_jobs_to_load)] - num_batches = (len(jobs) // BQ_MAX_ROW_LOAD_SIZE) + 1 - print(num_batches) - load_cache = [] - if jobs: - start_job_idx = 0 - end_job_idx = BQ_MAX_ROW_LOAD_SIZE - for _ in range(num_batches): - load_cache.append(jobs[start_job_idx:end_job_idx]) - start_job_idx = end_job_idx - end_job_idx += BQ_MAX_ROW_LOAD_SIZE - if jobs: - assert ( - sum([sum(x) for x in load_cache]) - == num_jobs_to_load * (num_jobs_to_load + 1) // 2 - ) - else: - assert sum([sum(x) for x in load_cache]) == 0 From 546bcd8c19ec5825ca9afcfb256c067a1da42187 Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 18:57:00 +0000 Subject: [PATCH 040/102] Fix comment --- .../modules/slurm_files/scripts/load_bq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index 4540d99a4e..f3e86bdf2f 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -35,7 +35,7 @@ DEFAULT_TIMESTAMP_FILE = script.parent / "bq_timestamp" timestamp_file = Path(os.environ.get("TIMESTAMP_FILE", DEFAULT_TIMESTAMP_FILE)) -# The maximum request to insert_rows is 10MB, each sacct row is about 1200 KB or ~ 8000 rows. +# The maximum request to insert_rows is 10MB, each sacct row is about 1200 bytes or ~ 8000 rows. # Set to 5000 for a little wiggle room. BQ_ROW_BATCH_SIZE = 5000 From ad6abc120573a8ade23f76ba262a196251c90236 Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 20:03:15 +0000 Subject: [PATCH 041/102] Use integer arithmetic for num batches --- .../modules/slurm_files/scripts/load_bq.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index f3e86bdf2f..0f14e06794 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -15,7 +15,6 @@ import argparse -import math import os import shelve import uuid @@ -324,7 +323,7 @@ def main(): # it will try again next time. If some writes succeed, we don't currently # have a way to not submit duplicates next time. 
if jobs: - num_batches = math.ceil(len(jobs) / BQ_ROW_BATCH_SIZE) + num_batches = (len(jobs) - 1) // BQ_ROW_BATCH_SIZE + 1 print( f"loading {num_batches} batches of BigQuery data in batches of size : {BQ_ROW_BATCH_SIZE}" ) From e142952a6532df68e0719189eee95da59ac4ac23 Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 20:36:11 +0000 Subject: [PATCH 042/102] Accurate logging --- .../modules/slurm_files/scripts/load_bq.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index 0f14e06794..8a6c59eaf2 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -327,9 +327,11 @@ def main(): print( f"loading {num_batches} batches of BigQuery data in batches of size : {BQ_ROW_BATCH_SIZE}" ) - for indx in range(0, len(jobs), BQ_ROW_BATCH_SIZE): - print(f"loading BigQuery data batch {indx} of {num_batches}") - bq_submit(jobs[indx : indx + BQ_ROW_BATCH_SIZE]) + for batch_indx, job_indx in enumerate(range(0, len(jobs), BQ_ROW_BATCH_SIZE)): + print( + f"loading BigQuery data batch {batch_indx} of {num_batches}. Loading rows {job_indx} to {job_indx + BQ_ROW_BATCH_SIZE}" + ) + bq_submit(jobs[job_indx : job_indx + BQ_ROW_BATCH_SIZE]) write_timestamp(end) update_job_idx_cache(jobs, end) From 69db3d6f8ae6a1b470b4dc11902d00899b36cb37 Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 20:37:55 +0000 Subject: [PATCH 043/102] Shorten log message --- .../modules/slurm_files/scripts/load_bq.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index 8a6c59eaf2..77df35748f 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -328,9 +328,7 @@ def main(): f"loading {num_batches} batches of BigQuery data in batches of size : {BQ_ROW_BATCH_SIZE}" ) for batch_indx, job_indx in enumerate(range(0, len(jobs), BQ_ROW_BATCH_SIZE)): - print( - f"loading BigQuery data batch {batch_indx} of {num_batches}. 
Loading rows {job_indx} to {job_indx + BQ_ROW_BATCH_SIZE}" - ) + print(f"loading BigQuery data batch {batch_indx} of {num_batches}") bq_submit(jobs[job_indx : job_indx + BQ_ROW_BATCH_SIZE]) write_timestamp(end) update_job_idx_cache(jobs, end) From 28daa5ddc8752d1abef9eda7daf520066c0e090f Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Tue, 8 Oct 2024 00:32:55 +0000 Subject: [PATCH 044/102] add GKE support for parallelstore through gke-storage module --- examples/gke-storage-parallelstore.yaml | 105 ++++++++++++++ modules/file-system/gke-storage/README.md | 129 +++++++++++++++++ modules/file-system/gke-storage/main.tf | 78 ++++++++++ modules/file-system/gke-storage/metadata.yaml | 18 +++ modules/file-system/gke-storage/outputs.tf | 27 ++++ .../parallelstore-pvc.yaml.tftpl | 15 ++ .../storage-class/parallelstore-sc.yaml.tftpl | 21 +++ modules/file-system/gke-storage/variables.tf | 134 ++++++++++++++++++ modules/file-system/gke-storage/versions.tf | 21 +++ modules/scheduler/gke-cluster/README.md | 20 +-- modules/scheduler/gke-cluster/main.tf | 11 ++ modules/scheduler/gke-cluster/variables.tf | 6 + modules/scheduler/gke-cluster/versions.tf | 4 + .../test-gke-storage-parallelstore.yml | 41 ++++++ .../builds/gke-storage-parallelstore.yaml | 60 ++++++++ .../tests/gke-storage-parallelstore.yml | 28 ++++ 16 files changed, 710 insertions(+), 8 deletions(-) create mode 100644 examples/gke-storage-parallelstore.yaml create mode 100644 modules/file-system/gke-storage/README.md create mode 100644 modules/file-system/gke-storage/main.tf create mode 100644 modules/file-system/gke-storage/metadata.yaml create mode 100644 modules/file-system/gke-storage/outputs.tf create mode 100644 modules/file-system/gke-storage/persistent-volume-claim/parallelstore-pvc.yaml.tftpl create mode 100644 modules/file-system/gke-storage/storage-class/parallelstore-sc.yaml.tftpl create mode 100644 modules/file-system/gke-storage/variables.tf create mode 100644 modules/file-system/gke-storage/versions.tf create mode 100644 tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml create mode 100644 tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml create mode 100644 tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml diff --git a/examples/gke-storage-parallelstore.yaml b/examples/gke-storage-parallelstore.yaml new file mode 100644 index 0000000000..daecc6657e --- /dev/null +++ b/examples/gke-storage-parallelstore.yaml @@ -0,0 +1,105 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +blueprint_name: gke-storage-parallelstore +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: gke-storage-parallelstore + region: us-central1 + zone: us-central1-c + + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. 
+ authorized_cidr: /32 + +deployment_groups: +- group: primary + modules: + - id: network + source: modules/network/vpc + settings: + subnetwork_name: gke-subnet-parallelstore + secondary_ranges: + gke-subnet-parallelstore: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: private_service_access # required for parallelstore + source: community/modules/network/private-service-access + use: [network] + settings: + prefix_length: 24 + + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network] + settings: + enable_parallelstore_csi: true # enable Parallelstore for the cluster + configure_workload_identity_sa: true + enable_private_endpoint: false # Allows for access from authorized public IPs + master_authorized_networks: + - display_name: deployment-machine + cidr_block: $(vars.authorized_cidr) + outputs: [instructions] + + ### Set up storage class and persistent volume claim for Parallelstore ### + - id: parallelstore-setup + source: modules/file-system/gke-storage + use: [gke_cluster, private_service_access] + settings: + storage_type: Parallelstore + access_mode: ReadWriteMany + sc_volume_binding_mode: Immediate + sc_reclaim_policy: Delete # Use Retain if you want to volume and parallelstore resource will remain after + sc_topology_zones: [$(vars.zone)] + pvc_count: 2 + capacity_gb: 12000 # from 12,000 GiB to 100,000 GiB, in multiples of 4,000 GiB + + - id: sample-pool + source: modules/compute/gke-node-pool + use: [gke_cluster] + settings: + name: sample-pool + zones: [$(vars.zone)] + machine_type: n2-standard-4 + + ### Parallelstore enabled Job ### + + - id: parallelstore-job + source: modules/compute/gke-job-template + use: + - gke_cluster + - parallelstore-setup + settings: + image: busybox + command: + - bin/sh + - -c + - | + echo "Set up job folders" + shopt -s extglob; JOB=${HOSTNAME%%-+([[:digit:]])} + mkdir /data/parallelstore-pvc-0/${JOB}/ -p; + mkdir /data/parallelstore-pvc-1/${JOB}/ -p; + + echo "Writing seed data to Parallelstore volumes" + dd if=/dev/urandom of=/data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.dat bs=1K count=1000 + dd if=/dev/urandom of=/data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.dat bs=1K count=1000 + + # echo "Hash file and write between the 2 hyerpdisk balanced volumes" + # md5sum /data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.dat > /data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.md5 + # md5sum /data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.dat > /data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.md5 + node_count: 5 + outputs: [instructions] diff --git a/modules/file-system/gke-storage/README.md b/modules/file-system/gke-storage/README.md new file mode 100644 index 0000000000..7fbef919a4 --- /dev/null +++ b/modules/file-system/gke-storage/README.md @@ -0,0 +1,129 @@ +## Description + +This module creates Kubernetes Storage Class (SC) that can be used by a Persistent Volume Claim (PVC) +to dynamically provision GCP storage resources like Parallelstore. + +### Example + +The following example uses the `gke-storage` module to creates a Parallelstore Storage Class and Peresistent Volume Claim, +then use them in a `gke-job-template` to dynamically provision the resource. 
+ +```yaml + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network] + settings: + enable_parallelstore_csi: true + + - id: private_service_access + source: community/modules/network/private-service-access + use: [network] + settings: + prefix_length: 24 + + - id: gke_storage + source: modules/file-system/gke-storage + use: [ gke_cluster, private_service_access ] + settings: + storage_type: Parallelstore + access_mode: ReadWriteMany + sc_volume_binding_mode: Immediate + sc_reclaim_policy: Delete + sc_topology_zones: [$(vars.zone)] + pvc_count: 2 + capacity_gb: 12000 + + - id: job_template + source: modules/compute/gke-job-template + use: [gke_storage, compute_pool] +``` + +See example +[gke-storage-parallelstore.yaml](../../../examples/README.md#gke-storage-parallelstoreyaml--) blueprint +for a complete example. + +### Authorized Network + +Since the `gke-storage` module is making calls to the Kubernetes API +to create Kubernetes entities, the machine performing the deployment must be +authorized to connect to the Kubernetes API. You can add the +`master_authorized_networks` settings block, as shown in the example above, with +the IP address of the machine performing the deployment. This will ensure that +the deploying machine can connect to the cluster. + +### Connecting Via Use + +The diagram below shows the valid `use` relationships for the GKE Cluster Toolkit +modules. For example the `gke-storage` module can `use` a +`gke-cluster` module and a `private_service_access` module, as shown in the example above. + +```mermaid +graph TD; + vpc-->|OneToMany|gke-cluster; + gke-cluster-->|OneToMany|gke-node-pool; + gke-node-pool-->|ManyToMany|gke-job-template; + gke-cluster-->|OneToMany|gke-storage; + gke-storage-->|ManyToMany|gke-job-template; +``` + +## License + + +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.0 | + +## Providers + +No providers. + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [kubectl\_apply](#module\_kubectl\_apply) | ../../management/kubectl-apply | n/a | + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [access\_mode](#input\_access\_mode) | The access mode that the volume can be mounted to the host/pod. More details in [Access Modes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes)
Valid access modes:
- ReadWriteOnce
- ReadOnlyMany
- ReadWriteMany
- ReadWriteOncePod | `string` | n/a | yes | +| [capacity\_gb](#input\_capacity\_gb) | The storage capacity with which to create the persistent volume. | `number` | n/a | yes | +| [cluster\_id](#input\_cluster\_id) | An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}` | `string` | n/a | yes | +| [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | +| [mount\_options](#input\_mount\_options) | Controls the mountOptions for dynamically provisioned PersistentVolumes of this storage class. | `string` | `null` | no | +| [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection.
If using a new VPC, please use community/modules/network/private-service-access to create private-service-access.
If using an existing VPC with private-service-access enabled, set this manually following the [user guide](https://cloud.google.com/parallelstore/docs/vpc). | `string` | `null` | no |
+| [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes |
+| [pv\_mount\_path](#input\_pv\_mount\_path) | Path within the container at which the volume should be mounted. Must not contain ':'. | `string` | `"/data"` | no |
+| [pvc\_count](#input\_pvc\_count) | How many PersistentVolumeClaims will be created | `number` | `1` | no |
+| [sc\_reclaim\_policy](#input\_sc\_reclaim\_policy) | Indicates whether to keep the dynamically provisioned PersistentVolumes of this storage class after the bound PersistentVolumeClaim is deleted.
[More details about reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming)
Supported values:
- Retain
- Delete | `string` | n/a | yes |
+| [sc\_topology\_zones](#input\_sc\_topology\_zones) | Zone locations that allow the volumes to be dynamically provisioned. | `list(string)` | `null` | no |
+| [sc\_volume\_binding\_mode](#input\_sc\_volume\_binding\_mode) | Indicates when volume binding and dynamic provisioning should occur and how PersistentVolumeClaims should be provisioned and bound.
Supported values:
- Immediate
- WaitForFirstConsumer | `string` | `"WaitForFirstConsumer"` | no | +| [storage\_type](#input\_storage\_type) | The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview)
to be used. This module currently supports dynamic provisioning for the below storage options
- Parallelstore
- Hyperdisk-balanced
- Hyperdisk-throughput
- Hyperdisk-extreme | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [persistent\_volume\_claims](#output\_persistent\_volume\_claims) | An object that describes a k8s PVC created by this module. | + diff --git a/modules/file-system/gke-storage/main.tf b/modules/file-system/gke-storage/main.tf new file mode 100644 index 0000000000..18f85fa779 --- /dev/null +++ b/modules/file-system/gke-storage/main.tf @@ -0,0 +1,78 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + # This label allows for billing report tracking based on module. + labels = merge(var.labels, { ghpc_module = "gke-storage", ghpc_role = "file-system" }) +} + +locals { + storage_type = lower(var.storage_type) + storage_class_name = "${local.storage_type}-sc" + pvc_name_prefix = "${local.storage_type}-pvc" +} + +check "private_vpc_connection_peering" { + assert { + condition = lower(var.storage_type) != "parallelstore" ? true : var.private_vpc_connection_peering != null + error_message = <<-EOT + Parallelstore must be run within the same VPC as the GKE cluster and have private services access enabled. + If using new VPC, please use community/modules/network/private-service-access to create private-service-access. + If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). + EOT + } +} + +module "kubectl_apply" { + source = "../../management/kubectl-apply" + + cluster_id = var.cluster_id + project_id = var.project_id + + # count = var.pvc_count + apply_manifests = flatten( + [ + # create StorageClass in the cluster + { + content = templatefile( + "${path.module}/storage-class/${local.storage_class_name}.yaml.tftpl", + { + name = local.storage_class_name + labels = local.labels + volume_binding_mode = var.sc_volume_binding_mode + reclaim_policy = var.sc_reclaim_policy + topology_zones = var.sc_topology_zones + }) + }, + # create PersistentVolumeClaim in the cluster + flatten([ + for idx in range(var.pvc_count) : [ + { + content = templatefile( + "${path.module}/persistent-volume-claim/${(local.pvc_name_prefix)}.yaml.tftpl", + { + pvc_name = "${local.pvc_name_prefix}-${idx}" + labels = local.labels + capacity = "${var.capacity_gb}Gi" + access_mode = var.access_mode + storage_class_name = local.storage_class_name + } + ) + } + ] + ]) + ]) +} diff --git a/modules/file-system/gke-storage/metadata.yaml b/modules/file-system/gke-storage/metadata.yaml new file mode 100644 index 0000000000..8722823274 --- /dev/null +++ b/modules/file-system/gke-storage/metadata.yaml @@ -0,0 +1,18 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +spec: + requirements: + services: [] diff --git a/modules/file-system/gke-storage/outputs.tf b/modules/file-system/gke-storage/outputs.tf new file mode 100644 index 0000000000..b789674814 --- /dev/null +++ b/modules/file-system/gke-storage/outputs.tf @@ -0,0 +1,27 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +output "persistent_volume_claims" { + description = "An object that describes a k8s PVC created by this module." + value = flatten([ + for idx in range(var.pvc_count) : [{ + name = "${local.pvc_name_prefix}-${idx}" + mount_path = "${var.pv_mount_path}/${local.pvc_name_prefix}-${idx}" + mount_options = var.mount_options + is_gcs = false + }] + ]) +} diff --git a/modules/file-system/gke-storage/persistent-volume-claim/parallelstore-pvc.yaml.tftpl b/modules/file-system/gke-storage/persistent-volume-claim/parallelstore-pvc.yaml.tftpl new file mode 100644 index 0000000000..32781be2fb --- /dev/null +++ b/modules/file-system/gke-storage/persistent-volume-claim/parallelstore-pvc.yaml.tftpl @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ${pvc_name} + labels: + %{~ for key, val in labels ~} + ${key}: ${val} + %{~ endfor ~} +spec: + accessModes: + - ${access_mode} + resources: + requests: + storage: ${capacity} + storageClassName: ${storage_class_name} diff --git a/modules/file-system/gke-storage/storage-class/parallelstore-sc.yaml.tftpl b/modules/file-system/gke-storage/storage-class/parallelstore-sc.yaml.tftpl new file mode 100644 index 0000000000..e6b8ea8d3e --- /dev/null +++ b/modules/file-system/gke-storage/storage-class/parallelstore-sc.yaml.tftpl @@ -0,0 +1,21 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: ${name} + labels: + %{~ for key, val in labels ~} + ${key}: ${val} + %{~ endfor ~} +provisioner: parallelstore.csi.storage.gke.io +parameters: +volumeBindingMode: ${volume_binding_mode} +reclaimPolicy: ${reclaim_policy} + %{~ if topology_zones != null ~} +allowedTopologies: +- matchLabelExpressions: + - key: topology.gke.io/zone + values: + %{~ for z in topology_zones ~} + - ${z} + %{~ endfor ~} + %{~ endif ~} diff --git a/modules/file-system/gke-storage/variables.tf b/modules/file-system/gke-storage/variables.tf new file mode 100644 index 0000000000..97ff1af21b --- /dev/null +++ b/modules/file-system/gke-storage/variables.tf @@ -0,0 +1,134 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +variable "project_id" { + description = "The project ID to host the cluster in." + type = string +} + +variable "cluster_id" { + description = "An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}`" + type = string +} + +variable "labels" { + description = "GCE resource labels to be applied to resources. Key-value pairs." + type = map(string) +} + +variable "storage_type" { + description = <<-EOT + The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview) + to used. This module currently support dynamic provisioning for the below storage options + - Parallelstore + - Hyperdisk-balanced + - Hyperdisk-throughput + - Hyperdisk-extreme + EOT + type = string + nullable = false + validation { + condition = var.storage_type == null ? false : contains(["parallelstore", "hyperdisk-balanced", "hyperdisk-throughput", "hyperdisk-extreme"], lower(var.storage_type)) + error_message = "Allowed string values for var.storage_type are \"Parallelstore\", \"Hyperdisk-balanced\", \"Hyperdisk-throughput\", \"Hyperdisk-extreme\"." + } +} + +variable "access_mode" { + description = <<-EOT + The access mode that the volume can be mounted to the host/pod. More details in [Access Modes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes) + Valid access modes: + - ReadWriteOnce + - ReadOnlyMany + - ReadWriteMany + - ReadWriteOncePod + EOT + type = string + nullable = false + validation { + condition = var.access_mode == null ? false : contains(["readwriteonce", "readonlymany", "readwritemany", "readwriteoncepod"], lower(var.access_mode)) + error_message = "Allowed string values for var.access_mode are \"ReadWriteOnce\", \"ReadOnlyMany\", \"ReadWriteMany\", \"ReadWriteOncePod\"." + } +} + +variable "sc_volume_binding_mode" { + description = <<-EOT + Indicates when volume binding and dynamic provisioning should occur and how PersistentVolumeClaims should be provisioned and bound. + Supported value: + - Immediate + - WaitForFirstConsumer + EOT + type = string + default = "WaitForFirstConsumer" + validation { + condition = var.sc_volume_binding_mode == null ? true : contains(["immediate", "waitforfirstconsumer"], lower(var.sc_volume_binding_mode)) + error_message = "Allowed string values for var.sc_volume_binding_mode are \"Immediate\", \"WaitForFirstConsumer\"." + } +} + +variable "sc_reclaim_policy" { + description = <<-EOT + Indicate whether to keep the dynamically provisioned PersistentVolumes of this storage class after the bound PersistentVolumeClaim is deleted. + [More details about reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming) + Supported value: + - Retain + - Delete + EOT + type = string + nullable = false + validation { + condition = var.sc_reclaim_policy == null ? true : contains(["retain", "delete"], lower(var.sc_reclaim_policy)) + error_message = "Allowed string values for var.sc_reclaim_policy are \"Retain\", \"Delete\"." 
+ } +} + +variable "sc_topology_zones" { + description = "Zone location that allow the volumes to be dynamically provisioned." + type = list(string) + default = null +} + +variable "pvc_count" { + description = "How many PersistentVolumeClaims that will be created" + type = number + default = 1 +} + +variable "pv_mount_path" { + description = "Path within the container at which the volume should be mounted. Must not contain ':'." + type = string + default = "/data" +} + +variable "mount_options" { + description = "Controls the mountOptions for dynamically provisioned PersistentVolumes of this storage class." + type = string + default = null +} + +variable "capacity_gb" { + description = "The storage capacity with which to create the persistent volume." + type = number +} + +variable "private_vpc_connection_peering" { + description = <<-EOT + The name of the VPC Network peering connection. + If using new VPC, please use community/modules/network/private-service-access to create private-service-access and + If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). + EOT + type = string + default = null +} diff --git a/modules/file-system/gke-storage/versions.tf b/modules/file-system/gke-storage/versions.tf new file mode 100644 index 0000000000..0a1082c515 --- /dev/null +++ b/modules/file-system/gke-storage/versions.tf @@ -0,0 +1,21 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_version = ">= 1.0" + + provider_meta "google" { + module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.39.0" + } +} diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 583af203da..554517f8d4 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -110,6 +110,7 @@ limitations under the License. | [google](#requirement\_google) | > 5.0 | | [google-beta](#requirement\_google-beta) | > 5.0 | | [kubernetes](#requirement\_kubernetes) | ~> 2.23 | +| [null](#requirement\_null) | ~> 3.0 | ## Providers @@ -117,6 +118,7 @@ limitations under the License. |------|---------| | [google](#provider\_google) | > 5.0 | | [google-beta](#provider\_google-beta) | > 5.0 | +| [null](#provider\_null) | ~> 3.0 | ## Modules @@ -137,6 +139,7 @@ limitations under the License. 
| [google_project_iam_member.node_service_account_metric_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | | [google_project_iam_member.node_service_account_monitoring_viewer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | | [google_project_iam_member.node_service_account_resource_metadata_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | +| [null_resource.enable_parallelstore_csi](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | | [google_compute_default_service_account.default_sa](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | @@ -144,7 +147,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks enables multi networking and creates relevat network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks enables multi networking and creates relevat network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [authenticator\_security\_group](#input\_authenticator\_security\_group) | The name of the RBAC security group for use with Google security groups in Kubernetes RBAC. Group name must be in format gke-security-groups@yourdomain.com | `string` | `null` | no | | [autoscaling\_profile](#input\_autoscaling\_profile) | (Beta) Optimize for utilization or availability when deciding to remove nodes. Can be BALANCED or OPTIMIZE\_UTILIZATION. | `string` | `"OPTIMIZE_UTILIZATION"` | no | | [configure\_workload\_identity\_sa](#input\_configure\_workload\_identity\_sa) | When true, a kubernetes service account will be created and bound using workload identity to the service account used to create the cluster. | `bool` | `false` | no | @@ -154,15 +157,16 @@ limitations under the License. | [enable\_gcsfuse\_csi](#input\_enable\_gcsfuse\_csi) | The status of the GCSFuse Filestore Container Storage Interface (CSI) driver addon, which allows the usage of a gcs bucket as volumes. | `bool` | `false` | no | | [enable\_master\_global\_access](#input\_enable\_master\_global\_access) | Whether the cluster master is accessible globally (from any region) or only within the same region as the private endpoint. | `bool` | `false` | no | | [enable\_multi\_networking](#input\_enable\_multi\_networking) | Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) (Requires GKE Enterprise). This setting is immutable on clusters and enables [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en). If null, will determine state based on if additional\_networks are passed in. | `bool` | `null` | no | +| [enable\_parallelstore\_csi](#input\_enable\_parallelstore\_csi) | The status of the Google Compute Engine Parallelstore Container Storage Interface (CSI) driver addon, which allows the usage of a parallelstore as volumes. | `bool` | `false` | no | | [enable\_persistent\_disk\_csi](#input\_enable\_persistent\_disk\_csi) | The status of the Google Compute Engine Persistent Disk Container Storage Interface (CSI) driver addon, which allows the usage of a PD as volumes. | `bool` | `true` | no | | [enable\_private\_endpoint](#input\_enable\_private\_endpoint) | (Beta) Whether the master's internal IP address is used as the cluster endpoint. | `bool` | `true` | no | | [enable\_private\_ipv6\_google\_access](#input\_enable\_private\_ipv6\_google\_access) | The private IPv6 google access type for the VMs in this subnet. | `bool` | `true` | no | | [enable\_private\_nodes](#input\_enable\_private\_nodes) | (Beta) Whether nodes have internal IP addresses only. | `bool` | `true` | no | | [gcp\_public\_cidrs\_access\_enabled](#input\_gcp\_public\_cidrs\_access\_enabled) | Whether the cluster master is accessible via all the Google Compute Engine Public IPs. To view this list of IP addresses look here https://cloud.google.com/compute/docs/faq#find_ip_range | `bool` | `false` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [maintenance\_exclusions](#input\_maintenance\_exclusions) | List of maintenance exclusions. A cluster can have up to three. |
list(object({
name = string
start_time = string
end_time = string
exclusion_scope = string
}))
| `[]` | no | +| [maintenance\_exclusions](#input\_maintenance\_exclusions) | List of maintenance exclusions. A cluster can have up to three. |
list(object({
name = string
start_time = string
end_time = string
exclusion_scope = string
}))
| `[]` | no | | [maintenance\_start\_time](#input\_maintenance\_start\_time) | Start time for daily maintenance operations. Specified in GMT with `HH:MM` format. | `string` | `"09:00"` | no | -| [master\_authorized\_networks](#input\_master\_authorized\_networks) | External network that can access Kubernetes master through HTTPS. Must be specified in CIDR notation. |
list(object({
cidr_block = string
display_name = string
}))
| `[]` | no | +| [master\_authorized\_networks](#input\_master\_authorized\_networks) | External network that can access Kubernetes master through HTTPS. Must be specified in CIDR notation. |
list(object({
cidr_block = string
display_name = string
}))
| `[]` | no | | [master\_ipv4\_cidr\_block](#input\_master\_ipv4\_cidr\_block) | (Beta) The IP range in CIDR notation to use for the hosted master network. | `string` | `"172.16.0.32/28"` | no | | [min\_master\_version](#input\_min\_master\_version) | The minimum version of the master. If unset, the cluster's version will be set by GKE to the version of the most recent official release. | `string` | `null` | no | | [name\_suffix](#input\_name\_suffix) | Custom cluster name postpended to the `deployment_name`. See `prefix_with_deployment_name`. | `string` | `""` | no | @@ -172,19 +176,19 @@ limitations under the License. | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [region](#input\_region) | The region to host the cluster in. | `string` | n/a | yes | | [release\_channel](#input\_release\_channel) | The release channel of this cluster. Accepted values are `UNSPECIFIED`, `RAPID`, `REGULAR` and `STABLE`. | `string` | `"UNSPECIFIED"` | no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the system node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the system node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the system node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [services\_ip\_range\_name](#input\_services\_ip\_range\_name) | The name of the secondary subnet range to use for services. | `string` | `"services"` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork to host the cluster in. | `string` | n/a | yes | | [system\_node\_pool\_enable\_secure\_boot](#input\_system\_node\_pool\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [system\_node\_pool\_enabled](#input\_system\_node\_pool\_enabled) | Create a system node pool. | `bool` | `true` | no | | [system\_node\_pool\_image\_type](#input\_system\_node\_pool\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | -| [system\_node\_pool\_kubernetes\_labels](#input\_system\_node\_pool\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | +| [system\_node\_pool\_kubernetes\_labels](#input\_system\_node\_pool\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [system\_node\_pool\_machine\_type](#input\_system\_node\_pool\_machine\_type) | Machine type for the system node pool. | `string` | `"e2-standard-4"` | no | | [system\_node\_pool\_name](#input\_system\_node\_pool\_name) | Name of the system node pool. | `string` | `"system"` | no | -| [system\_node\_pool\_node\_count](#input\_system\_node\_pool\_node\_count) | The total min and max nodes to be maintained in the system node pool. |
object({
total_min_nodes = number
total_max_nodes = number
})
|
{
"total_max_nodes": 10,
"total_min_nodes": 2
}
| no | -| [system\_node\_pool\_taints](#input\_system\_node\_pool\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "components.gke.io/gke-managed-components",
"value": true
}
]
| no | +| [system\_node\_pool\_node\_count](#input\_system\_node\_pool\_node\_count) | The total min and max nodes to be maintained in the system node pool. |
object({
total_min_nodes = number
total_max_nodes = number
})
|
{
"total_max_nodes": 10,
"total_min_nodes": 2
}
| no | +| [system\_node\_pool\_taints](#input\_system\_node\_pool\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "components.gke.io/gke-managed-components",
"value": true
}
]
| no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 480d5b7d58..698beea442 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -267,6 +267,17 @@ resource "google_container_node_pool" "system_node_pools" { } } +### TODO: remove this after Terraform support for GKE Parallelstore CSI is added. ### +### Instead use addons_config above to enable the CSI ### +resource "null_resource" "enable_parallelstore_csi" { + count = var.enable_parallelstore_csi == true ? 1 : 0 + + provisioner "local-exec" { + command = "gcloud container clusters update ${local.name} --location=${var.region} --project=${var.project_id} --update-addons=ParallelstoreCsiDriver=ENABLED" + } + depends_on = [google_container_node_pool.system_node_pools] # avoid cluster operation conflict +} + # For container logs to show up under Cloud Logging and GKE metrics to show up # on Cloud Monitoring console, some project level roles are needed for the # node_service_account diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index e91be6b297..a291d58a1a 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -127,6 +127,12 @@ variable "enable_persistent_disk_csi" { default = true } +variable "enable_parallelstore_csi" { + description = "The status of the Google Compute Engine Parallelstore Container Storage Interface (CSI) driver addon, which allows the usage of a parallelstore as volumes." + type = bool + default = false +} + variable "system_node_pool_enabled" { description = "Create a system node pool." type = bool diff --git a/modules/scheduler/gke-cluster/versions.tf b/modules/scheduler/gke-cluster/versions.tf index d2fe8dd057..ad17fe1c43 100644 --- a/modules/scheduler/gke-cluster/versions.tf +++ b/modules/scheduler/gke-cluster/versions.tf @@ -28,6 +28,10 @@ terraform { source = "hashicorp/kubernetes" version = "~> 2.23" } + null = { + source = "hashicorp/null" + version = "~> 3.0" + } } provider_meta "google" { module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.40.0" diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml new file mode 100644 index 0000000000..424908f436 --- /dev/null +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml @@ -0,0 +1,41 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +- name: Get cluster credentials for kubectl + delegate_to: localhost + ansible.builtin.command: gcloud container clusters get-credentials {{ deployment_name }} --region {{ cli_deployment_vars.region }} --project {{ custom_vars.project }} + +- name: Execute the job + delegate_to: localhost + ansible.builtin.shell: | + jobs=({{ workspace }}/{{ deployment_name }}/primary/my-job*) + for job in "${jobs[@]}"; do + kubectl create -f "$job" + done + args: + executable: /bin/bash + changed_when: False + +- name: Wait for job to complete + delegate_to: localhost + ansible.builtin.command: | + kubectl get job --field-selector status.successful=5 + register: job_completion + until: job_completion.stdout_lines | length > 1 + retries: 40 + delay: 15 + +- name: Print job_completion debug output + ansible.builtin.debug: + var: job_completion.stdout_lines diff --git a/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml b/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml new file mode 100644 index 0000000000..1a6a5873cf --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml @@ -0,0 +1,60 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +tags: +- m.gke-cluster +- m.gke-job-template +- m.gke-node-pool +- m.gke-storage +- m.private-service-access +- m.vpc +- gke + +timeout: 14400s # 4hr + +steps: +## Test GKE +- id: gke-storage-parallelstore + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + SG_EXAMPLE=examples/gke-storage-parallelstore.yaml + + # adding vm to act as remote node + echo ' - id: remote-node' >> $${SG_EXAMPLE} + echo ' source: modules/compute/vm-instance' >> $${SG_EXAMPLE} + echo ' use: [network]' >> $${SG_EXAMPLE} + echo ' settings:' >> $${SG_EXAMPLE} + echo ' machine_type: e2-standard-2' >> $${SG_EXAMPLE} + echo ' zone: us-central1-a' >> $${SG_EXAMPLE} + + # avoids conflict with other tests + sed -i "s/gke-subnet/gke-subnet-$${BUILD_ID_SHORT}/" $${SG_EXAMPLE} + + IP=$(curl ifconfig.me) + sed -i "s//$${IP}/" $${SG_EXAMPLE} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml" diff --git a/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml b/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml new file mode 100644 index 0000000000..6a43c01ab3 --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml @@ -0,0 +1,28 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +test_name: gke-storage-parallelstore +deployment_name: gke-storage-parallelstore-{{ build }} +zone: us-central1-a # for remote node +region: us-central1 +workspace: /workspace +blueprint_yaml: "{{ workspace }}/examples/gke-storage-parallelstore.yaml" +network: "{{ deployment_name }}-net" +remote_node: "{{ deployment_name }}-0" +post_deploy_tests: +- test-validation/test-gke-storage-parallelstore.yml +custom_vars: + project: "{{ project }}" +cli_deployment_vars: + region: "{{ region }}" From 63a68c05aabdbdaca4c12a4021e8abb4cc7540d1 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Tue, 8 Oct 2024 00:54:33 +0000 Subject: [PATCH 045/102] undo nccl test instruction to clean up the branch --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/outputs.tf | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 597f28cfe0..9f86002f8c 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -295,7 +295,7 @@ limitations under the License. | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/outputs.tf b/modules/compute/gke-node-pool/outputs.tf index 58216e957f..8be6a2772a 100644 --- a/modules/compute/gke-node-pool/outputs.tf +++ b/modules/compute/gke-node-pool/outputs.tf @@ -80,14 +80,6 @@ locals { You can use the following commands to submit the sample job: kubectl create -f ${abspath(local.gpu_direct_setting.updated_workload_path)} - After submitting the sample job, you can validate the GPU performance by initiating NCCL test included in the sample workload: - NCCL test can be initiated from any one of the sample job Pods and coordinate with the peer Pods: - export POD_NAME=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}' | head -n 1) - export PEER_POD_IPS=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.status.podIP}}{{" "}}{{end}}') - kubectl exec --stdin --tty --container=nccl-test $POD_NAME -- /scripts/allgather.sh $PEER_POD_IPS - Depends on the Msg size used for transmission in the test, the busbw would different a bit. 
- For a3-highgpu machines, the expected busbw for MsgSize of 8G data should be around 80 GB/s - For a3-megagpu machines, the expected busbw for MsgSize of 8G data should be around 160 GB/s If you would like to enable GPUDirect for your own workload, please follow the below steps: export WORKLOAD_PATH=<> From 061ce66d7339f8694b01f08703d4c94cef6f334c Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Mon, 7 Oct 2024 12:43:27 -0700 Subject: [PATCH 046/102] Retry `wait-for-startup` script on internal error --- .../scripts/wait-for-startup-status.sh | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/community/modules/scripts/wait-for-startup/scripts/wait-for-startup-status.sh b/community/modules/scripts/wait-for-startup/scripts/wait-for-startup-status.sh index 7b957bf66b..4a231f7def 100755 --- a/community/modules/scripts/wait-for-startup/scripts/wait-for-startup-status.sh +++ b/community/modules/scripts/wait-for-startup/scripts/wait-for-startup-status.sh @@ -60,14 +60,29 @@ FINISH_LINE="startup-script exit status" # Match string for failures on the new guest agent FINISH_LINE_ERR="Script.*failed with error:" +NON_FATAL_ERRORS=( + "Internal error" +) + until [[ now -gt deadline ]]; do ser_log=$( set -o pipefail ${fetch_cmd} 2>"${error_file}" | c1grep "${FINISH_LINE}\|${FINISH_LINE_ERR}" ) || { - cat "${error_file}" - exit 1 + err=$(cat "${error_file}") + echo "$err" + fatal_error="true" + for e in "${NON_FATAL_ERRORS[@]}"; do + if [[ $err = *"$e"* ]]; then + fatal_error="false" + break + fi + done + + if [[ $fatal_error = "true" ]]; then + exit 1 + fi } if [[ -n "${ser_log}" ]]; then break; fi echo "Could not detect end of startup script. Sleeping." From 1e177828603138773f2ac0625ee0fe5fec576552 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Tue, 8 Oct 2024 02:32:49 +0000 Subject: [PATCH 047/102] upgrade local terraform-doc version --- .../gke-persistent-volume/variables.tf | 2 +- modules/file-system/gke-storage/README.md | 10 +++++----- modules/file-system/gke-storage/variables.tf | 2 +- modules/scheduler/gke-cluster/README.md | 16 ++++++++-------- modules/scheduler/gke-cluster/variables.tf | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/modules/file-system/gke-persistent-volume/variables.tf b/modules/file-system/gke-persistent-volume/variables.tf index a72fa3857f..80e21d0b8f 100644 --- a/modules/file-system/gke-persistent-volume/variables.tf +++ b/modules/file-system/gke-persistent-volume/variables.tf @@ -57,6 +57,6 @@ variable "capacity_gb" { } variable "labels" { - description = "GCE resource labels to be applied to resources. Key-value pairs." + description = "GCE resource labels to be applied to resources. Key-value pairs. " type = map(string) } diff --git a/modules/file-system/gke-storage/README.md b/modules/file-system/gke-storage/README.md index 7fbef919a4..1a63731e4c 100644 --- a/modules/file-system/gke-storage/README.md +++ b/modules/file-system/gke-storage/README.md @@ -107,19 +107,19 @@ No resources. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [access\_mode](#input\_access\_mode) | The access mode that the volume can be mounted to the host/pod. More details in [Access Modes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes)
Valid access modes:
- ReadWriteOnce
- ReadOnlyMany
- ReadWriteMany
- ReadWriteOncePod | `string` | n/a | yes | +| [access\_mode](#input\_access\_mode) | The access mode that the volume can be mounted to the host/pod. More details in [Access Modes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes)
Valid access modes:
- ReadWriteOnce
- ReadOnlyMany
- ReadWriteMany
- ReadWriteOncePod | `string` | n/a | yes | | [capacity\_gb](#input\_capacity\_gb) | The storage capacity with which to create the persistent volume. | `number` | n/a | yes | | [cluster\_id](#input\_cluster\_id) | An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}` | `string` | n/a | yes | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [mount\_options](#input\_mount\_options) | Controls the mountOptions for dynamically provisioned PersistentVolumes of this storage class. | `string` | `null` | no | -| [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection.
If using new VPC, please use community/modules/network/private-service-access to create private-service-access and
If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). | `string` | `null` | no | +| [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection .
If using new VPC, please use community/modules/network/private-service-access to create private-service-access and
If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). | `string` | `null` | no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [pv\_mount\_path](#input\_pv\_mount\_path) | Path within the container at which the volume should be mounted. Must not contain ':'. | `string` | `"/data"` | no | | [pvc\_count](#input\_pvc\_count) | How many PersistentVolumeClaims that will be created | `number` | `1` | no | -| [sc\_reclaim\_policy](#input\_sc\_reclaim\_policy) | Indicate whether to keep the dynamically provisioned PersistentVolumes of this storage class after the bound PersistentVolumeClaim is deleted.
[More details about reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming)
Supported value:
- Retain
- Delete | `string` | n/a | yes | +| [sc\_reclaim\_policy](#input\_sc\_reclaim\_policy) | Indicate whether to keep the dynamically provisioned PersistentVolumes of this storage class after the bound PersistentVolumeClaim is deleted.
[More details about reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming)
Supported value:
- Retain
- Delete | `string` | n/a | yes | | [sc\_topology\_zones](#input\_sc\_topology\_zones) | Zone location that allow the volumes to be dynamically provisioned. | `list(string)` | `null` | no | -| [sc\_volume\_binding\_mode](#input\_sc\_volume\_binding\_mode) | Indicates when volume binding and dynamic provisioning should occur and how PersistentVolumeClaims should be provisioned and bound.
Supported value:
- Immediate
- WaitForFirstConsumer | `string` | `"WaitForFirstConsumer"` | no | -| [storage\_type](#input\_storage\_type) | The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview)
to used. This module currently support dynamic provisioning for the below storage options
- Parallelstore
- Hyperdisk-balanced
- Hyperdisk-throughput
- Hyperdisk-extreme | `string` | n/a | yes | +| [sc\_volume\_binding\_mode](#input\_sc\_volume\_binding\_mode) | Indicates when volume binding and dynamic provisioning should occur and how PersistentVolumeClaims should be provisioned and bound.
Supported value:
- Immediate
- WaitForFirstConsumer | `string` | `"WaitForFirstConsumer"` | no | +| [storage\_type](#input\_storage\_type) | The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview)
to used. This module currently support dynamic provisioning for the below storage options
- Parallelstore
- Hyperdisk-balanced
- Hyperdisk-throughput
- Hyperdisk-extreme | `string` | n/a | yes | ## Outputs diff --git a/modules/file-system/gke-storage/variables.tf b/modules/file-system/gke-storage/variables.tf index 97ff1af21b..3fd672699f 100644 --- a/modules/file-system/gke-storage/variables.tf +++ b/modules/file-system/gke-storage/variables.tf @@ -125,7 +125,7 @@ variable "capacity_gb" { variable "private_vpc_connection_peering" { description = <<-EOT - The name of the VPC Network peering connection. + The name of the VPC Network peering connection . If using new VPC, please use community/modules/network/private-service-access to create private-service-access and If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). EOT diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 554517f8d4..78974d091d 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -147,7 +147,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks enables multi networking and creates relevat network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks enables multi networking and creates relevat network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [authenticator\_security\_group](#input\_authenticator\_security\_group) | The name of the RBAC security group for use with Google security groups in Kubernetes RBAC. Group name must be in format gke-security-groups@yourdomain.com | `string` | `null` | no | | [autoscaling\_profile](#input\_autoscaling\_profile) | (Beta) Optimize for utilization or availability when deciding to remove nodes. Can be BALANCED or OPTIMIZE\_UTILIZATION. | `string` | `"OPTIMIZE_UTILIZATION"` | no | | [configure\_workload\_identity\_sa](#input\_configure\_workload\_identity\_sa) | When true, a kubernetes service account will be created and bound using workload identity to the service account used to create the cluster. | `bool` | `false` | no | @@ -164,9 +164,9 @@ limitations under the License. | [enable\_private\_nodes](#input\_enable\_private\_nodes) | (Beta) Whether nodes have internal IP addresses only. | `bool` | `true` | no | | [gcp\_public\_cidrs\_access\_enabled](#input\_gcp\_public\_cidrs\_access\_enabled) | Whether the cluster master is accessible via all the Google Compute Engine Public IPs. To view this list of IP addresses look here https://cloud.google.com/compute/docs/faq#find_ip_range | `bool` | `false` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [maintenance\_exclusions](#input\_maintenance\_exclusions) | List of maintenance exclusions. A cluster can have up to three. |
list(object({
name = string
start_time = string
end_time = string
exclusion_scope = string
}))
| `[]` | no | +| [maintenance\_exclusions](#input\_maintenance\_exclusions) | List of maintenance exclusions. A cluster can have up to three. |
list(object({
name = string
start_time = string
end_time = string
exclusion_scope = string
}))
| `[]` | no | | [maintenance\_start\_time](#input\_maintenance\_start\_time) | Start time for daily maintenance operations. Specified in GMT with `HH:MM` format. | `string` | `"09:00"` | no | -| [master\_authorized\_networks](#input\_master\_authorized\_networks) | External network that can access Kubernetes master through HTTPS. Must be specified in CIDR notation. |
list(object({
cidr_block = string
display_name = string
}))
| `[]` | no | +| [master\_authorized\_networks](#input\_master\_authorized\_networks) | External network that can access Kubernetes master through HTTPS. Must be specified in CIDR notation. |
list(object({
cidr_block = string
display_name = string
}))
| `[]` | no | | [master\_ipv4\_cidr\_block](#input\_master\_ipv4\_cidr\_block) | (Beta) The IP range in CIDR notation to use for the hosted master network. | `string` | `"172.16.0.32/28"` | no | | [min\_master\_version](#input\_min\_master\_version) | The minimum version of the master. If unset, the cluster's version will be set by GKE to the version of the most recent official release. | `string` | `null` | no | | [name\_suffix](#input\_name\_suffix) | Custom cluster name postpended to the `deployment_name`. See `prefix_with_deployment_name`. | `string` | `""` | no | @@ -176,19 +176,19 @@ limitations under the License. | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [region](#input\_region) | The region to host the cluster in. | `string` | n/a | yes | | [release\_channel](#input\_release\_channel) | The release channel of this cluster. Accepted values are `UNSPECIFIED`, `RAPID`, `REGULAR` and `STABLE`. | `string` | `"UNSPECIFIED"` | no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the system node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the system node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the system node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [services\_ip\_range\_name](#input\_services\_ip\_range\_name) | The name of the secondary subnet range to use for services. | `string` | `"services"` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork to host the cluster in. | `string` | n/a | yes | | [system\_node\_pool\_enable\_secure\_boot](#input\_system\_node\_pool\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [system\_node\_pool\_enabled](#input\_system\_node\_pool\_enabled) | Create a system node pool. | `bool` | `true` | no | | [system\_node\_pool\_image\_type](#input\_system\_node\_pool\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | -| [system\_node\_pool\_kubernetes\_labels](#input\_system\_node\_pool\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | +| [system\_node\_pool\_kubernetes\_labels](#input\_system\_node\_pool\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [system\_node\_pool\_machine\_type](#input\_system\_node\_pool\_machine\_type) | Machine type for the system node pool. | `string` | `"e2-standard-4"` | no | | [system\_node\_pool\_name](#input\_system\_node\_pool\_name) | Name of the system node pool. | `string` | `"system"` | no | -| [system\_node\_pool\_node\_count](#input\_system\_node\_pool\_node\_count) | The total min and max nodes to be maintained in the system node pool. |
object({
total_min_nodes = number
total_max_nodes = number
})
|
{
"total_max_nodes": 10,
"total_min_nodes": 2
}
| no | -| [system\_node\_pool\_taints](#input\_system\_node\_pool\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "components.gke.io/gke-managed-components",
"value": true
}
]
| no | +| [system\_node\_pool\_node\_count](#input\_system\_node\_pool\_node\_count) | The total min and max nodes to be maintained in the system node pool. |
object({
total_min_nodes = number
total_max_nodes = number
})
|
{
"total_max_nodes": 10,
"total_min_nodes": 2
}
| no | +| [system\_node\_pool\_taints](#input\_system\_node\_pool\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "components.gke.io/gke-managed-components",
"value": true
}
]
| no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index a291d58a1a..4c2e049d46 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -128,7 +128,7 @@ variable "enable_persistent_disk_csi" { } variable "enable_parallelstore_csi" { - description = "The status of the Google Compute Engine Parallelstore Container Storage Interface (CSI) driver addon, which allows the usage of a parallelstore as volumes." + description = "The status of the Google Compute Engine Parallelstore Container Storage Interface (CSI) driver addon, which allows the usage of a parallelstore as volumes. " type = bool default = false } From 35ea25423bfa7bda27583d9c6df5a37b48b24ec1 Mon Sep 17 00:00:00 2001 From: abbas1902 Date: Tue, 8 Oct 2024 17:02:55 +0000 Subject: [PATCH 048/102] Added warning for v5 Slurm deployments --- cmd/create.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/cmd/create.go b/cmd/create.go index c20fc5f121..17ec0eb442 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -125,9 +125,27 @@ func expandOrDie(path string) (config.Blueprint, *config.YamlCtx) { // Expand the blueprint checkErr(bp.Expand(), ctx) validateMaybeDie(bp, *ctx) + v5DeprecationWarning(bp) + return bp, ctx } +// TODO: Remove this warning when v5 deprecation is complete +func v5DeprecationWarning(bp config.Blueprint) { + alreadyContainsV5 := false + bp.WalkModulesSafe(func(mp config.ModulePath, m *config.Module) { + if strings.Contains(m.Source, "schedmd-slurm-gcp-v5-controller") && !alreadyContainsV5 { + logging.Info(boldYellow( + "We have been supporting slurm-gcp v5 since July 2022 and are now deprecating it, as we've launched slurm-gcp v6 in June 2024. \n" + + "Toolkit blueprints using Slurm-gcp v5 will be marked “deprecated” starting October 2024 and slurm-gcp v6 will be the default deployment. \n" + + "However we won't begin removing slurm-gcp v5 blueprints until January 6, 2025. Beginning on January 6, 2025, the Cluster Toolkit team will cease their support for Slurm-gcp v5. 
\n" + + "While this will not directly or immediately impact running clusters, we recommend replacing any v5 clusters with Slurm-gcp v6.", + )) + alreadyContainsV5 = true // This is to avoid the logging message showing repeatedly for multiple v5 controllers + } + }) +} + // TODO: move to expand.go func validateMaybeDie(bp config.Blueprint, ctx config.YamlCtx) { err := validators.Execute(bp) From 6d56d9a110c2f3a206f3547514bb78a401d53111 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 8 Oct 2024 20:22:48 +0000 Subject: [PATCH 049/102] SlurmGCP `6.8.0 -> 6.8.1` --- .../schedmd-slurm-gcp-v6-nodeset-dynamic/README.md | 2 +- .../schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 12 ++++++------ .../schedmd-slurm-gcp-v6-controller/controller.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/partition.tf | 4 ++-- .../a3-highgpu-8g/ml-slurm-a3-1-image.yaml | 2 +- .../a3-megagpu-8g/slurm-a3mega-image.yaml | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index 72f4fccb9f..4d790fe703 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -74,7 +74,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Source | Version | |------|--------|---------| -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.0 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index 7e547c3d5f..3f0ee54af8 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -56,7 +56,7 @@ locals { } module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 30f002d68f..a9d801d8c7 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -238,13 +238,13 @@ limitations under the License. 
| [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | | [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.0 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.0 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.1 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.0 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.0 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.0 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.8.0 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.1 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.8.1 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 0148323597..9b105d7f39 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -43,7 +43,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" project_id = var.project_id region = var.region @@ -99,7 +99,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.0" + source = 
"github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.1" access_config = var.enable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index d9cb38ff07..de97810316 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -56,7 +56,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.1" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 9be62f82f7..849844808a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local template module module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" for_each = local.nodeset_map project_id = var.project_id @@ -101,7 +101,7 @@ locals { # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.8.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.8.1" for_each = local.nodeset_tpu_map project_id = var.project_id diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml index c50454739e..b817972331 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -94,7 +94,7 @@ deployment_groups: set -e -o pipefail ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.0 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.1 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index 6ba58f0308..67f33cde7d 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ 
-108,7 +108,7 @@ deployment_groups: apt-get install -y git ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.0 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.1 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml From 0259f4b9fa2213a16d4299ef87df115d785fefd1 Mon Sep 17 00:00:00 2001 From: abbas1902 Date: Tue, 1 Oct 2024 20:52:49 +0000 Subject: [PATCH 050/102] add validation for multi-host tpu --- .../schedmd-slurm-gcp-v6-nodeset-tpu/README.md | 6 +++--- .../schedmd-slurm-gcp-v6-nodeset-tpu/main.tf | 7 +++++++ .../outputs.tf | 18 +++++++++++++++++- .../variables.tf | 15 ++++++++++++--- 4 files changed, 39 insertions(+), 7 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md index fac8a63d44..8db3950334 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md @@ -63,9 +63,9 @@ No resources. | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | | [name](#input\_name) | Name of the nodeset. Automatically populated by the module id if not set.
If setting manually, ensure a unique value across all nodesets. | `string` | n/a | yes | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | -| [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of auto-scaling nodes allowed in this partition. | `number` | `5` | no | -| [node\_count\_static](#input\_node\_count\_static) | Number of nodes to be statically created. | `number` | `0` | no | -| [node\_type](#input\_node\_type) | Specify a node type to base the vm configuration upon it. | `string` | n/a | yes | +| [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of auto-scaling worker nodes allowed in this partition.
For larger TPU machines, there are multiple worker nodes required per machine (1 for every 8 cores).
See https://cloud.google.com/tpu/docs/v4#large-topologies, for more information about these machine types. | `number` | `0` | no | +| [node\_count\_static](#input\_node\_count\_static) | Number of worker nodes to be statically created.
For larger TPU machines, there are multiple worker nodes required per machine (1 for every 8 cores).
See https://cloud.google.com/tpu/docs/v4#large-topologies, for more information about these machine types. | `number` | `0` | no | +| [node\_type](#input\_node\_type) | Specify a node type to base the vm configuration upon it. | `string` | `""` | no | | [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no | | [preserve\_tpu](#input\_preserve\_tpu) | Specify whether TPU-vms will get preserve on suspend, if set to true, on suspend vm is stopped, on false it gets deleted | `bool` | `false` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf index c4e7a08043..ac9b119702 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf @@ -49,4 +49,11 @@ locals { reserved = var.reserved network_storage = var.network_storage } + + node_type_core_count = var.node_type == "" ? 0 : tonumber(regex("-(.*)", var.node_type)[0]) + + accelerator_core_list = var.accelerator_config.topology == "" ? [0, 0] : regexall("\\d+", var.accelerator_config.topology) + accelerator_core_count = length(local.accelerator_core_list) > 2 ? (local.accelerator_core_list[0] * local.accelerator_core_list[1] * local.accelerator_core_list[2]) * 2 : (local.accelerator_core_list[0] * local.accelerator_core_list[1]) * 2 + + tpu_core_count = local.accelerator_core_count == 0 ? local.node_type_core_count : local.accelerator_core_count } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/outputs.tf index 280264f467..8cb7b8663e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/outputs.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/outputs.tf @@ -17,7 +17,23 @@ output "nodeset_tpu" { value = local.nodeset_tpu precondition { - condition = (var.node_type == null) != (var.accelerator_config == { topology : "", version : "" }) + condition = (var.node_type == "") != (var.accelerator_config == { topology : "", version : "" }) error_message = "Either a node_type or an accelerator_config must be provided." } + + precondition { + condition = ((local.tpu_core_count / 8) <= var.node_count_dynamic_max) || ((local.tpu_core_count / 8) <= var.node_count_static) + error_message = <<-EOD + When using TPUs there should be at least one node per every 8 cores. + Currently there are ${local.tpu_core_count} cores but only ${var.node_count_static} static nodes and ${var.node_count_dynamic_max} dynamic nodes. + EOD + } + + precondition { + condition = (var.node_count_dynamic_max % (local.tpu_core_count / 8) == 0) && (var.node_count_static % (local.tpu_core_count / 8) == 0) + error_message = <<-EOD + The number of worker nodes should be a multiple of ${local.tpu_core_count / 8}. + This is to ensure each node has a TPU machine for job scheduling. + EOD + } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf index 30e8d5c177..3302e0ea4c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf @@ -13,15 +13,23 @@ # limitations under the License. 
variable "node_count_static" { - description = "Number of nodes to be statically created." + description = <<-EOD + Number of worker nodes to be statically created. + For larger TPU machines, there are multiple worker nodes required per machine (1 for every 8 cores). + See https://cloud.google.com/tpu/docs/v4#large-topologies, for more information about these machine types. + EOD type = number default = 0 } variable "node_count_dynamic_max" { - description = "Maximum number of auto-scaling nodes allowed in this partition." + description = <<-EOD + Maximum number of auto-scaling worker nodes allowed in this partition. + For larger TPU machines, there are multiple worker nodes required per machine (1 for every 8 cores). + See https://cloud.google.com/tpu/docs/v4#large-topologies, for more information about these machine types. + EOD type = number - default = 5 + default = 0 } variable "name" { @@ -51,6 +59,7 @@ variable "disable_public_ips" { # tflint-ignore: terraform_unused_declarations variable "node_type" { description = "Specify a node type to base the vm configuration upon it." type = string + default = "" } variable "accelerator_config" { From a1ddebd42171e97099f05e45faf6bd1326f4ae5d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Oct 2024 21:29:33 +0000 Subject: [PATCH 051/102] Bump django from 4.2.15 to 4.2.16 in /community/front-end/ofe Bumps [django](https://github.com/django/django) from 4.2.15 to 4.2.16. - [Commits](https://github.com/django/django/compare/4.2.15...4.2.16) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 26756d670c..a9a4a047b4 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -19,7 +19,7 @@ dill==0.3.6 distlib==0.3.6 # django-revproxy==0.11.0 released but not yet in pypi git+https://github.com/jazzband/django-revproxy.git@d2234005135dc0771b7c4e0bb0465664ccfa5787 -Django==4.2.15 +Django==4.2.16 django-allauth==0.54.0 django-extensions==3.2.3 djangorestframework==3.15.2 From 4bb7a2cbb22abf1d3d657e2af72ae005eddd29a3 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Wed, 9 Oct 2024 01:08:28 +0000 Subject: [PATCH 052/102] Support for template files as config added to kueue installation --- modules/management/kubectl-apply/README.md | 2 +- modules/management/kubectl-apply/main.tf | 7 ++++--- modules/management/kubectl-apply/variables.tf | 9 +++++---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/modules/management/kubectl-apply/README.md b/modules/management/kubectl-apply/README.md index bd91e424dc..e2fbe50b65 100644 --- a/modules/management/kubectl-apply/README.md +++ b/modules/management/kubectl-apply/README.md @@ -119,7 +119,7 @@ limitations under the License. | [apply\_manifests](#input\_apply\_manifests) | A list of manifests to apply to GKE cluster using kubectl. For more details see [kubectl module's inputs](kubectl/README.md). |
list(object({
content = optional(string, null)
source = optional(string, null)
template_vars = optional(map(any), null)
server_side_apply = optional(bool, false)
wait_for_rollout = optional(bool, true)
}))
| `[]` | no | | [cluster\_id](#input\_cluster\_id) | An identifier for the gke cluster resource with format projects//locations//clusters/. | `string` | n/a | yes | | [jobset](#input\_jobset) | Install [Jobset](https://github.com/kubernetes-sigs/jobset) which manages a group of K8s [jobs](https://kubernetes.io/docs/concepts/workloads/controllers/job/) as a unit. |
object({
install = optional(bool, false)
version = optional(string, "v0.5.2")
})
| `{}` | no | -| [kueue](#input\_kueue) | Install and configure [Kueue](https://kueue.sigs.k8s.io/docs/overview/) workload scheduler. |
object({
install = optional(bool, false)
version = optional(string, "v0.8.1")
config_path = optional(string, null)
})
| `{}` | no | +| [kueue](#input\_kueue) | Install and configure [Kueue](https://kueue.sigs.k8s.io/docs/overview/) workload scheduler. A configuration yaml/template file can be provided with config\_path to be applied right after kueue installation. If a template file provided, its variables can be set to config\_template\_vars. |
object({
install = optional(bool, false)
version = optional(string, "v0.8.1")
config_path = optional(string, null)
config_template_vars = optional(map(any), null)
})
| `{}` | no | | [project\_id](#input\_project\_id) | The project ID that hosts the gke cluster. | `string` | n/a | yes | ## Outputs diff --git a/modules/management/kubectl-apply/main.tf b/modules/management/kubectl-apply/main.tf index dd68be57f6..5663e01580 100644 --- a/modules/management/kubectl-apply/main.tf +++ b/modules/management/kubectl-apply/main.tf @@ -77,9 +77,10 @@ module "install_jobset" { } module "configure_kueue" { - source = "./kubectl" - source_path = local.install_kueue ? try(var.kueue.config_path, "") : null - depends_on = [module.install_kueue] + source = "./kubectl" + source_path = local.install_kueue ? try(var.kueue.config_path, "") : null + template_vars = local.install_kueue ? try(var.kueue.config_template_vars, null) : null + depends_on = [module.install_kueue] server_side_apply = true wait_for_rollout = true diff --git a/modules/management/kubectl-apply/variables.tf b/modules/management/kubectl-apply/variables.tf index e0dd6430f5..2e0a36603d 100644 --- a/modules/management/kubectl-apply/variables.tf +++ b/modules/management/kubectl-apply/variables.tf @@ -38,11 +38,12 @@ variable "apply_manifests" { } variable "kueue" { - description = "Install and configure [Kueue](https://kueue.sigs.k8s.io/docs/overview/) workload scheduler." + description = "Install and configure [Kueue](https://kueue.sigs.k8s.io/docs/overview/) workload scheduler. A configuration yaml/template file can be provided with config_path to be applied right after kueue installation. If a template file provided, its variables can be set to config_template_vars." type = object({ - install = optional(bool, false) - version = optional(string, "v0.8.1") - config_path = optional(string, null) + install = optional(bool, false) + version = optional(string, "v0.8.1") + config_path = optional(string, null) + config_template_vars = optional(map(any), null) }) default = {} From 102d0c0216f7807da6dfdabfaddf098dd7cad3ce Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 9 Oct 2024 01:20:31 +0000 Subject: [PATCH 053/102] Fix static-check. 
--- pkg/config/config.go | 5 +---- pkg/config/errors.go | 1 + pkg/config/expression.go | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index ef8e8e2290..df2192291f 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -36,10 +36,7 @@ import ( ) const ( - expectedVarFormat string = "$(vars.var_name) or $(module_id.output_name)" - expectedModFormat string = "$(module_id) or $(group_id.module_id)" - unexpectedConnectionKind string = "connectionKind must be useConnection or deploymentConnection" - maxHintDist int = 3 // Maximum Levenshtein distance where we suggest a hint + maxHintDist int = 3 // Maximum Levenshtein distance where we suggest a hint ) // map[moved module path]replacing module path diff --git a/pkg/config/errors.go b/pkg/config/errors.go index d415602f08..1dd976260d 100644 --- a/pkg/config/errors.go +++ b/pkg/config/errors.go @@ -157,6 +157,7 @@ var UnknownModuleSetting = errors.New("a setting was added that is not found in var ModuleSettingWithPeriod = errors.New("a setting name contains a period, which is not supported; variable subfields cannot be set independently in a blueprint.") var ModuleSettingInvalidChar = errors.New("a setting name must begin with a non-numeric character and all characters must be either letters, numbers, dashes ('-') or underscores ('_').") var EmptyGroupName = errors.New("group name must be set for each deployment group") +var UnexpectedRefFormat = errors.New("Expected reference formats: $(vars.var_name) or $(module_id.output_name)") // Error messages const ( diff --git a/pkg/config/expression.go b/pkg/config/expression.go index 68512ec344..3cfeb096d1 100644 --- a/pkg/config/expression.go +++ b/pkg/config/expression.go @@ -69,11 +69,11 @@ func (r Reference) String() string { // and transforms it to "terraform namespace" (e.g. `var.zone` or `module.homefs.mount`). func bpTraversalToTerraform(t hcl.Traversal) (hcl.Traversal, error) { if len(t) < 2 { - return nil, fmt.Errorf(expectedVarFormat) + return nil, UnexpectedRefFormat } _, ok := t[1].(hcl.TraverseAttr) if !ok { - return nil, fmt.Errorf(expectedVarFormat) + return nil, UnexpectedRefFormat } if t.RootName() == "vars" { From fc4d0ed61f1dd992676ac1c898f827e48d2a9e17 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Wed, 9 Oct 2024 15:05:36 +0000 Subject: [PATCH 054/102] document updated --- modules/management/kubectl-apply/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/modules/management/kubectl-apply/README.md b/modules/management/kubectl-apply/README.md index e2fbe50b65..3e3bebecc0 100644 --- a/modules/management/kubectl-apply/README.md +++ b/modules/management/kubectl-apply/README.md @@ -58,6 +58,21 @@ This module simplifies the following functionality: install: true ``` +The `config_path` field in `kueue` installation accepts a template file, too. You will need to provide variables for the template using `config_template_vars` field. + +```yaml + - id: workload_component_install + source: modules/management/kubectl-apply + use: [gke_cluster] + settings: + kueue: + install: true + config_path: $(ghpc_stage("manifests/user-provided-kueue-config.yaml.tftpl")) + config_template_vars: {name: "dev-config", public: "false"} + jobset: + install: true +``` + > **_NOTE:_** > > The `project_id` and `region` settings would be inferred from the deployment variables of the same name, but they are included here for clarity. 
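For illustration, a minimal sketch of a Kueue configuration template that could be passed through `config_path` and rendered with `config_template_vars` is shown below. The file name, the `LocalQueue` object, and the `name`/`public` placeholders are assumptions chosen to line up with the hypothetical values `{name: "dev-config", public: "false"}` used in the README example above; they are not part of this change.

```yaml
# Hypothetical manifests/user-provided-kueue-config.yaml.tftpl (illustrative only).
# ${name} and ${public} are assumed to be substituted from config_template_vars
# before the rendered manifest is applied to the cluster.
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  namespace: default
  name: ${name}                  # e.g. "dev-config"
  labels:
    public-queue: "${public}"    # e.g. "false"
spec:
  clusterQueue: cluster-queue    # assumes a ClusterQueue with this name exists
```

Any valid Kubernetes manifest can be templated this way; presumably every `${...}` placeholder in the template needs a matching key in `config_template_vars` for rendering to succeed.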
From 533e0808915807c90a42fcbc0c5eec56bc99fc2e Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Tue, 8 Oct 2024 19:30:22 +0000 Subject: [PATCH 055/102] upgrading tpg from 5.x to 6.x --- pkg/config/expand.go | 4 ++-- pkg/config/expand_test.go | 4 ++-- .../igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../golden_copies/expectations/igc_pkr/zero/versions.tf | 4 ++-- .../igc_tf/.ghpc/artifacts/expanded_blueprint.yaml | 8 ++++---- .../golden_copies/expectations/igc_tf/one/versions.tf | 4 ++-- .../golden_copies/expectations/igc_tf/zero/versions.tf | 4 ++-- .../merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/merge_flatten/zero/versions.tf | 4 ++-- .../.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/versioned_blueprint/primary/versions.tf | 4 ++-- 11 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 3a8898306d..a58ce74a41 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -199,11 +199,11 @@ func getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { return map[string]TerraformProvider{ "google": { Source: "hashicorp/google", - Version: ">= 4.84.0, < 5.45.0", + Version: ">= 4.84.0, < 6.7.0", Configuration: gglConf}, "google-beta": { Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 5.45.0", + Version: ">= 4.84.0, < 6.7.0", Configuration: gglConf}} } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 40fc192175..59495832d4 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -93,10 +93,10 @@ func (s *zeroSuite) TestExpandProviders(c *C) { c.Check(g.TerraformProviders, DeepEquals, map[string]PR{ "google": TerraformProvider{ Source: "hashicorp/google", - Version: ">= 4.84.0, < 5.45.0"}, + Version: ">= 4.84.0, < 6.7.0"}, "google-beta": TerraformProvider{ Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 5.45.0"}}) + Version: ">= 4.84.0, < 6.7.0"}}) } { // no def PR, group PR diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml index 32d7d818a8..ba265ba2ee 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml @@ -38,14 +38,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf index 6630b9b8c6..3534fd124e 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } } } diff --git 
a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml index 8a160967a2..5736fbba16 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml @@ -44,14 +44,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) @@ -79,14 +79,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index 6630b9b8c6..3534fd124e 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index 6630b9b8c6..3534fd124e 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index 9c97a650eb..c21a1bb32f 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -39,14 +39,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf 
b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf index 6630b9b8c6..3534fd124e 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml index 4e74f8d305..ad79aee614 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml @@ -47,14 +47,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf index 6630b9b8c6..3534fd124e 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } } } From 8a050d72e181f49ec70f6300662a01cf6401578e Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Thu, 12 Sep 2024 15:55:57 +0000 Subject: [PATCH 056/102] Initial commit for gke-a3megagpu integration test --- modules/compute/gke-node-pool/README.md | 22 ++-- .../compute/gke-node-pool/disk_definitions.tf | 4 +- .../gke-integration-test.yml | 118 ++++++++++++++++++ .../daily-tests/builds/gke-a3-megagpu.yaml | 66 ++++++++++ .../daily-tests/tests/gke-a3-megagpu.yml | 43 +++++++ 5 files changed, 240 insertions(+), 13 deletions(-) create mode 100644 tools/cloud-build/daily-tests/ansible_playbooks/gke-integration-test.yml create mode 100644 tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml create mode 100644 tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 03652cf29e..9f86002f8c 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -284,7 +284,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | @@ -295,26 +295,26 @@ limitations under the License. | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | -| [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | +| [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | +| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | -| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | -| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | +| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | +| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | | [total\_max\_nodes](#input\_total\_max\_nodes) | DEPRECATED: Use autoscaling\_total\_max\_nodes. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/disk_definitions.tf b/modules/compute/gke-node-pool/disk_definitions.tf index f7dbebea0a..b5933bf316 100644 --- a/modules/compute/gke-node-pool/disk_definitions.tf +++ b/modules/compute/gke-node-pool/disk_definitions.tf @@ -22,8 +22,8 @@ locals { local_ssd_machines = { - "a3-highgpu-8g" = { local_ssd_count_ephemeral_storage = 16, local_ssd_count_nvme_block = null }, - "a3-megagpu-8g" = { local_ssd_count_ephemeral_storage = 16, local_ssd_count_nvme_block = null }, + "a3-highgpu-8g" = { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = 16 }, + "a3-megagpu-8g" = { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = 16 }, } generated_local_ssd_config = lookup(local.local_ssd_machines, var.machine_type, { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = null }) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/gke-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/gke-integration-test.yml new file mode 100644 index 0000000000..ac3e5a3deb --- /dev/null +++ b/tools/cloud-build/daily-tests/ansible_playbooks/gke-integration-test.yml @@ -0,0 +1,118 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- + +- name: "Setup Integration tests for Cluster Toolkit" + hosts: localhost + tasks: + ## Create SSH Keys + - name: "Create .ssh folder" + ansible.builtin.file: + path: "/builder/home/.ssh" + state: directory + mode: 0700 + + - name: Create SSH Key + community.crypto.openssh_keypair: + path: "/builder/home/.ssh/id_rsa" + + ## Get builder IP address + - name: Get Builder IP + register: build_ip + changed_when: false + args: + executable: /bin/bash + ansible.builtin.shell: | + set -e -o pipefail + dig TXT +short o-o.myaddr.l.google.com @ns1.google.com | \ + awk -F'"' '{print $2}' + + ## Create cluster + - name: Create Deployment Directory + ansible.builtin.include_tasks: + file: tasks/create_deployment_directory.yml + + - name: Create Infrastructure and test + block: + - name: Create Cluster with gcluster + register: deployment + changed_when: deployment.changed + ansible.builtin.command: ./gcluster deploy {{ deployment_name }} --auto-approve + args: + chdir: "{{ workspace }}" + environment: + TF_IN_AUTOMATION: "TRUE" + + ## Cleanup and fail gracefully + rescue: + - name: Capture gcluster stderr + failed_when: false + ansible.builtin.set_fact: + gcluster_stderr: "{{ deployment.stderr | replace('\n',' ') }}" + + - name: Gather logs + ansible.builtin.include_tasks: + file: tasks/gather_startup_script_logs.yml + apply: + delegate_to: localhost + + - name: Include rescue from gcluster failure + ansible.builtin.include_tasks: + file: tasks/rescue_gcluster_failure.yml + apply: + delegate_to: localhost + vars: + deployment_name: "{{ deployment_name }}" + workspace: "{{ workspace }}" + + - name: Trigger failure (rescue blocks otherwise revert failures) + ansible.builtin.fail: + msg: "Failed while setting up test infrastructure" + +- name: Run Integration Tests + hosts: remote_host + vars: + startup_timeout_seconds: 600 # 10 minutes + gather_facts: false + tasks: + - name: Remote Test Block + vars: + ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" + + block: + - name: Include wait for startup script + ansible.builtin.include_tasks: "tasks/wait-for-startup-script.yml" + vars: + timeout_seconds: "{{ startup_timeout_seconds }}" + + - name: Run Integration tests for Cluster Toolkit + ansible.builtin.include_tasks: "{{ test }}" + vars: + remote_node: "{{ remote_node }}" + deployment_name: "{{ deployment_name }}" + custom_vars: "{{ custom_vars }}" + loop: "{{ post_deploy_tests }}" + loop_control: + loop_var: test + + always: + - name: Cleanup firewall and infrastructure + ansible.builtin.include_tasks: + file: tasks/rescue_gcluster_failure.yml + apply: + delegate_to: localhost + vars: + deployment_name: "{{ deployment_name }}" + workspace: "{{ workspace }}" diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml new file mode 100644 index 0000000000..fc16863abd --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml @@ -0,0 +1,66 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +--- +tags: +- m.gke-cluster +- m.gke-node-pool +- m.vpc +- m.multivpc +- m.kubectl-apply +- gke + +timeout: 14400s # 4hr +steps: +- id: gke-a3-megagpu + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + EXAMPLE_BP=examples/gke-a3-megagpu.yaml + + # Replacing the static subnet name to prevent collisions + sed -i "s/gke-subnet-a3-mega/gke-subnet-a3-mega-$${BUILD_ID_SHORT}/" $${EXAMPLE_BP} + + # adding vm to act as remote node + echo ' - id: remote-node' >> $${EXAMPLE_BP} + echo ' source: modules/compute/vm-instance' >> $${EXAMPLE_BP} + echo ' use: [network1]' >> $${EXAMPLE_BP} + echo ' settings:' >> $${EXAMPLE_BP} + echo ' machine_type: e2-standard-2' >> $${EXAMPLE_BP} + echo ' name_prefix: remote-node' >> $${EXAMPLE_BP} + echo ' add_deployment_name_before_prefix: true' >> $${EXAMPLE_BP} + echo '' + echo ' - id: job_template_hostname' >> $${EXAMPLE_BP} + echo ' source: modules/compute/gke-job-template' >> $${EXAMPLE_BP} + echo ' use: [a3_megagpu_pool]' >> $${EXAMPLE_BP} + echo ' settings:' >> $${EXAMPLE_BP} + echo ' name: job-a3mega-test' >> $${EXAMPLE_BP} + echo ' image: nvidia/cuda:11.0.3-runtime-ubuntu20.04' >> $${EXAMPLE_BP} + echo ' command:' >> $${EXAMPLE_BP} + echo ' - nvidia-smi' >> $${EXAMPLE_BP} + echo ' node_count: 1' >> $${EXAMPLE_BP} + echo ' outputs: [instructions]' >> $${EXAMPLE_BP} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml" diff --git a/tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml b/tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml new file mode 100644 index 0000000000..6b305c3410 --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml @@ -0,0 +1,43 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +# region, zone must be defined +# in build file with --extra-vars flag! 
+test_name: gke-a3mega +deployment_name: gke-a3mega-{{ build }} +workspace: /workspace +blueprint_yaml: "{{ workspace }}/examples/gke-a3-megagpu.yaml" +network: "gke-a3mega-net-{{ build }}" +region: us-west4 +zone: us-west4-a +remote_node: "{{ deployment_name }}-remote-node-0" +reservation_affinity: + consume_reservation_type: SPECIFIC_RESERVATION + specific_reservations: + - name: a3mega-reservation-0 + project: "{{ project }}" +cli_deployment_vars: + region: "{{ region }}" + zone: "{{ zone }}" + reservation_affinity: "{{ reservation_affinity }}" + autoscaling_total_max_nodes: 2 + authorized_cidr: "{{ build_ip.stdout }}/32" + network_name: "{{ network }}" + local_ssd_count_nvme_block: 16 +custom_vars: + project: "{{ project }}" +post_deploy_tests: +- test-validation/test-gke-job.yml From 93d7af0cddfc86ebe1c0b887f73b64e364259377 Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Wed, 9 Oct 2024 20:05:51 +0000 Subject: [PATCH 057/102] Integration test now working --- modules/compute/gke-node-pool/README.md | 22 ++-- .../compute/gke-node-pool/disk_definitions.tf | 4 +- .../gke-integration-test.yml | 118 ------------------ .../test-validation/test-gke-job.yml | 4 +- .../daily-tests/builds/gke-a3-megagpu.yaml | 1 - 5 files changed, 15 insertions(+), 134 deletions(-) delete mode 100644 tools/cloud-build/daily-tests/ansible_playbooks/gke-integration-test.yml diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 9f86002f8c..03652cf29e 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -284,7 +284,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | @@ -295,26 +295,26 @@ limitations under the License. | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | -| [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | +| [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | +| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | -| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | -| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | +| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | +| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | | [total\_max\_nodes](#input\_total\_max\_nodes) | DEPRECATED: Use autoscaling\_total\_max\_nodes. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/disk_definitions.tf b/modules/compute/gke-node-pool/disk_definitions.tf index b5933bf316..f7dbebea0a 100644 --- a/modules/compute/gke-node-pool/disk_definitions.tf +++ b/modules/compute/gke-node-pool/disk_definitions.tf @@ -22,8 +22,8 @@ locals { local_ssd_machines = { - "a3-highgpu-8g" = { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = 16 }, - "a3-megagpu-8g" = { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = 16 }, + "a3-highgpu-8g" = { local_ssd_count_ephemeral_storage = 16, local_ssd_count_nvme_block = null }, + "a3-megagpu-8g" = { local_ssd_count_ephemeral_storage = 16, local_ssd_count_nvme_block = null }, } generated_local_ssd_config = lookup(local.local_ssd_machines, var.machine_type, { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = null }) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/gke-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/gke-integration-test.yml deleted file mode 100644 index ac3e5a3deb..0000000000 --- a/tools/cloud-build/daily-tests/ansible_playbooks/gke-integration-test.yml +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- - -- name: "Setup Integration tests for Cluster Toolkit" - hosts: localhost - tasks: - ## Create SSH Keys - - name: "Create .ssh folder" - ansible.builtin.file: - path: "/builder/home/.ssh" - state: directory - mode: 0700 - - - name: Create SSH Key - community.crypto.openssh_keypair: - path: "/builder/home/.ssh/id_rsa" - - ## Get builder IP address - - name: Get Builder IP - register: build_ip - changed_when: false - args: - executable: /bin/bash - ansible.builtin.shell: | - set -e -o pipefail - dig TXT +short o-o.myaddr.l.google.com @ns1.google.com | \ - awk -F'"' '{print $2}' - - ## Create cluster - - name: Create Deployment Directory - ansible.builtin.include_tasks: - file: tasks/create_deployment_directory.yml - - - name: Create Infrastructure and test - block: - - name: Create Cluster with gcluster - register: deployment - changed_when: deployment.changed - ansible.builtin.command: ./gcluster deploy {{ deployment_name }} --auto-approve - args: - chdir: "{{ workspace }}" - environment: - TF_IN_AUTOMATION: "TRUE" - - ## Cleanup and fail gracefully - rescue: - - name: Capture gcluster stderr - failed_when: false - ansible.builtin.set_fact: - gcluster_stderr: "{{ deployment.stderr | replace('\n',' ') }}" - - - name: Gather logs - ansible.builtin.include_tasks: - file: tasks/gather_startup_script_logs.yml - apply: - delegate_to: localhost - - - name: Include rescue from gcluster failure - ansible.builtin.include_tasks: - file: tasks/rescue_gcluster_failure.yml - apply: - delegate_to: localhost - vars: - deployment_name: "{{ deployment_name }}" - workspace: "{{ workspace }}" - - - name: Trigger failure (rescue blocks otherwise revert failures) - ansible.builtin.fail: - msg: "Failed while setting up test infrastructure" - -- name: Run Integration Tests - hosts: remote_host - vars: - startup_timeout_seconds: 600 # 10 minutes - gather_facts: false - tasks: - - name: Remote Test Block - vars: - ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" - - block: - - name: Include wait for startup script - ansible.builtin.include_tasks: "tasks/wait-for-startup-script.yml" - vars: - timeout_seconds: "{{ startup_timeout_seconds }}" - - - name: Run Integration tests for Cluster Toolkit - ansible.builtin.include_tasks: "{{ test }}" - vars: - remote_node: "{{ remote_node }}" - deployment_name: "{{ deployment_name }}" - custom_vars: "{{ custom_vars }}" - loop: "{{ post_deploy_tests }}" - loop_control: - loop_var: test - - always: - - name: Cleanup firewall and infrastructure - ansible.builtin.include_tasks: - file: tasks/rescue_gcluster_failure.yml - apply: - delegate_to: localhost - vars: - deployment_name: "{{ deployment_name }}" - workspace: "{{ workspace }}" diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-job.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-job.yml index f1be62e220..44be3ac853 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-job.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-job.yml @@ -15,12 +15,12 @@ - name: Assert variables are defined ansible.builtin.assert: that: - - cli_deployment_vars.region is defined + - region is defined - custom_vars.project is defined - name: Get cluster credentials for kubectl delegate_to: localhost - ansible.builtin.command: gcloud container clusters get-credentials {{ deployment_name }} --region {{ cli_deployment_vars.region }} --project {{ custom_vars.project }} + ansible.builtin.command: gcloud 
container clusters get-credentials {{ deployment_name }} --region {{ region }} --project {{ custom_vars.project }} - name: Execute the job delegate_to: localhost diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml index fc16863abd..118704e7ea 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml @@ -54,7 +54,6 @@ steps: echo ' source: modules/compute/gke-job-template' >> $${EXAMPLE_BP} echo ' use: [a3_megagpu_pool]' >> $${EXAMPLE_BP} echo ' settings:' >> $${EXAMPLE_BP} - echo ' name: job-a3mega-test' >> $${EXAMPLE_BP} echo ' image: nvidia/cuda:11.0.3-runtime-ubuntu20.04' >> $${EXAMPLE_BP} echo ' command:' >> $${EXAMPLE_BP} echo ' - nvidia-smi' >> $${EXAMPLE_BP} From 5cb64acebcfb136ddbeba2b6919e2677f1aab806 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 9 Oct 2024 20:47:42 +0000 Subject: [PATCH 058/102] Free slurm-gcp v5 hybrid blueprints with the latest cluster toolkit version support --- community/examples/tutorial-starccm-slurm.yaml | 2 ++ docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/community/examples/tutorial-starccm-slurm.yaml b/community/examples/tutorial-starccm-slurm.yaml index d9ad22d1a7..ce8dd0817f 100644 --- a/community/examples/tutorial-starccm-slurm.yaml +++ b/community/examples/tutorial-starccm-slurm.yaml @@ -15,6 +15,8 @@ --- blueprint_name: starccm-on-slurm +toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit +toolkit_modules_version: v1.40.0 vars: project_id: ## Set GCP Project ID Here ## diff --git a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml index 45312348ed..0220352d35 100644 --- a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml +++ b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml @@ -15,6 +15,8 @@ --- blueprint_name: hpc-cluster-hybrid-v5 +toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit +toolkit_modules_version: v1.40.0 vars: project_id: ## <> From 6e684c1eb9377a6b1d21bd01d8eaae9df1530dd5 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Wed, 9 Oct 2024 21:28:37 +0000 Subject: [PATCH 059/102] updating docs for v_blueprint feature --- examples/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/README.md b/examples/README.md index ec386515e6..0275fd930c 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1772,7 +1772,7 @@ scratch. --- blueprint_name: # boilerplate-blueprint toolkit_modules_url: # github.com/GoogleCloudPlatform/cluster-toolkit -toolkit_modules_version: # v1.15.0 +toolkit_modules_version: # v1.38.0 vars: project_id: # my-project-id @@ -1796,7 +1796,7 @@ deployment_groups: 63 characters long, and can only contain lowercase letters, numeric characters, underscores and dashes. -* **toolkit_modules_url** and **toolkit_modules_version** (optional): The blueprint schema provides the optional fields `toolkit_modules_url` and `toolkit_modules_version` to version a blueprint. When these fields are provided, any module in the blueprint with a reference to an embedded module in its source field will be updated to reference the specified GitHub source and toolkit version in the expanded blueprint. 
`toolkit_modules_url` specifies the base URL of the GitHub repository containing the modules and `toolkit_modules_version` specifies the version of the modules to use. `toolkit_modules_url` and `toolkit_modules_version` should be provided together when in use. +* **toolkit_modules_url** and **toolkit_modules_version** (optional): The blueprint schema provides the optional fields `toolkit_modules_url` and `toolkit_modules_version` to version a blueprint. When these fields are provided, any module in the blueprint with a reference to an embedded module in its source field will be updated to reference the specified GitHub source and toolkit version in the deployment folder. `toolkit_modules_url` specifies the base URL of the GitHub repository containing the modules and `toolkit_modules_version` specifies the version of the modules to use. `toolkit_modules_url` and `toolkit_modules_version` should be provided together when in use. ### Deployment Variables From 0fed0411ee26426d977c9ef569c08fc70b48b5c1 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Thu, 10 Oct 2024 00:13:26 +0000 Subject: [PATCH 060/102] update version info --- modules/file-system/gke-persistent-volume/variables.tf | 2 +- modules/file-system/gke-storage/README.md | 2 +- modules/file-system/gke-storage/variables.tf | 2 +- modules/file-system/gke-storage/versions.tf | 2 +- modules/scheduler/gke-cluster/variables.tf | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/file-system/gke-persistent-volume/variables.tf b/modules/file-system/gke-persistent-volume/variables.tf index 80e21d0b8f..a72fa3857f 100644 --- a/modules/file-system/gke-persistent-volume/variables.tf +++ b/modules/file-system/gke-persistent-volume/variables.tf @@ -57,6 +57,6 @@ variable "capacity_gb" { } variable "labels" { - description = "GCE resource labels to be applied to resources. Key-value pairs. " + description = "GCE resource labels to be applied to resources. Key-value pairs." type = map(string) } diff --git a/modules/file-system/gke-storage/README.md b/modules/file-system/gke-storage/README.md index 1a63731e4c..c578a4a0d8 100644 --- a/modules/file-system/gke-storage/README.md +++ b/modules/file-system/gke-storage/README.md @@ -112,7 +112,7 @@ No resources. | [cluster\_id](#input\_cluster\_id) | An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}` | `string` | n/a | yes | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [mount\_options](#input\_mount\_options) | Controls the mountOptions for dynamically provisioned PersistentVolumes of this storage class. | `string` | `null` | no | -| [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection .
If using new VPC, please use community/modules/network/private-service-access to create private-service-access and
If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). | `string` | `null` | no | +| [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection.
If using new VPC, please use community/modules/network/private-service-access to create private-service-access and
If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). | `string` | `null` | no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [pv\_mount\_path](#input\_pv\_mount\_path) | Path within the container at which the volume should be mounted. Must not contain ':'. | `string` | `"/data"` | no | | [pvc\_count](#input\_pvc\_count) | How many PersistentVolumeClaims that will be created | `number` | `1` | no | diff --git a/modules/file-system/gke-storage/variables.tf b/modules/file-system/gke-storage/variables.tf index 3fd672699f..97ff1af21b 100644 --- a/modules/file-system/gke-storage/variables.tf +++ b/modules/file-system/gke-storage/variables.tf @@ -125,7 +125,7 @@ variable "capacity_gb" { variable "private_vpc_connection_peering" { description = <<-EOT - The name of the VPC Network peering connection . + The name of the VPC Network peering connection. If using new VPC, please use community/modules/network/private-service-access to create private-service-access and If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). EOT diff --git a/modules/file-system/gke-storage/versions.tf b/modules/file-system/gke-storage/versions.tf index 0a1082c515..78d62b235d 100644 --- a/modules/file-system/gke-storage/versions.tf +++ b/modules/file-system/gke-storage/versions.tf @@ -16,6 +16,6 @@ terraform { required_version = ">= 1.0" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.39.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.40.0" } } diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index 4c2e049d46..a291d58a1a 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -128,7 +128,7 @@ variable "enable_persistent_disk_csi" { } variable "enable_parallelstore_csi" { - description = "The status of the Google Compute Engine Parallelstore Container Storage Interface (CSI) driver addon, which allows the usage of a parallelstore as volumes. " + description = "The status of the Google Compute Engine Parallelstore Container Storage Interface (CSI) driver addon, which allows the usage of a parallelstore as volumes." 
type = bool default = false } From 8e8656f64bb5f5cabb2fc5fbbb310178aa998475 Mon Sep 17 00:00:00 2001 From: ChengcongDu Date: Thu, 10 Oct 2024 21:57:23 +0000 Subject: [PATCH 061/102] upgrade local tf-doc version and redo doc gen --- modules/compute/gke-node-pool/outputs.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/outputs.tf b/modules/compute/gke-node-pool/outputs.tf index 7bcd0c6361..77088a7641 100644 --- a/modules/compute/gke-node-pool/outputs.tf +++ b/modules/compute/gke-node-pool/outputs.tf @@ -84,7 +84,7 @@ locals { NCCL test can be initiated from any one of the sample job Pods and coordinate with the peer Pods: export POD_NAME=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}' | head -n 1) export PEER_POD_IPS=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.status.podIP}}{{" "}}{{end}}') - kubectl exec --stdin --tty --container=nccl-test $POD_NAME -- /scripts/allgather.sh $PEER_POD_IPS + kubectl exec --stdin --tty --container=nccl-test $POD_NAME -- /scripts/allgather.sh $PEER_POD_IPS If you would like to enable GPUDirect for your own workload, please follow the below steps: export WORKLOAD_PATH=<> From 756908116ce8536bc5e6547b1dff95ac2f044ddb Mon Sep 17 00:00:00 2001 From: ChengcongDu Date: Thu, 10 Oct 2024 21:58:04 +0000 Subject: [PATCH 062/102] upgrade local tf-doc version and redo doc gen --- modules/compute/gke-node-pool/outputs.tf | 2 +- modules/compute/gke-node-pool/variables.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/compute/gke-node-pool/outputs.tf b/modules/compute/gke-node-pool/outputs.tf index 77088a7641..7bcd0c6361 100644 --- a/modules/compute/gke-node-pool/outputs.tf +++ b/modules/compute/gke-node-pool/outputs.tf @@ -84,7 +84,7 @@ locals { NCCL test can be initiated from any one of the sample job Pods and coordinate with the peer Pods: export POD_NAME=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}' | head -n 1) export PEER_POD_IPS=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.status.podIP}}{{" "}}{{end}}') - kubectl exec --stdin --tty --container=nccl-test $POD_NAME -- /scripts/allgather.sh $PEER_POD_IPS + kubectl exec --stdin --tty --container=nccl-test $POD_NAME -- /scripts/allgather.sh $PEER_POD_IPS If you would like to enable GPUDirect for your own workload, please follow the below steps: export WORKLOAD_PATH=<> diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index ef1277744f..825c1c72d0 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -15,7 +15,7 @@ */ variable "project_id" { - description = "The project ID to host the cluster in." + description = "The project ID to host the cluster in. 
" type = string } From 088a9a749508ea7d06c08b4c68fb6b32c444d8ac Mon Sep 17 00:00:00 2001 From: ChengcongDu Date: Thu, 10 Oct 2024 22:43:39 +0000 Subject: [PATCH 063/102] upgrade local tf-doc version and redo doc gen --- modules/compute/gke-node-pool/README.md | 22 +++++++++++----------- modules/compute/gke-node-pool/variables.tf | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 78c4dd1dd7..fcf7414af6 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -284,7 +284,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | @@ -294,26 +294,26 @@ limitations under the License. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | -| [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | +| [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | +| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | -| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | -| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | +| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | +| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | | [total\_max\_nodes](#input\_total\_max\_nodes) | DEPRECATED: Use autoscaling\_total\_max\_nodes. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 825c1c72d0..ef1277744f 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -15,7 +15,7 @@ */ variable "project_id" { - description = "The project ID to host the cluster in. " + description = "The project ID to host the cluster in." type = string } From c09fe01595314cec275e8beec8bcfc11a542dcfc Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 8 Oct 2024 19:32:59 +0000 Subject: [PATCH 064/102] Implement xpk-gke-a3-megagpu blueprint --- .../config-map.yaml.tftpl | 6 + .../kueue-credentials.yaml.tftpl | 73 +++++++++++ community/examples/xpk-gke-a3-megagpu.yaml | 118 ++++++++++++++++++ .../manifests/schedule-daemon.yaml | 4 + .../{resource-policy => }/README.md | 0 .../{resource-policy => }/main.tf | 0 .../{resource-policy => }/metadata.yaml | 0 .../{resource-policy => }/outputs.tf | 0 .../{resource-policy => }/variables.tf | 0 .../{resource-policy => }/versions.tf | 0 10 files changed, 201 insertions(+) create mode 100644 community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl create mode 100644 community/examples/xpk-gke-a3-megagpu-files/kueue-credentials.yaml.tftpl create mode 100644 community/examples/xpk-gke-a3-megagpu.yaml rename modules/compute/resource-policy/{resource-policy => }/README.md (100%) rename modules/compute/resource-policy/{resource-policy => }/main.tf (100%) rename modules/compute/resource-policy/{resource-policy => }/metadata.yaml (100%) rename modules/compute/resource-policy/{resource-policy => }/outputs.tf (100%) rename modules/compute/resource-policy/{resource-policy => }/variables.tf (100%) rename modules/compute/resource-policy/{resource-policy => }/versions.tf (100%) diff --git a/community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl b/community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl new file mode 100644 index 0000000000..100058b7be --- /dev/null +++ b/community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl @@ -0,0 +1,6 @@ +kind: ConfigMap +apiVersion: v1 +metadata: + name: ${name} +data: + h100-mega-80gb-8: ${num_nodes} diff --git a/community/examples/xpk-gke-a3-megagpu-files/kueue-credentials.yaml.tftpl b/community/examples/xpk-gke-a3-megagpu-files/kueue-credentials.yaml.tftpl new file mode 100644 index 0000000000..326cea0e54 --- /dev/null +++ b/community/examples/xpk-gke-a3-megagpu-files/kueue-credentials.yaml.tftpl @@ -0,0 +1,73 @@ +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ResourceFlavor +metadata: + name: 1xh100-mega-80gb-8 +spec: + nodeLabels: + cloud.google.com/gke-accelerator: nvidia-h100-mega-80gb +--- + +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ClusterQueue +metadata: + name: cluster-queue +spec: + preemption: + reclaimWithinCohort: Never # Don't preempt other queues in the cohort. + withinClusterQueue: LowerPriority + namespaceSelector: {} # match all. 
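  # Editorial note (sketch, not part of the original template): ${num_chips} below is a
  # template variable rendered by the kubectl-apply module; the xpk blueprint supplies its
  # value (16 in this patch, later raised to 32 in this series to match 4 a3-megagpu-8g
  # nodes with 8 GPUs each).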
+ resourceGroups: + - coveredResources: ["nvidia.com/gpu"] + flavors: + - name: 1xh100-mega-80gb-8 + resources: + - name: "nvidia.com/gpu" + nominalQuota: ${num_chips} +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + namespace: default + name: multislice-queue +spec: + clusterQueue: cluster-queue +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: very-low +value: 100 +globalDefault: false +description: "Very Low" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: low +value: 250 +globalDefault: false +description: "Low" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: medium +value: 500 +globalDefault: false +description: "Medium" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: high +value: 750 +globalDefault: false +description: "High" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: very-high +value: 1000 +globalDefault: false +description: "Very High" diff --git a/community/examples/xpk-gke-a3-megagpu.yaml b/community/examples/xpk-gke-a3-megagpu.yaml new file mode 100644 index 0000000000..999a52ea7b --- /dev/null +++ b/community/examples/xpk-gke-a3-megagpu.yaml @@ -0,0 +1,118 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: xpk-gke-a3-megagpu + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: xpk-gke-a3-megagpu + region: us-central1 + zone: us-central1-c + + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. + authorized_cidr: /32 + +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/vpc + settings: + subnetwork_name: xpk-gke-a3-megagpu-subnet + secondary_ranges: + xpk-gke-a3-megagpu-subnet: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: gpunets + source: modules/network/multivpc + settings: + network_name_prefix: $(vars.deployment_name)-gpunet + global_ip_address_range: 192.169.0.0/16 + network_count: 4 + subnetwork_cidr_suffix: 24 + + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network1, gpunets] + settings: + enable_private_endpoint: false # Allows for access from authorized public IPs + master_authorized_networks: + - cidr_block: $(vars.authorized_cidr) # Allows your machine run kubectl command. It's required for the multi-network setup. + display_name: "kubectl-access-network" + #min_master_version: "1.29.6-gke.1326000" # (TODO: Ask MaxText the reason to set this) Couldn't find this version in the valid master versions in australia-southeast1-c. Can be left unset to be set by GKE to the version of the most recent official release. 
+ system_node_pool_machine_type: "e2-standard-32" + outputs: [instructions] + + - id: group_placement_0 + source: modules/compute/resource-policy + settings: + name: xpk-gke-a3-megagpu-gp-np-0 + group_placement_max_distance: 2 + + - id: group_placement_1 + source: modules/compute/resource-policy + settings: + name: xpk-gke-a3-megagpu-gp-np-0 + group_placement_max_distance: 2 + + - id: a3_megagpu_pool_0 + source: modules/compute/gke-node-pool + use: [gke_cluster, gpunets, group_placement_0] + settings: + machine_type: a3-megagpu-8g + autoscaling_total_min_nodes: 1 + initial_node_count: 1 + zones: [$(vars.zone)] + host_maintenance_interval: PERIODIC + outputs: [instructions] + + - id: a3_megagpu_pool_1 + source: modules/compute/gke-node-pool + use: [gke_cluster, gpunets, group_placement_1] + settings: + machine_type: a3-megagpu-8g + autoscaling_total_min_nodes: 1 + initial_node_count: 1 + zones: [$(vars.zone)] + host_maintenance_interval: PERIODIC + outputs: [instructions] + + - id: workload_manager_install + source: modules/management/kubectl-apply + use: [gke_cluster] + settings: + kueue: + install: true + jobset: + install: true + + - id: topology_aware_scheduler_install + source: community/modules/compute/gke-topology-scheduler + use: [gke_cluster] + + - id: workload_manager_config + source: modules/management/kubectl-apply + use: [gke_cluster] + settings: + apply_manifests: + - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/config-map.yaml.tftpl + template_vars: {name: "xpk-gke-a3-megagpu-configmap", num_nodes: "2"} + - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/kueue-credentials.yaml.tftpl + template_vars: {num_chips: "16"} diff --git a/community/modules/compute/gke-topology-scheduler/manifests/schedule-daemon.yaml b/community/modules/compute/gke-topology-scheduler/manifests/schedule-daemon.yaml index b412f936e9..9c9a4ab929 100644 --- a/community/modules/compute/gke-topology-scheduler/manifests/schedule-daemon.yaml +++ b/community/modules/compute/gke-topology-scheduler/manifests/schedule-daemon.yaml @@ -33,6 +33,10 @@ spec: - key: "node-role.kubernetes.io/control-plane" operator: "Exists" effect: "NoSchedule" + - key: components.gke.io/gke-managed-components + value: "true" + operator: Equal + effect: NoSchedule containers: - name: topology-scheduler-container image: python:3.9 diff --git a/modules/compute/resource-policy/resource-policy/README.md b/modules/compute/resource-policy/README.md similarity index 100% rename from modules/compute/resource-policy/resource-policy/README.md rename to modules/compute/resource-policy/README.md diff --git a/modules/compute/resource-policy/resource-policy/main.tf b/modules/compute/resource-policy/main.tf similarity index 100% rename from modules/compute/resource-policy/resource-policy/main.tf rename to modules/compute/resource-policy/main.tf diff --git a/modules/compute/resource-policy/resource-policy/metadata.yaml b/modules/compute/resource-policy/metadata.yaml similarity index 100% rename from modules/compute/resource-policy/resource-policy/metadata.yaml rename to modules/compute/resource-policy/metadata.yaml diff --git a/modules/compute/resource-policy/resource-policy/outputs.tf b/modules/compute/resource-policy/outputs.tf similarity index 100% rename from modules/compute/resource-policy/resource-policy/outputs.tf rename to modules/compute/resource-policy/outputs.tf diff --git a/modules/compute/resource-policy/resource-policy/variables.tf b/modules/compute/resource-policy/variables.tf similarity index 100% rename from 
modules/compute/resource-policy/resource-policy/variables.tf rename to modules/compute/resource-policy/variables.tf diff --git a/modules/compute/resource-policy/resource-policy/versions.tf b/modules/compute/resource-policy/versions.tf similarity index 100% rename from modules/compute/resource-policy/resource-policy/versions.tf rename to modules/compute/resource-policy/versions.tf From 1911eb464d9d3b765070c7097c8e1f8471c7a5f9 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Wed, 9 Oct 2024 00:45:17 +0000 Subject: [PATCH 065/102] kueue config file renamed --- ...ntials.yaml.tftpl => kueue-xpk-configuration.yaml.tftpl} | 0 community/examples/xpk-gke-a3-megagpu.yaml | 2 +- modules/compute/gke-node-pool/outputs.tf | 6 +++--- 3 files changed, 4 insertions(+), 4 deletions(-) rename community/examples/xpk-gke-a3-megagpu-files/{kueue-credentials.yaml.tftpl => kueue-xpk-configuration.yaml.tftpl} (100%) diff --git a/community/examples/xpk-gke-a3-megagpu-files/kueue-credentials.yaml.tftpl b/community/examples/xpk-gke-a3-megagpu-files/kueue-xpk-configuration.yaml.tftpl similarity index 100% rename from community/examples/xpk-gke-a3-megagpu-files/kueue-credentials.yaml.tftpl rename to community/examples/xpk-gke-a3-megagpu-files/kueue-xpk-configuration.yaml.tftpl diff --git a/community/examples/xpk-gke-a3-megagpu.yaml b/community/examples/xpk-gke-a3-megagpu.yaml index 999a52ea7b..a5075d8339 100644 --- a/community/examples/xpk-gke-a3-megagpu.yaml +++ b/community/examples/xpk-gke-a3-megagpu.yaml @@ -114,5 +114,5 @@ deployment_groups: apply_manifests: - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/config-map.yaml.tftpl template_vars: {name: "xpk-gke-a3-megagpu-configmap", num_nodes: "2"} - - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/kueue-credentials.yaml.tftpl + - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/kueue-xpk-configuration.yaml.tftpl template_vars: {num_chips: "16"} diff --git a/modules/compute/gke-node-pool/outputs.tf b/modules/compute/gke-node-pool/outputs.tf index 8be6a2772a..75b63572b1 100644 --- a/modules/compute/gke-node-pool/outputs.tf +++ b/modules/compute/gke-node-pool/outputs.tf @@ -73,9 +73,9 @@ locals { } gpu_direct_instruction = <<-EOT Since you are using ${var.machine_type} machine type that has GPUDirect support, your nodepool had been configured with the required plugins. - To fully utilize GPUDirect you will need to add the some components into your workload manifest. Details below: + To fully utilize GPUDirect you will need to add some components into your workload manifest. 
Details below: - A sample GKE job that had GPUDirect enabled and NCCL test included has been generated locally at: + A sample GKE job that has GPUDirect enabled and NCCL test included has been generated locally at: ${abspath(local.gpu_direct_setting.updated_workload_path)} You can use the following commands to submit the sample job: @@ -85,7 +85,7 @@ locals { export WORKLOAD_PATH=<> python3 ${abspath("${path.module}/gpu-direct-workload/scripts/${lookup(local.script_path, var.machine_type, "")}")} --file $WORKLOAD_PATH --rxdm ${local.gpu_direct_setting.rxdm_version} **WARNING** - The "--rxdm" version is tide to the nccl-tcpx/o-installer that had been deployed to your cluster, changing it to other value might have impact on performance + The "--rxdm" version is tied to the nccl-tcpx/o-installer that had been deployed to your cluster, changing it to other value might have impact on performance **WARNING** Or you can also follow our GPUDirect user guide to update your workload From d9e131e6ab26e41146f6879ef9105da840a4c1fe Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Thu, 10 Oct 2024 12:01:24 +0000 Subject: [PATCH 066/102] kueue config set using a template file --- community/examples/xpk-gke-a3-megagpu.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/community/examples/xpk-gke-a3-megagpu.yaml b/community/examples/xpk-gke-a3-megagpu.yaml index a5075d8339..35ac0df71c 100644 --- a/community/examples/xpk-gke-a3-megagpu.yaml +++ b/community/examples/xpk-gke-a3-megagpu.yaml @@ -63,13 +63,13 @@ deployment_groups: - id: group_placement_0 source: modules/compute/resource-policy settings: - name: xpk-gke-a3-megagpu-gp-np-0 + name: $(vars.deployment_name)-gp-np-0 group_placement_max_distance: 2 - id: group_placement_1 source: modules/compute/resource-policy settings: - name: xpk-gke-a3-megagpu-gp-np-0 + name: $(vars.deployment_name)-gp-np-0 group_placement_max_distance: 2 - id: a3_megagpu_pool_0 @@ -94,12 +94,14 @@ deployment_groups: host_maintenance_interval: PERIODIC outputs: [instructions] - - id: workload_manager_install + - id: workload_component_install source: modules/management/kubectl-apply use: [gke_cluster] settings: kueue: install: true + config_path: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/kueue-xpk-configuration.yaml.tftpl + config_template_vars: {num_chips: "16"} jobset: install: true @@ -107,12 +109,10 @@ deployment_groups: source: community/modules/compute/gke-topology-scheduler use: [gke_cluster] - - id: workload_manager_config + - id: workload_configmap source: modules/management/kubectl-apply use: [gke_cluster] settings: apply_manifests: - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/config-map.yaml.tftpl template_vars: {name: "xpk-gke-a3-megagpu-configmap", num_nodes: "2"} - - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/kueue-xpk-configuration.yaml.tftpl - template_vars: {num_chips: "16"} From 870058336eba1096583b87051b4c41b8256e80d4 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 11 Oct 2024 15:49:15 +0000 Subject: [PATCH 067/102] configmap variable fixed --- .../config-map.yaml.tftpl | 2 +- community/examples/xpk-gke-a3-megagpu.yaml | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl b/community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl index 100058b7be..900d30729c 100644 --- a/community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl +++ 
b/community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl @@ -3,4 +3,4 @@ apiVersion: v1 metadata: name: ${name} data: - h100-mega-80gb-8: ${num_nodes} + h100-mega-80gb-8: "${num_nodes}" diff --git a/community/examples/xpk-gke-a3-megagpu.yaml b/community/examples/xpk-gke-a3-megagpu.yaml index 35ac0df71c..377bf63b83 100644 --- a/community/examples/xpk-gke-a3-megagpu.yaml +++ b/community/examples/xpk-gke-a3-megagpu.yaml @@ -45,7 +45,7 @@ deployment_groups: settings: network_name_prefix: $(vars.deployment_name)-gpunet global_ip_address_range: 192.169.0.0/16 - network_count: 4 + network_count: 8 subnetwork_cidr_suffix: 24 - id: gke_cluster @@ -76,9 +76,10 @@ deployment_groups: source: modules/compute/gke-node-pool use: [gke_cluster, gpunets, group_placement_0] settings: + name: a3-megagpu-pool-0 machine_type: a3-megagpu-8g - autoscaling_total_min_nodes: 1 - initial_node_count: 1 + autoscaling_total_min_nodes: 2 + initial_node_count: 2 zones: [$(vars.zone)] host_maintenance_interval: PERIODIC outputs: [instructions] @@ -87,9 +88,10 @@ deployment_groups: source: modules/compute/gke-node-pool use: [gke_cluster, gpunets, group_placement_1] settings: + name: a3-megagpu-pool-1 machine_type: a3-megagpu-8g - autoscaling_total_min_nodes: 1 - initial_node_count: 1 + autoscaling_total_min_nodes: 2 + initial_node_count: 2 zones: [$(vars.zone)] host_maintenance_interval: PERIODIC outputs: [instructions] @@ -101,7 +103,7 @@ deployment_groups: kueue: install: true config_path: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/kueue-xpk-configuration.yaml.tftpl - config_template_vars: {num_chips: "16"} + config_template_vars: {num_chips: "32"} jobset: install: true @@ -115,4 +117,4 @@ deployment_groups: settings: apply_manifests: - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/config-map.yaml.tftpl - template_vars: {name: "xpk-gke-a3-megagpu-configmap", num_nodes: "2"} + template_vars: {name: "xpk-gke-a3-megagpu-resources-configmap", num_nodes: "4"} From 1fbd810e9383901aeba10a107cd1e2ceb09e2685 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 11 Oct 2024 20:16:36 +0000 Subject: [PATCH 068/102] deployment group name fixed --- community/examples/xpk-gke-a3-megagpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/examples/xpk-gke-a3-megagpu.yaml b/community/examples/xpk-gke-a3-megagpu.yaml index 377bf63b83..21dda328df 100644 --- a/community/examples/xpk-gke-a3-megagpu.yaml +++ b/community/examples/xpk-gke-a3-megagpu.yaml @@ -69,7 +69,7 @@ deployment_groups: - id: group_placement_1 source: modules/compute/resource-policy settings: - name: $(vars.deployment_name)-gp-np-0 + name: $(vars.deployment_name)-gp-np-1 group_placement_max_distance: 2 - id: a3_megagpu_pool_0 From 79f32bc5b8871b51d9e2051e2a361c5e2919ef0a Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 11 Oct 2024 20:52:59 +0000 Subject: [PATCH 069/102] public cluster disabled --- community/examples/xpk-gke-a3-megagpu.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/community/examples/xpk-gke-a3-megagpu.yaml b/community/examples/xpk-gke-a3-megagpu.yaml index 21dda328df..7830b4446a 100644 --- a/community/examples/xpk-gke-a3-megagpu.yaml +++ b/community/examples/xpk-gke-a3-megagpu.yaml @@ -52,11 +52,9 @@ deployment_groups: source: modules/scheduler/gke-cluster use: [network1, gpunets] settings: - enable_private_endpoint: false # Allows for access from authorized public IPs master_authorized_networks: - cidr_block: $(vars.authorized_cidr) # Allows your machine run kubectl command. 
It's required for the multi-network setup. display_name: "kubectl-access-network" - #min_master_version: "1.29.6-gke.1326000" # (TODO: Ask MaxText the reason to set this) Couldn't find this version in the valid master versions in australia-southeast1-c. Can be left unset to be set by GKE to the version of the most recent official release. system_node_pool_machine_type: "e2-standard-32" outputs: [instructions] From 0ea478daf01b7ef49214a1cdcbd95418ebf3314e Mon Sep 17 00:00:00 2001 From: Oriol Vilarrubi Date: Wed, 28 Aug 2024 17:53:11 +0200 Subject: [PATCH 070/102] Use sackd for the login nodes Substitute slurmd for the sackd daemon, this way an x-login partition is not needed. --- .../modules/slurm_files/scripts/conf.py | 22 +------------------ .../modules/slurm_files/scripts/setup.py | 14 +++++------- .../modules/slurm_files/scripts/slurmsync.py | 2 +- 3 files changed, 8 insertions(+), 30 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index 29b4076056..120ae7f1e8 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -27,7 +27,6 @@ # This file is managed by a script. Manual modifications will be overwritten. """ -login_nodeset = "x-login" def dict_to_conf(conf, delim=" ") -> str: @@ -130,24 +129,6 @@ def get(key, default): return dict_to_conf(conf_options, delim="\n") -def loginlines() -> str: - nodeset = { - "NodeSet": login_nodeset, - "Feature": login_nodeset, - } - partition = { - "PartitionName": login_nodeset, - "Nodes": login_nodeset, - "State": "UP", - "DefMemPerCPU": 1, - "Hidden": "YES", - "RootOnly": "YES", - } - lines = [ - dict_to_conf(nodeset), - dict_to_conf(partition), - ] - return "\n".join(lines) def nodeset_lines(nodeset, lkp: util.Lookup) -> str: @@ -254,7 +235,7 @@ def suspend_exc_lines(lkp: util.Lookup) -> Iterable[str]: for p in lkp.cfg.partitions.values() if len(p.partition_nodeset_dyn) > 0 ] - suspend_exc_parts = {"SuspendExcParts": [login_nodeset, *dyn_parts]} + suspend_exc_parts = {"SuspendExcParts": [*dyn_parts]} return filter( None, @@ -270,7 +251,6 @@ def make_cloud_conf(lkp: util.Lookup) -> str: lines = [ FILE_PREAMBLE, conflines(lkp), - loginlines(), *(nodeset_lines(n, lkp) for n in lkp.cfg.nodeset.values()), *(nodeset_dyn_lines(n) for n in lkp.cfg.nodeset_dyn.values()), *(nodeset_tpu_lines(n, lkp) for n in lkp.cfg.nodeset_tpu.values()), diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index 589cfeadef..5e3d8b9542 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -366,27 +366,25 @@ def setup_login(): slurmctld_host = f"{lookup().control_host}" if lookup().control_addr: slurmctld_host = f"{lookup().control_host}({lookup().control_addr})" - slurmd_options = [ + sackd_options = [ f'--conf-server="{slurmctld_host}:{lookup().control_host_port}"', - f'--conf="Feature={conf.login_nodeset}"', - "-Z", ] - sysconf = f"""SLURMD_OPTIONS='{" ".join(slurmd_options)}'""" - update_system_config("slurmd", sysconf) + sysconf = 
f"""SACKD_OPTIONS='{" ".join(sackd_options)}'""" + update_system_config("sackd", sysconf) install_custom_scripts() setup_network_storage() setup_sudoers() run("systemctl restart munge") - run("systemctl enable slurmd", timeout=30) - run("systemctl restart slurmd", timeout=30) + run("systemctl enable sackd", timeout=30) + run("systemctl restart sackd", timeout=30) run("systemctl enable --now slurmcmd.timer", timeout=30) run_custom_scripts() log.info("Check status of cluster services") run("systemctl status munge", timeout=30) - run("systemctl status slurmd", timeout=30) + run("systemctl status sackd", timeout=30) log.info("Done setting up login") diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 0d5f0e6798..5975d68e20 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -428,7 +428,7 @@ def reconfigure_slurm(): log.exception("failed to reconfigure slurmctld") util.run(f"wall '{update_msg}'", timeout=30) log.debug("Done.") - elif lookup().instance_role_safe in ["compute", "login"]: + elif lookup().instance_role_safe == "compute": log.info("Restarting slurmd to make changes take effect.") run("systemctl restart slurmd") util.run(f"wall '{update_msg}'", timeout=30) From b59f80c204950ef89e3b5c9b545441b042107bb8 Mon Sep 17 00:00:00 2001 From: Oriol Vilarrubi Date: Thu, 29 Aug 2024 22:44:49 +0200 Subject: [PATCH 071/102] Add sackd automatic restart in reconfigure --- .../modules/slurm_files/scripts/setup.py | 1 + .../modules/slurm_files/scripts/slurmsync.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index 5e3d8b9542..37532f6285 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -216,6 +216,7 @@ def setup_sudoers(): content = """ # Allow SlurmUser to manage the slurm daemons slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmd.service +slurm ALL= NOPASSWD: /usr/bin/systemctl restart sackd.service slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmctld.service """ sudoers_file = Path("/etc/sudoers.d/slurm") diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 5975d68e20..112e2d5748 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -433,6 +433,11 @@ def reconfigure_slurm(): run("systemctl restart slurmd") util.run(f"wall '{update_msg}'", timeout=30) log.debug("Done.") + elif lookup().instance_role_safe == "login": + log.info("Restarting sackd to make changes take effect.") + run("systemctl restart sackd") + util.run(f"wall '{update_msg}'", timeout=30) + log.debug("Done.") def update_topology(lkp: util.Lookup) -> None: From 
12c68cb28c1ebd79675c2481dede77164ba00d1c Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 11 Oct 2024 21:37:56 +0000 Subject: [PATCH 072/102] gke-node-pool default name conflict fixed --- modules/compute/gke-node-pool/README.md | 5 ++++- modules/compute/gke-node-pool/main.tf | 6 +++++- modules/compute/gke-node-pool/variables.tf | 2 +- modules/compute/gke-node-pool/versions.tf | 4 ++++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 03652cf29e..ec0365556e 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -248,6 +248,7 @@ limitations under the License. | [google](#requirement\_google) | ~> 5.0 | | [google-beta](#requirement\_google-beta) | ~> 5.0 | | [null](#requirement\_null) | ~> 3.0 | +| [random](#requirement\_random) | 3.6.3 | ## Providers @@ -256,6 +257,7 @@ limitations under the License. | [google](#provider\_google) | ~> 5.0 | | [google-beta](#provider\_google-beta) | ~> 5.0 | | [null](#provider\_null) | ~> 3.0 | +| [random](#provider\_random) | 3.6.3 | ## Modules @@ -277,6 +279,7 @@ limitations under the License. | [null_resource.enable_tcpx_in_workload](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [null_resource.enable_tcpxo_in_workload](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [null_resource.install_dependencies](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | +| [random_id.nodepool_name_suffix](https://registry.terraform.io/providers/hashicorp/random/3.6.3/docs/resources/id) | resource | | [google_compute_default_service_account.default_sa](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | | [google_compute_reservation.specific_reservations](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_reservation) | data source | @@ -304,7 +307,7 @@ limitations under the License. | [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | -| [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | +| [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type and a suffix with a random string. | `string` | `null` | no | | [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index f391532976..a9b8784a38 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -39,10 +39,14 @@ data "google_compute_default_service_account" "default_sa" { project = var.project_id } +resource "random_id" "nodepool_name_suffix" { + byte_length = 8 +} + resource "google_container_node_pool" "node_pool" { provider = google-beta - name = var.name == null ? var.machine_type : var.name + name = var.name == null ? "${var.machine_type}-${random_id.nodepool_name_suffix.hex}" : var.name cluster = var.cluster_id node_locations = var.zones diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index b24aef91df..069b82393f 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -31,7 +31,7 @@ variable "zones" { } variable "name" { - description = "The name of the node pool. If left blank, will default to the machine type." + description = "The name of the node pool. If left blank, will default to the machine type and a suffix with a random string." type = string default = null } diff --git a/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf index 2a27bfc342..d3a6076ed9 100644 --- a/modules/compute/gke-node-pool/versions.tf +++ b/modules/compute/gke-node-pool/versions.tf @@ -28,6 +28,10 @@ terraform { source = "hashicorp/null" version = "~> 3.0" } + random = { + source = "hashicorp/random" + version = "3.6.3" + } } provider_meta "google" { module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.40.0" From 303d676cf33e9eb55fe331f675d140c5860ca26b Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 11 Oct 2024 20:58:03 +0000 Subject: [PATCH 073/102] Remove mention of `./[community/]modules` from docs and examples * Remove mention of `./[community/]modules` from docs and examples; * Added line about not using it with toolkit modules; * Clean up outdated mentions of "role"; * Update recommendations for `startup-script.source` to use `ghpc_stage`. 
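A minimal sketch of the two conventions this change standardizes on (illustrative only;
the module IDs and the staged file name here are hypothetical):

```yaml
  # Toolkit modules are referenced by their embedded source, never by a local path:
  - id: network
    source: modules/network/vpc

  # Files shipped alongside a blueprint are staged explicitly with ghpc_stage:
  - id: startup
    source: modules/scripts/startup-script
    settings:
      runners:
      - type: shell
        destination: setup.sh                      # hypothetical file name
        source: $(ghpc_stage("scripts/setup.sh"))  # copied into the deployment group directory
```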
--- .../examples/hpc-slurm-ramble-gromacs.yaml | 1 - .../hpc-slurm-ubuntu2004-v5-legacy.yaml | 1 - community/examples/hpc-slurm-ubuntu2004.yaml | 1 - community/examples/htc-slurm-v5-legacy.yaml | 1 - community/examples/htc-slurm.yaml | 1 - .../examples/tutorial-starccm-slurm.yaml | 1 - .../README.md | 4 +- .../schedmd-slurm-gcp-v5-hybrid/README.md | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../blueprints/create-networks.yaml | 1 - docs/network_storage.md | 2 +- docs/tutorials/hpc-slurm-qwiklabs.yaml | 1 - examples/README.md | 51 ++++--------------- examples/hpc-enterprise-slurm-v5-legacy.yaml | 1 - examples/hpc-enterprise-slurm.yaml | 1 - examples/hpc-slurm-v5-legacy.yaml | 1 - examples/hpc-slurm.yaml | 1 - modules/README.md | 2 +- .../scheduler/batch-job-template/README.md | 2 +- modules/scripts/startup-script/README.md | 17 +++---- pkg/config/config_test.go | 3 -- .../daily-tests/validate_tests_metadata.py | 2 +- .../configs/versioned_blueprint.yaml | 1 - .../test_configs/2-network-interfaces.yaml | 13 +++-- .../test_configs/2filestore-4instances.yaml | 6 +-- .../test_configs/apt-collision.yaml | 3 +- .../test_configs/centos8-ss.yaml | 8 +-- .../cloud-batch-cft-instance-template.yaml | 2 +- .../test_configs/debian-ss.yaml | 8 +-- .../test_configs/exascaler-existing-vpc.yaml | 2 +- .../test_configs/exascaler-new-vpc.yaml | 2 +- .../test_configs/gpu-v5-legacy.yaml | 5 +- tools/validate_configs/test_configs/gpu.yaml | 5 +- .../test_configs/hpc-centos-ss.yaml | 8 +-- .../test_configs/instance-with-startup.yaml | 2 +- .../test_configs/new_project.yaml | 2 +- .../test_configs/nfs-servers.yaml | 4 +- .../test_configs/rocky-ss.yaml | 8 +-- .../test_configs/simple-startup.yaml | 8 +-- .../test_configs/spack-buildcache.yaml | 2 +- .../test_configs/spack-environments.yaml | 2 +- .../test_configs/startup-options.yaml | 12 ++--- .../test_configs/test_outputs.yaml | 4 +- .../test_configs/threads_per_core.yaml | 21 ++++---- .../test_configs/timeout_test.yaml | 6 +-- .../test_configs/ubuntu-ss.yaml | 8 +-- .../test_configs/vm-instance-local-ssd.yaml | 6 +-- tools/validate_configs/test_configs/vm.yaml | 4 +- 48 files changed, 100 insertions(+), 151 deletions(-) diff --git a/community/examples/hpc-slurm-ramble-gromacs.yaml b/community/examples/hpc-slurm-ramble-gromacs.yaml index 523b543c53..7a552c3477 100644 --- a/community/examples/hpc-slurm-ramble-gromacs.yaml +++ b/community/examples/hpc-slurm-ramble-gromacs.yaml @@ -31,7 +31,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network source: modules/network/vpc diff --git a/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml b/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml index 6b1875353a..916fcde74b 100644 --- a/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml +++ b/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml @@ -34,7 +34,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. 
To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc diff --git a/community/examples/hpc-slurm-ubuntu2004.yaml b/community/examples/hpc-slurm-ubuntu2004.yaml index 7e89520c05..34037a1052 100644 --- a/community/examples/hpc-slurm-ubuntu2004.yaml +++ b/community/examples/hpc-slurm-ubuntu2004.yaml @@ -33,7 +33,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc diff --git a/community/examples/htc-slurm-v5-legacy.yaml b/community/examples/htc-slurm-v5-legacy.yaml index d7ff1eccd3..1089cf9904 100644 --- a/community/examples/htc-slurm-v5-legacy.yaml +++ b/community/examples/htc-slurm-v5-legacy.yaml @@ -42,7 +42,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/pre-existing-vpc - id: network1 source: modules/network/vpc diff --git a/community/examples/htc-slurm.yaml b/community/examples/htc-slurm.yaml index 7165923bbb..9ba26025d7 100644 --- a/community/examples/htc-slurm.yaml +++ b/community/examples/htc-slurm.yaml @@ -42,7 +42,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/pre-existing-vpc - id: network source: modules/network/vpc diff --git a/community/examples/tutorial-starccm-slurm.yaml b/community/examples/tutorial-starccm-slurm.yaml index ce8dd0817f..b74eb44d33 100644 --- a/community/examples/tutorial-starccm-slurm.yaml +++ b/community/examples/tutorial-starccm-slurm.yaml @@ -32,7 +32,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md index 8db3950334..f0fb08ee1d 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md @@ -17,7 +17,7 @@ be accessed as `tpu` partition. ```yaml - id: tpu_nodeset - source: ./community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu use: [network] settings: node_type: v2-8 @@ -27,7 +27,7 @@ be accessed as `tpu` partition. 
preserve_tpu: false - id: tpu_partition - source: ./community/modules/compute/schedmd-slurm-gcp-v6-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: [tpu_nodeset] settings: partition_name: tpu diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index 9822d36eab..1d62cd393d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -73,7 +73,7 @@ The hybrid module can be added to a blueprint as follows: ```yaml - id: slurm-controller - source: ./community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid + source: community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid use: - debug-partition - compute-partition diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index a9d801d8c7..45d662f7d6 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -134,7 +134,7 @@ example: ```yaml - id: controller - source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: [ network, partition ] settings: enable_slurm_gcp_plugins: diff --git a/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml b/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml index 8c880d1c4c..19f1601f35 100644 --- a/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml +++ b/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml @@ -28,7 +28,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network0 source: modules/network/vpc settings: diff --git a/docs/network_storage.md b/docs/network_storage.md index e5f8903eb7..28a39594d6 100644 --- a/docs/network_storage.md +++ b/docs/network_storage.md @@ -41,7 +41,7 @@ as shown below: settings: {local_mount: /home} - id: workstation - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, homefs] # Note this line ``` diff --git a/docs/tutorials/hpc-slurm-qwiklabs.yaml b/docs/tutorials/hpc-slurm-qwiklabs.yaml index 3a51b77f7a..f4bfc81941 100644 --- a/docs/tutorials/hpc-slurm-qwiklabs.yaml +++ b/docs/tutorials/hpc-slurm-qwiklabs.yaml @@ -30,7 +30,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network source: modules/network/vpc diff --git a/examples/README.md b/examples/README.md index 0275fd930c..cc3e7a4412 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1730,10 +1730,9 @@ vars: deployment_groups: - group: groupName modules: - - # Local source, prefixed with ./ (/ and ../ also accepted) + # Embedded module (part of the toolkit), prefixed with `modules/` or `community/modules` - id: # Required: Name of this module used to uniquely identify it. - source: ./modules/role/module-name # Required: Points to the module directory. + source: modules/role/module-name # Required kind: < terraform | packer > # Optional: Type of module, currently choose from terraform or packer. 
If not specified, `kind` will default to `terraform` # Optional: All configured settings for the module. For terraform, each # variable listed in variables.tf can be set here, and are mandatory if no @@ -1747,14 +1746,18 @@ deployment_groups: key3a: value3a key3b: value3b - # Embedded module (part of the toolkit), prefixed with modules/ - - source: modules/role/module-name - # GitHub module over SSH, prefixed with git@github.com - - source: git@github.com:org/repo.git//modules/role/module-name + - source: git@github.com:org/repo.git//path/to/module # GitHub module over HTTPS, prefixed with github.com - - source: github.com/org/repo//modules/role/module-name + - source: github.com/org/repo//path/to/module + + # Local absolute source, prefixed with / + - source: /path/to/module + + # Local relative (to current working directory) source, prefixed with ./ or ../ + - source: ../path/to/module + # NOTE: Do not reference toolkit modules by local source, use embedded source instead. ``` ## Writing an HPC Blueprint @@ -1847,38 +1850,6 @@ When possible, custom modules should use these roles so that they match other modules defined by the toolkit. If a custom module does not fit into these roles, a new role can be defined. -A module's parent folder will define the module’s role if possible. Therefore, -regardless of where the module is located, the module directory should be -explicitly referenced at least 2 layers deep, where the top layer refers to the -“role” of that module. - -If a module is not defined at least 2 layers deep and the `ghpc_role` label has -not been explicitly set in settings, ghpc_role will default to `undefined`. - -Below we show some of the core modules and their roles (as parent folders). - -```text -modules/ -└── < - └── <> - -modules/ -├── compute -│ └── vm-instance -├── file-system -│ ├── pre-existing-network-storage -│ └── filestore -├── monitoring -│ └── dashboard -├── network -│ ├── pre-existing-vpc -│ └── vpc -├── packer -│ └── custom-image -└── scripts - └── startup-script -``` - ### Deployment Groups Deployment groups allow distinct sets of modules to be defined and deployed as a diff --git a/examples/hpc-enterprise-slurm-v5-legacy.yaml b/examples/hpc-enterprise-slurm-v5-legacy.yaml index 7c79b818ec..e482a10d15 100644 --- a/examples/hpc-enterprise-slurm-v5-legacy.yaml +++ b/examples/hpc-enterprise-slurm-v5-legacy.yaml @@ -53,7 +53,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network1 source: modules/network/pre-existing-vpc diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index d7520d3b85..69aeab57dc 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -46,7 +46,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network source: modules/network/vpc diff --git a/examples/hpc-slurm-v5-legacy.yaml b/examples/hpc-slurm-v5-legacy.yaml index 4a5277ee3b..234277208d 100644 --- a/examples/hpc-slurm-v5-legacy.yaml +++ b/examples/hpc-slurm-v5-legacy.yaml @@ -30,7 +30,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. 
To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc diff --git a/examples/hpc-slurm.yaml b/examples/hpc-slurm.yaml index 0a90bdcc89..0736772569 100644 --- a/examples/hpc-slurm.yaml +++ b/examples/hpc-slurm.yaml @@ -30,7 +30,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network source: modules/network/vpc diff --git a/modules/README.md b/modules/README.md index defba11446..e4974e3264 100644 --- a/modules/README.md +++ b/modules/README.md @@ -350,7 +350,7 @@ following module definition refers the local pre-existing-vpc modules. ```yaml - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc ``` > **_NOTE:_** Relative paths (beginning with `.` or `..` must be relative to the diff --git a/modules/scheduler/batch-job-template/README.md b/modules/scheduler/batch-job-template/README.md index 2bbb68fb07..a3a0b176b6 100644 --- a/modules/scheduler/batch-job-template/README.md +++ b/modules/scheduler/batch-job-template/README.md @@ -94,7 +94,7 @@ deployment_groups: source_image_project: cloud-hpc-image-public - id: batch-job - source: ./modules/scheduler/batch-job-template + source: modules/scheduler/batch-job-template settings: instance_template: $(batch-compute-template.self_link) outputs: [instructions] diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index e7f696d08f..c7dc178ae3 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -29,12 +29,11 @@ Each runner receives the following attributes: not. - `source`: (Optional) A path to the file or data you want to upload. Must be defined if `content` is not. The source path is relative to the deployment - group directory. Scripts distributed as part of modules should start with - `modules/` followed by the name of the module used (not to be confused with - the module ID) and the path to the script. The format is shown below: + group directory. To ensure correctness of path use `ghpc_stage` function, that + would copy referenced file to the deployment group directory. 
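A fuller sketch of a runner that stages a local file this way (the module ID, script name, and path below are illustrative, not part of this change):

```yaml
- id: startup
  source: modules/scripts/startup-script
  settings:
    runners:
    - type: shell
      destination: install_deps.sh                       # file name created on the target VM
      source: $(ghpc_stage("scripts/install_deps.sh"))   # staged into the deployment group directory
```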
For example: - ```text - source: ./modules/<>/<> + ```yaml + source: $(ghpc_stage("path/to/file")) ``` For more examples with context, see the @@ -188,7 +187,7 @@ For official documentation see troubleshooting docs: ```yaml - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: runners: # Some modules such as filestore have runners as outputs for convenience: @@ -212,7 +211,7 @@ For official documentation see troubleshooting docs: args: "bar.tgz 'Expanding file'" - id: compute-cluster - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [homefs, startup] ``` @@ -222,13 +221,13 @@ they are able to do so by using the `gcs_bucket_path` as shown in the below exam ```yaml - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: gcs_bucket_path: gs://user-test-bucket/folder1/folder2 install_stackdriver_agent: true - id: compute-cluster - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [startup] ``` diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 8c239a7971..2caabf2697 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -537,9 +537,6 @@ func (s *zeroSuite) TestCheckMovedModules(c *C) { // embedded moved c.Check(checkMovedModule("community/modules/scheduler/cloud-batch-job"), NotNil) - - // local moved - c.Assert(checkMovedModule("./community/modules/scheduler/cloud-batch-job"), NotNil) } func (s *zeroSuite) TestCheckStringLiteral(c *C) { diff --git a/tools/cloud-build/daily-tests/validate_tests_metadata.py b/tools/cloud-build/daily-tests/validate_tests_metadata.py index 5f0e60bb66..ee9f4ed6d9 100644 --- a/tools/cloud-build/daily-tests/validate_tests_metadata.py +++ b/tools/cloud-build/daily-tests/validate_tests_metadata.py @@ -42,7 +42,7 @@ def module_tag(src: str) -> Optional[str]: Remote sources are not supported (None). Ex: "modules/network/vpc" -> "m.vpc" """ - if not src.startswith(("modules/", "community/modules/", "./modules/", "./community/modules/")): + if not src.startswith(("modules/", "community/modules/")): return None return f"m.{os.path.basename(src)}" diff --git a/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml b/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml index 6344dd8d76..5240404a3c 100644 --- a/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml +++ b/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml @@ -48,7 +48,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network source: modules/network/vpc diff --git a/tools/validate_configs/test_configs/2-network-interfaces.yaml b/tools/validate_configs/test_configs/2-network-interfaces.yaml index f721e06893..dce54ba04c 100644 --- a/tools/validate_configs/test_configs/2-network-interfaces.yaml +++ b/tools/validate_configs/test_configs/2-network-interfaces.yaml @@ -27,7 +27,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. 
To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: default-network source: modules/network/pre-existing-vpc @@ -52,7 +51,7 @@ deployment_groups: # Test adding a pre-existing network via "use" - id: one-used-existing-ni - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: - default-network settings: @@ -61,7 +60,7 @@ deployment_groups: # Test adding a newly created network via "use" - id: one-used-new-ni - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: - new-network-1 settings: @@ -70,7 +69,7 @@ deployment_groups: # Test adding one pre-existing network via "network_interfaces" - id: one-explicit-existing-ni - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance settings: name_prefix: one-explicit-existing-ni machine_type: n2-standard-2 @@ -88,7 +87,7 @@ deployment_groups: # Test adding one newly created network via "network_interfaces" - id: one-explicit-new-ni - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance settings: name_prefix: one-explicit-new-ni machine_type: n2-standard-2 @@ -106,7 +105,7 @@ deployment_groups: # Test adding both a pre-existing network and a newly created network via "network_interfaces" - id: two-explicit-mixed-ni - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance settings: name_prefix: two-explicit-mixed-ni network_interfaces: @@ -136,7 +135,7 @@ deployment_groups: # Test adding two newly created networks via "network_interfaces" - id: two-explicit-new-ni - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance settings: name_prefix: two-explicit-new-ni network_interfaces: diff --git a/tools/validate_configs/test_configs/2filestore-4instances.yaml b/tools/validate_configs/test_configs/2filestore-4instances.yaml index 39fc4ad6cb..4239580c43 100644 --- a/tools/validate_configs/test_configs/2filestore-4instances.yaml +++ b/tools/validate_configs/test_configs/2filestore-4instances.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: infrastructure modules: - id: network - source: ./modules/network/vpc + source: modules/network/vpc - id: homefs source: modules/file-system/filestore @@ -38,7 +38,7 @@ deployment_groups: ghpc_role: storage-home - id: apps - source: ./modules/file-system/filestore + source: modules/file-system/filestore use: [network] settings: name: apps @@ -47,7 +47,7 @@ deployment_groups: ghpc_role: storage-apps - id: license-server-1 - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network, homefs] settings: name_prefix: ls1 diff --git a/tools/validate_configs/test_configs/apt-collision.yaml b/tools/validate_configs/test_configs/apt-collision.yaml index 9ab7a7e8a3..987fa2c159 100644 --- a/tools/validate_configs/test_configs/apt-collision.yaml +++ b/tools/validate_configs/test_configs/apt-collision.yaml @@ -32,13 +32,12 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. 
To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc ## Network - source: modules/network/vpc kind: terraform id: network1 - - source: ./modules/scripts/startup-script + - source: modules/scripts/startup-script kind: terraform id: startup settings: diff --git a/tools/validate_configs/test_configs/centos8-ss.yaml b/tools/validate_configs/test_configs/centos8-ss.yaml index 30a25c1728..ede36e2f10 100644 --- a/tools/validate_configs/test_configs/centos8-ss.yaml +++ b/tools/validate_configs/test_configs/centos8-ss.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: appsfs source: modules/file-system/filestore @@ -42,7 +42,7 @@ deployment_groups: auto_delete_disk: true - id: spack-setup - source: ./community/modules/scripts/spack-setup + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack @@ -58,7 +58,7 @@ deployment_groups: spack install cmake%gcc@10.3.0 target=x86_64 - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: runners: - type: data @@ -74,7 +74,7 @@ deployment_groups: - $(spack-execute.spack_runner) - id: instance - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, startup, nfs, appsfs] settings: machine_type: e2-standard-4 diff --git a/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml b/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml index 55446e69b6..2df9ca1276 100644 --- a/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml +++ b/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml @@ -59,7 +59,7 @@ deployment_groups: source_image_project: cloud-hpc-image-public - id: batch-job - source: ./modules/scheduler/batch-job-template + source: modules/scheduler/batch-job-template use: [network1, appfs, batch-startup-script] settings: runnable: "cat /sw/hello.txt" diff --git a/tools/validate_configs/test_configs/debian-ss.yaml b/tools/validate_configs/test_configs/debian-ss.yaml index b2a4a3e515..4cf9b7fad8 100644 --- a/tools/validate_configs/test_configs/debian-ss.yaml +++ b/tools/validate_configs/test_configs/debian-ss.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: appsfs source: modules/file-system/filestore @@ -42,7 +42,7 @@ deployment_groups: auto_delete_disk: true - id: spack-setup - source: ./community/modules/scripts/spack-setup + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack @@ -58,7 +58,7 @@ deployment_groups: spack install cmake%gcc@10.3.0 target=x86_64 - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: runners: - type: data @@ -74,7 +74,7 @@ deployment_groups: - $(spack-execute.spack_runner) - id: instance - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, startup, nfs, appsfs] settings: machine_type: e2-standard-4 diff --git a/tools/validate_configs/test_configs/exascaler-existing-vpc.yaml b/tools/validate_configs/test_configs/exascaler-existing-vpc.yaml index 3215ab4e1c..3476e7e10b 100644 --- a/tools/validate_configs/test_configs/exascaler-existing-vpc.yaml +++ b/tools/validate_configs/test_configs/exascaler-existing-vpc.yaml 
@@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: scratchfs source: community/modules/file-system/DDN-EXAScaler diff --git a/tools/validate_configs/test_configs/exascaler-new-vpc.yaml b/tools/validate_configs/test_configs/exascaler-new-vpc.yaml index 936ab51aa1..cca09d56c5 100644 --- a/tools/validate_configs/test_configs/exascaler-new-vpc.yaml +++ b/tools/validate_configs/test_configs/exascaler-new-vpc.yaml @@ -29,7 +29,7 @@ deployment_groups: source: modules/network/vpc - id: scratchfs - source: ./community/modules/file-system/DDN-EXAScaler + source: community/modules/file-system/DDN-EXAScaler use: [network1] settings: local_mount: /scratch diff --git a/tools/validate_configs/test_configs/gpu-v5-legacy.yaml b/tools/validate_configs/test_configs/gpu-v5-legacy.yaml index 611ec58e43..16f4a9fde8 100644 --- a/tools/validate_configs/test_configs/gpu-v5-legacy.yaml +++ b/tools/validate_configs/test_configs/gpu-v5-legacy.yaml @@ -62,12 +62,11 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network1 source: modules/network/pre-existing-vpc - id: nogpu-n1 - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: - network1 settings: @@ -76,7 +75,7 @@ deployment_groups: instance_image: $(vars.instance_image_vm) - id: manual-n1 - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: - network1 settings: diff --git a/tools/validate_configs/test_configs/gpu.yaml b/tools/validate_configs/test_configs/gpu.yaml index e8064e1534..f12bd323f1 100644 --- a/tools/validate_configs/test_configs/gpu.yaml +++ b/tools/validate_configs/test_configs/gpu.yaml @@ -62,12 +62,11 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. 
To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network source: modules/network/pre-existing-vpc - id: nogpu-n1 - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: - network settings: @@ -76,7 +75,7 @@ deployment_groups: instance_image: $(vars.instance_image_vm) - id: manual-n1 - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: - network settings: diff --git a/tools/validate_configs/test_configs/hpc-centos-ss.yaml b/tools/validate_configs/test_configs/hpc-centos-ss.yaml index 076afa9052..6937edc13d 100644 --- a/tools/validate_configs/test_configs/hpc-centos-ss.yaml +++ b/tools/validate_configs/test_configs/hpc-centos-ss.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: appsfs source: modules/file-system/filestore @@ -42,7 +42,7 @@ deployment_groups: auto_delete_disk: true - id: spack-setup - source: ./community/modules/scripts/spack-setup + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack @@ -58,7 +58,7 @@ deployment_groups: spack install cmake%gcc@10.3.0 target=x86_64 - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: runners: - type: data @@ -74,7 +74,7 @@ deployment_groups: - $(spack-execute.spack_runner) - id: instance - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, startup, nfs, appsfs] settings: machine_type: e2-standard-4 diff --git a/tools/validate_configs/test_configs/instance-with-startup.yaml b/tools/validate_configs/test_configs/instance-with-startup.yaml index fd6fb76c2e..caeb85ae63 100644 --- a/tools/validate_configs/test_configs/instance-with-startup.yaml +++ b/tools/validate_configs/test_configs/instance-with-startup.yaml @@ -43,6 +43,6 @@ deployment_groups: machine_type: e2-standard-8 - id: wait - source: ./community/modules/scripts/wait-for-startup + source: community/modules/scripts/wait-for-startup settings: instance_name: $(workstation.name[0]) diff --git a/tools/validate_configs/test_configs/new_project.yaml b/tools/validate_configs/test_configs/new_project.yaml index 069ac9587d..6a352b169d 100644 --- a/tools/validate_configs/test_configs/new_project.yaml +++ b/tools/validate_configs/test_configs/new_project.yaml @@ -24,7 +24,7 @@ deployment_groups: - group: primary modules: - id: project - source: ./community/modules/project/new-project + source: community/modules/project/new-project settings: folder_id: 334688113020 # random number billing_account: 111110-M2N704-854685 # random billing number diff --git a/tools/validate_configs/test_configs/nfs-servers.yaml b/tools/validate_configs/test_configs/nfs-servers.yaml index 676d8feb59..126ac220e5 100644 --- a/tools/validate_configs/test_configs/nfs-servers.yaml +++ b/tools/validate_configs/test_configs/nfs-servers.yaml @@ -37,7 +37,7 @@ deployment_groups: auto_delete_disk: true - id: appsfs - source: ./community/modules/file-system/nfs-server + source: community/modules/file-system/nfs-server use: [network1] outputs: [network_storage] settings: @@ -45,7 +45,7 @@ deployment_groups: auto_delete_disk: true - id: multiple-local-mounts - source: ./community/modules/file-system/nfs-server + source: community/modules/file-system/nfs-server use: [network1] outputs: [network_storage] settings: diff --git 
a/tools/validate_configs/test_configs/rocky-ss.yaml b/tools/validate_configs/test_configs/rocky-ss.yaml index 53a5c17cc7..c7bc912dbf 100644 --- a/tools/validate_configs/test_configs/rocky-ss.yaml +++ b/tools/validate_configs/test_configs/rocky-ss.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: appsfs source: modules/file-system/filestore @@ -45,7 +45,7 @@ deployment_groups: auto_delete_disk: true - id: spack-setup - source: ./community/modules/scripts/spack-setup + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack @@ -61,7 +61,7 @@ deployment_groups: spack install cmake%gcc@10.3.0 target=x86_64 - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: runners: - type: data @@ -77,7 +77,7 @@ deployment_groups: - $(spack-execute.spack_runner) - id: instance - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, startup, nfs, appsfs] settings: machine_type: e2-standard-4 diff --git a/tools/validate_configs/test_configs/simple-startup.yaml b/tools/validate_configs/test_configs/simple-startup.yaml index 284a1e3855..8c4016eb1d 100644 --- a/tools/validate_configs/test_configs/simple-startup.yaml +++ b/tools/validate_configs/test_configs/simple-startup.yaml @@ -26,10 +26,10 @@ deployment_groups: - group: primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: runners: - type: data @@ -44,12 +44,12 @@ deployment_groups: args: "foo.tgz 'Expanding the file'" - id: instance - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, startup] settings: machine_type: e2-standard-4 - id: waiter - source: ./community/modules/scripts/wait-for-startup + source: community/modules/scripts/wait-for-startup settings: instance_name: $(instance.name[0]) diff --git a/tools/validate_configs/test_configs/spack-buildcache.yaml b/tools/validate_configs/test_configs/spack-buildcache.yaml index aaba729742..13f2930f05 100644 --- a/tools/validate_configs/test_configs/spack-buildcache.yaml +++ b/tools/validate_configs/test_configs/spack-buildcache.yaml @@ -29,7 +29,7 @@ deployment_groups: source: modules/network/pre-existing-vpc - id: spack-setup - source: ./community/modules/scripts/spack-setup + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack diff --git a/tools/validate_configs/test_configs/spack-environments.yaml b/tools/validate_configs/test_configs/spack-environments.yaml index 94a86065f4..eb260a915f 100644 --- a/tools/validate_configs/test_configs/spack-environments.yaml +++ b/tools/validate_configs/test_configs/spack-environments.yaml @@ -29,7 +29,7 @@ deployment_groups: source: modules/network/pre-existing-vpc - id: spack-setup - source: ./community/modules/scripts/spack-setup + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack diff --git a/tools/validate_configs/test_configs/startup-options.yaml b/tools/validate_configs/test_configs/startup-options.yaml index 2d4fd1b303..4ca1555cd7 100644 --- a/tools/validate_configs/test_configs/startup-options.yaml +++ b/tools/validate_configs/test_configs/startup-options.yaml @@ -26,10 +26,10 @@ deployment_groups: - group: 
primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: ansible_virtualenv_path: /usr/local/ghpc runners: @@ -48,7 +48,7 @@ deployment_groups: destination: empty_tasks.yaml - id: instance-explicit-startup - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1] settings: name_prefix: explicit @@ -56,21 +56,21 @@ deployment_groups: startup_script: $(startup.startup_script) - id: instance-no-startup - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1] settings: name_prefix: no-startup machine_type: e2-standard-4 - id: instance-use-startup - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, startup] settings: name_prefix: use-startup machine_type: e2-standard-4 - id: instance-metadata-startup - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1] settings: name_prefix: metadata-startup diff --git a/tools/validate_configs/test_configs/test_outputs.yaml b/tools/validate_configs/test_configs/test_outputs.yaml index 5de0b7bc21..6bb0bb48d2 100644 --- a/tools/validate_configs/test_configs/test_outputs.yaml +++ b/tools/validate_configs/test_configs/test_outputs.yaml @@ -47,7 +47,7 @@ deployment_groups: - install_nfs_client - id: nfs - source: ./community/modules/file-system/nfs-server + source: community/modules/file-system/nfs-server outputs: - network_storage - install_nfs_client @@ -131,7 +131,7 @@ deployment_groups: - startup_script - id: lustre - source: ./community/modules/file-system/DDN-EXAScaler + source: community/modules/file-system/DDN-EXAScaler outputs: - private_addresses - ssh_console diff --git a/tools/validate_configs/test_configs/threads_per_core.yaml b/tools/validate_configs/test_configs/threads_per_core.yaml index de06cab879..ed513aedfa 100644 --- a/tools/validate_configs/test_configs/threads_per_core.yaml +++ b/tools/validate_configs/test_configs/threads_per_core.yaml @@ -27,13 +27,12 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. 
To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network1 source: modules/network/pre-existing-vpc kind: terraform - id: n1-2-threads - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -43,7 +42,7 @@ deployment_groups: threads_per_core: 2 - id: n1-1-thread - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -53,7 +52,7 @@ deployment_groups: threads_per_core: 1 - id: n1-0-threads - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -63,7 +62,7 @@ deployment_groups: threads_per_core: 0 - id: n1-null-threads - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -73,7 +72,7 @@ deployment_groups: threads_per_core: null - id: n2-2-threads - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -83,7 +82,7 @@ deployment_groups: threads_per_core: 2 - id: n2-1-thread - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -93,7 +92,7 @@ deployment_groups: threads_per_core: 1 - id: c2-2-threads - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -103,7 +102,7 @@ deployment_groups: threads_per_core: 2 - id: c2-1-thread - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -113,7 +112,7 @@ deployment_groups: threads_per_core: 1 - id: e2-medium-0-thread - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -123,7 +122,7 @@ deployment_groups: threads_per_core: 0 - id: e2-medium-null-thread - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 diff --git a/tools/validate_configs/test_configs/timeout_test.yaml b/tools/validate_configs/test_configs/timeout_test.yaml index 95305e7923..70a9d37517 100644 --- a/tools/validate_configs/test_configs/timeout_test.yaml +++ b/tools/validate_configs/test_configs/timeout_test.yaml @@ -32,21 +32,21 @@ deployment_groups: source: modules/network/vpc - id: gcs - source: ./modules/file-system/pre-existing-network-storage + source: modules/file-system/pre-existing-network-storage settings: remote_mount: hpc-toolkit-service-catalog-solutions local_mount: /catalog fs_type: gcsfuse - id: compute-hpc-image - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, gcs] settings: machine_type: n2-standard-2 name_prefix: hpc-image - id: wait - source: ./community/modules/scripts/wait-for-startup + source: community/modules/scripts/wait-for-startup settings: instance_name: $(compute-hpc-image.name[0]) timeout: 25 diff --git a/tools/validate_configs/test_configs/ubuntu-ss.yaml b/tools/validate_configs/test_configs/ubuntu-ss.yaml index 67a15a8437..8f1d40a0ce 100644 --- a/tools/validate_configs/test_configs/ubuntu-ss.yaml +++ b/tools/validate_configs/test_configs/ubuntu-ss.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: appsfs source: modules/file-system/filestore @@ -42,7 +42,7 @@ deployment_groups: auto_delete_disk: true - id: spack-setup - source: 
./community/modules/scripts/spack-setup + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack @@ -61,7 +61,7 @@ deployment_groups: spack install fftw%intel@18.0.5 target=skylake ^intel-mpi@2018.4.274%intel@18.0.5 target=x86_64 - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: install_stackdriver_agent: true runners: @@ -78,7 +78,7 @@ deployment_groups: - $(spack-execute.spack_runner) - id: instance - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, startup, nfs, appsfs] settings: machine_type: e2-standard-4 diff --git a/tools/validate_configs/test_configs/vm-instance-local-ssd.yaml b/tools/validate_configs/test_configs/vm-instance-local-ssd.yaml index 7adcc33496..467e0641fa 100644 --- a/tools/validate_configs/test_configs/vm-instance-local-ssd.yaml +++ b/tools/validate_configs/test_configs/vm-instance-local-ssd.yaml @@ -26,10 +26,10 @@ deployment_groups: - group: primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: multi-instance-multi-ssd - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1] settings: machine_type: n2-standard-16 @@ -37,7 +37,7 @@ deployment_groups: local_ssd_count: 2 - id: instance-ssd-interface-defined - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1] settings: machine_type: n2-standard-16 diff --git a/tools/validate_configs/test_configs/vm.yaml b/tools/validate_configs/test_configs/vm.yaml index 8de1ce67e2..4941721125 100644 --- a/tools/validate_configs/test_configs/vm.yaml +++ b/tools/validate_configs/test_configs/vm.yaml @@ -30,7 +30,7 @@ deployment_groups: - id: network1 source: modules/network/pre-existing-vpc - - source: ./modules/compute/vm-instance + - source: modules/compute/vm-instance id: compute_instances_family use: [network1] settings: @@ -47,7 +47,7 @@ deployment_groups: # project: $(vars.project_id) # family: myubuntu - - source: ./modules/compute/vm-instance + - source: modules/compute/vm-instance id: compute_instances_name use: [network1] settings: From ca66b8c6e3e55eeb26f90e928cc9c6a43ca46acd Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Wed, 9 Oct 2024 18:47:35 +0000 Subject: [PATCH 074/102] guest_accelerator modifications --- .../compute/htcondor-execute-point/gpu_definition.tf | 4 ++-- .../schedmd-slurm-gcp-v5-node-group/gpu_definition.tf | 4 ++-- .../schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf | 4 ++-- .../compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf | 4 ++-- .../schedmd-slurm-gcp-v5-controller/gpu_definition.tf | 4 ++-- .../scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/gpu_definition.tf | 4 ++-- .../scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf | 4 ++-- modules/compute/gke-node-pool/gpu_definition.tf | 4 ++-- modules/compute/gke-node-pool/main.tf | 6 +++--- modules/compute/gke-node-pool/reservation_definitions.tf | 2 +- modules/compute/vm-instance/gpu_definition.tf | 4 ++-- modules/compute/vm-instance/main.tf | 2 +- 13 files changed, 25 insertions(+), 25 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/gpu_definition.tf b/community/modules/compute/htcondor-execute-point/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/community/modules/compute/htcondor-execute-point/gpu_definition.tf +++ 
b/community/modules/compute/htcondor-execute-point/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - 
generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) 
var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/modules/compute/gke-node-pool/gpu_definition.tf b/modules/compute/gke-node-pool/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/modules/compute/gke-node-pool/gpu_definition.tf +++ b/modules/compute/gke-node-pool/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index f391532976..f7ef813496 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -23,7 +23,7 @@ locals { sa_email = var.service_account_email != null ? var.service_account_email : data.google_compute_default_service_account.default_sa.email preattached_gpu_machine_family = contains(["a2", "a3", "g2"], local.machine_family) - has_gpu = (local.guest_accelerator != null && length(local.guest_accelerator) > 0) || local.preattached_gpu_machine_family + has_gpu = (local.guest_accelerator != null && (length([for ga in local.guest_accelerator : ga if ga.count > 0]) > 0)) || local.preattached_gpu_machine_family gpu_taint = local.has_gpu ? 
[{ key = "nvidia.com/gpu" value = "present" @@ -89,13 +89,13 @@ resource "google_container_node_pool" "node_pool" { image_type = var.image_type dynamic "guest_accelerator" { - for_each = local.guest_accelerator + for_each = { for idx, ga in local.guest_accelerator : idx => ga if ga.count > 0 } content { type = coalesce(guest_accelerator.value.type, try(local.generated_guest_accelerator[0].type, "")) count = coalesce(try(guest_accelerator.value.count, 0) > 0 ? guest_accelerator.value.count : try(local.generated_guest_accelerator[0].count, "0")) gpu_driver_installation_config = coalescelist(try(guest_accelerator.value.gpu_driver_installation_config, []), [{ gpu_driver_version = "DEFAULT" }]) gpu_partition_size = try(guest_accelerator.value.gpu_partition_size, "") - gpu_sharing_config = try(guest_accelerator.value.gpu_sharing_config, []) + gpu_sharing_config = try(guest_accelerator.value.gpu_sharing_config, null) } } diff --git a/modules/compute/gke-node-pool/reservation_definitions.tf b/modules/compute/gke-node-pool/reservation_definitions.tf index d40cc5b01f..a75246b185 100644 --- a/modules/compute/gke-node-pool/reservation_definitions.tf +++ b/modules/compute/gke-node-pool/reservation_definitions.tf @@ -55,7 +55,7 @@ locals { }] nodepool_vm_properties = { "machine_type" : var.machine_type - "guest_accelerators" : { for acc in try(local.guest_accelerator, []) : coalesce(acc.type, try(local.generated_guest_accelerator[0].type, "")) => coalesce(acc.count, try(local.generated_guest_accelerator[0].count, 0)) }, + "guest_accelerators" : { for acc in try(local.guest_accelerator, []) : (acc.count > 0 ? coalesce(acc.type, try(local.generated_guest_accelerator[0].type, "")) : "") => acc.count if acc.count > 0 }, "local_ssds" : { "NVME" : coalesce(local.local_ssd_config.local_ssd_count_nvme_block, 0), "SCSI" : coalesce(local.local_ssd_config.local_ssd_count_ephemeral_storage, 0) diff --git a/modules/compute/vm-instance/gpu_definition.tf b/modules/compute/vm-instance/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/modules/compute/vm-instance/gpu_definition.tf +++ b/modules/compute/vm-instance/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/modules/compute/vm-instance/main.tf b/modules/compute/vm-instance/main.tf index 683fa77682..01207d701f 100644 --- a/modules/compute/vm-instance/main.tf +++ b/modules/compute/vm-instance/main.tf @@ -39,7 +39,7 @@ locals { # compact_placement : true when placement policy is provided and collocation set; false if unset compact_placement = try(var.placement_policy.collocation, null) != null - gpu_attached = contains(["a2", "g2"], local.machine_family) || length(local.guest_accelerator) > 0 + gpu_attached = contains(["a2", "g2"], local.machine_family) || (length([for ga in local.guest_accelerator : ga if ga.count > 0]) > 0) # both of these must be false 
if either compact placement or preemptible/spot instances are used # automatic restart is tolerant of GPUs while on host maintenance is not From 12f6d3fedbc59fdc4b86f10ac222b081e3dbb09e Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 11 Oct 2024 23:50:51 +0000 Subject: [PATCH 075/102] random_id replaced by module_id --- modules/compute/gke-node-pool/README.md | 6 ++---- modules/compute/gke-node-pool/main.tf | 8 +++----- modules/compute/gke-node-pool/metadata.yaml | 2 ++ modules/compute/gke-node-pool/variables.tf | 10 +++++++++- modules/compute/gke-node-pool/versions.tf | 4 ---- 5 files changed, 16 insertions(+), 14 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index ec0365556e..797327807c 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -248,7 +248,6 @@ limitations under the License. | [google](#requirement\_google) | ~> 5.0 | | [google-beta](#requirement\_google-beta) | ~> 5.0 | | [null](#requirement\_null) | ~> 3.0 | -| [random](#requirement\_random) | 3.6.3 | ## Providers @@ -257,7 +256,6 @@ limitations under the License. | [google](#provider\_google) | ~> 5.0 | | [google-beta](#provider\_google-beta) | ~> 5.0 | | [null](#provider\_null) | ~> 3.0 | -| [random](#provider\_random) | 3.6.3 | ## Modules @@ -279,7 +277,6 @@ limitations under the License. | [null_resource.enable_tcpx_in_workload](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [null_resource.enable_tcpxo_in_workload](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [null_resource.install_dependencies](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [random_id.nodepool_name_suffix](https://registry.terraform.io/providers/hashicorp/random/3.6.3/docs/resources/id) | resource | | [google_compute_default_service_account.default_sa](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | | [google_compute_reservation.specific_reservations](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_reservation) | data source | @@ -301,13 +298,14 @@ limitations under the License. | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | +| [ineternal\_filed\_to\_be\_used\_by\_secret\_coven\_of\_mages\_do\_not\_touch](#input\_ineternal\_filed\_to\_be\_used\_by\_secret\_coven\_of\_mages\_do\_not\_touch) | Populates with module id (unique blueprint-wide). | `string` | n/a | yes | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | -| [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type and a suffix with a random string. | `string` | `null` | no | +| [name](#input\_name) | The name of the node pool. If not set, automatically populated by machine type and module id (unique blueprint-wide) as suffix.
If setting manually, ensure a unique value across all gke-node-pools. | `string` | `null` | no | | [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no |
| [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes |
| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs to be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index a9b8784a38..6bd6f74274 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -33,20 +33,18 @@ locals { autoscale_set = var.autoscaling_total_min_nodes != 0 || var.autoscaling_total_max_nodes != 1000 static_node_set = var.static_node_count != null initial_node_set = try(var.initial_node_count > 0, false) + + module_unique_id = replace(lower(var.ineternal_filed_to_be_used_by_secret_coven_of_mages_do_not_touch), "/[^a-z0-9]/", "") } data "google_compute_default_service_account" "default_sa" { project = var.project_id } -resource "random_id" "nodepool_name_suffix" { - byte_length = 8 -} - resource "google_container_node_pool" "node_pool" { provider = google-beta - name = var.name == null ? "${var.machine_type}-${random_id.nodepool_name_suffix.hex}" : var.name + name = var.name == null ? "${var.machine_type}-${local.module_unique_id}" : var.name cluster = var.cluster_id node_locations = var.zones diff --git a/modules/compute/gke-node-pool/metadata.yaml b/modules/compute/gke-node-pool/metadata.yaml index bd1517ce8f..06ab5ae608 100644 --- a/modules/compute/gke-node-pool/metadata.yaml +++ b/modules/compute/gke-node-pool/metadata.yaml @@ -17,3 +17,5 @@ spec: requirements: services: - container.googleapis.com +ghpc: + inject_module_id: ineternal_filed_to_be_used_by_secret_coven_of_mages_do_not_touch diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 069b82393f..c85e524fcc 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -31,11 +31,19 @@ variable "zones" { } variable "name" { - description = "The name of the node pool. If left blank, will default to the machine type and a suffix with a random string." + description = <<-EOD + The name of the node pool. If not set, automatically populated by machine type and module id (unique blueprint-wide) as suffix. + If setting manually, ensure a unique value across all gke-node-pools. + EOD type = string default = null } +variable "ineternal_filed_to_be_used_by_secret_coven_of_mages_do_not_touch" { + description = "Populates with module id (unique blueprint-wide)." + type = string +} + variable "machine_type" { description = "The name of a Google Compute Engine machine type." 
type = string diff --git a/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf index d3a6076ed9..2a27bfc342 100644 --- a/modules/compute/gke-node-pool/versions.tf +++ b/modules/compute/gke-node-pool/versions.tf @@ -28,10 +28,6 @@ terraform { source = "hashicorp/null" version = "~> 3.0" } - random = { - source = "hashicorp/random" - version = "3.6.3" - } } provider_meta "google" { module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.40.0" From 5d1eed07a88f47f489c58fce02d2a99bc86d9f39 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 9 Oct 2024 20:54:04 +0000 Subject: [PATCH 076/102] Update Slurm-gcp v5 deprecation details --- .../schedmd-slurm-gcp-v5-controller/README.md | 7 +++++++ .../schedmd-slurm-gcp-v5-hybrid/README.md | 8 ++++++++ .../schedmd-slurm-gcp-v5-login/README.md | 7 +++++++ examples/README.md | 15 +++++++-------- 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index 501f2b0dba..b2a1bb503e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -1,5 +1,12 @@ ## Description +> [!NOTE] +> Slurm-gcp-v5-controller module is deprecated. See +> [this update](#completed-migration-to-slurm-gcp-v6) for specific recommendations +> and timelines. + +* [Completed Migration to Slurm-GCP v6](../../../../modules/README.md#completed-migration-to-slurm-gcp-v6) + This module creates a slurm controller node via the [SchedMD/slurm-gcp] [slurm\_controller\_instance] and [slurm\_instance\_template] modules. diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index 9822d36eab..bc58f82f9c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -1,4 +1,12 @@ ## Description + +> [!NOTE] +> Slurm-gcp-v5-hybrid module is deprecated. See +> [this update](#completed-migration-to-slurm-gcp-v6) for specific recommendations +> and timelines. + +* [Completed Migration to Slurm-GCP v6](../../../../modules/README.md#completed-migration-to-slurm-gcp-v6) + This module is a wrapper around the [slurm-controller-hybrid] module by SchedMD as part of the [slurm-gcp] github repository. The hybrid module serves to create the configurations needed to extend an on-premise slurm cluster to one with one diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index a43636cd8c..80d969ade6 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -1,5 +1,12 @@ ## Description +> [!NOTE] +> Slurm-gcp-v5-login module is deprecated. See +> [this update](#completed-migration-to-slurm-gcp-v6) for specific recommendations +> and timelines. + +* [Completed Migration to Slurm-GCP v6](../../../../modules/README.md#completed-migration-to-slurm-gcp-v6) + This module creates a login node for a Slurm cluster based on the [SchedMD/slurm-gcp] [slurm\_instance\_template] and [slurm\_login\_instance] terraform modules. 
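As noted elsewhere in this change, users who stay on the deprecated Slurm-GCP v5 modules after removal can reference specific git tags in the module source lines. A minimal sketch of such a pinned source (module IDs, repository path, and tag are illustrative, not prescribed by this patch):

```yaml
- id: slurm_login
  source: github.com/GoogleCloudPlatform/cluster-toolkit//community/modules/scheduler/schedmd-slurm-gcp-v5-login?ref=v1.37.0
  use: [network, slurm_controller]
```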
The login node is used in conjunction with the diff --git a/examples/README.md b/examples/README.md index 0275fd930c..7dec270823 100644 --- a/examples/README.md +++ b/examples/README.md @@ -2,7 +2,7 @@ > [!NOTE] > Migration to Slurm-GCP v6 is completed. See -> [this update](#ongoing-migration-to-slurm-gcp-v6) for specific recommendations +> [this update](#completed-migration-to-slurm-gcp-v6) for specific recommendations > and timelines. This directory contains a set of example blueprint files that can be fed into @@ -15,7 +15,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [Instructions](#instructions) * [(Optional) Setting up a remote terraform state](#optional-setting-up-a-remote-terraform-state) -* [Ongoing Migration to Slurm-GCP v6](#ongoing-migration-to-slurm-gcp-v6) +* [Completed Migration to Slurm-GCP v6](#completed-migration-to-slurm-gcp-v6) * [Blueprint Descriptions](#blueprint-descriptions) * [hpc-slurm-v5-legacy.yaml](#hpc-slurm-v5-legacyyaml-) ![core-badge] * [hpc-slurm.yaml](#hpc-slurmyaml-) ![core-badge] @@ -141,7 +141,7 @@ subcommands as well: [configuration block]: https://developer.hashicorp.com/terraform/language/settings/backends/configuration#using-a-backend-block [gcs]: https://developer.hashicorp.com/terraform/language/settings/backends/gcs -## Ongoing Migration to Slurm-GCP v6 +## Completed Migration to Slurm-GCP v6 [Slurm-GCP](https://github.com/GoogleCloudPlatform/slurm-gcp) is the set of scripts and tools that automate the installation, deployment, and certain @@ -150,16 +150,15 @@ Google Cloud Platform. It is recommended to use Slurm-GCP through the Cluster Toolkit where it is exposed as various modules. The Cluster Toolkit team has finished transitioning from Slurm-GCP v5 to Slurm-GCP v6 and -now Slurm-GCP v6 is the recommended option. Following this, blueprint naming would be -as follows: +as of 10/11/2024, Slurm-GCP v6 is the recommended option. Blueprint naming is as +follows: * Slurm v5: hpc-slurm-v5-legacy.yaml * Slurm v6: hpc-slurm.yaml > [!IMPORTANT] -> Three months after Slurm-gcp V6 becomes the recommended version, Slurm v5 -> modules will be marked as deprecated and will be maintained in our repo for -> another three months, at which point the modules will be removed from the Cluster +> Slurm-GCP v5 modules are now marked as deprecated and will be maintained in our +> repo till January 6, 2025. After that, the modules will be removed from the Cluster > Toolkit repo and regression tests will no longer run for V5. Those who choose > to not upgrade to V6 will still be able to use V5 modules by referencing > specific git tags in the module source lines. From f0d7803b1f0e709e584d72fe7d451183755d46ae Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Sat, 12 Oct 2024 00:31:26 +0000 Subject: [PATCH 077/102] module id variable name modified --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/main.tf | 4 ++-- modules/compute/gke-node-pool/metadata.yaml | 2 +- modules/compute/gke-node-pool/variables.tf | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 797327807c..efdcb22b95 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -298,8 +298,8 @@ limitations under the License. | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | -| [ineternal\_filed\_to\_be\_used\_by\_secret\_coven\_of\_mages\_do\_not\_touch](#input\_ineternal\_filed\_to\_be\_used\_by\_secret\_coven\_of\_mages\_do\_not\_touch) | Populates with module id (unique blueprint-wide). | `string` | n/a | yes | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | +| [internal\_ghpc\_module\_id](#input\_internal\_ghpc\_module\_id) | Populates with module id (unique blueprint-wide). | `string` | n/a | yes | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 6bd6f74274..6e16771a3a 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -34,7 +34,7 @@ locals { static_node_set = var.static_node_count != null initial_node_set = try(var.initial_node_count > 0, false) - module_unique_id = replace(lower(var.ineternal_filed_to_be_used_by_secret_coven_of_mages_do_not_touch), "/[^a-z0-9]/", "") + module_unique_id = replace(lower(var.internal_ghpc_module_id), "/[^a-z0-9\\-]/", "") } data "google_compute_default_service_account" "default_sa" { @@ -44,7 +44,7 @@ data "google_compute_default_service_account" "default_sa" { resource "google_container_node_pool" "node_pool" { provider = google-beta - name = var.name == null ? "${var.machine_type}-${local.module_unique_id}" : var.name + name = coalesce(var.name, "${var.machine_type}-${local.module_unique_id}") cluster = var.cluster_id node_locations = var.zones diff --git a/modules/compute/gke-node-pool/metadata.yaml b/modules/compute/gke-node-pool/metadata.yaml index 06ab5ae608..e980d595a2 100644 --- a/modules/compute/gke-node-pool/metadata.yaml +++ b/modules/compute/gke-node-pool/metadata.yaml @@ -18,4 +18,4 @@ spec: services: - container.googleapis.com ghpc: - inject_module_id: ineternal_filed_to_be_used_by_secret_coven_of_mages_do_not_touch + inject_module_id: internal_ghpc_module_id diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index c85e524fcc..a258cf8a47 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -39,7 +39,7 @@ variable "name" { default = null } -variable "ineternal_filed_to_be_used_by_secret_coven_of_mages_do_not_touch" { +variable "internal_ghpc_module_id" { description = "Populates with module id (unique blueprint-wide)." type = string } From e2ea01b946975e784bbc17ad60b82d4ff674d5c5 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Sat, 12 Oct 2024 21:23:04 +0000 Subject: [PATCH 078/102] module id variable decription updated --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/variables.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index efdcb22b95..d15f644b80 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -299,7 +299,7 @@ limitations under the License. | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | -| [internal\_ghpc\_module\_id](#input\_internal\_ghpc\_module\_id) | Populates with module id (unique blueprint-wide). 
| `string` | n/a | yes | +| [internal\_ghpc\_module\_id](#input\_internal\_ghpc\_module\_id) | DO NOT SET THIS MANUALLY. Automatically populates with module id (unique blueprint-wide). | `string` | n/a | yes | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index a258cf8a47..f5f31abde0 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -40,7 +40,7 @@ variable "name" { } variable "internal_ghpc_module_id" { - description = "Populates with module id (unique blueprint-wide)." + description = "DO NOT SET THIS MANUALLY. Automatically populates with module id (unique blueprint-wide)." type = string } From d8b287953981153089071c6774b3c1b83e13becb Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Sun, 13 Oct 2024 19:29:21 +0000 Subject: [PATCH 079/102] fix comment --- examples/gke-storage-parallelstore.yaml | 4 +++- modules/file-system/gke-storage/README.md | 4 ++-- modules/file-system/gke-storage/variables.tf | 11 ++++++----- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/examples/gke-storage-parallelstore.yaml b/examples/gke-storage-parallelstore.yaml index daecc6657e..413e523da7 100644 --- a/examples/gke-storage-parallelstore.yaml +++ b/examples/gke-storage-parallelstore.yaml @@ -24,7 +24,7 @@ vars: authorized_cidr: /32 deployment_groups: -- group: primary +- group: setup modules: - id: network source: modules/network/vpc @@ -43,6 +43,8 @@ deployment_groups: settings: prefix_length: 24 +- group: primary + modules: - id: gke_cluster source: modules/scheduler/gke-cluster use: [network] diff --git a/modules/file-system/gke-storage/README.md b/modules/file-system/gke-storage/README.md index c578a4a0d8..17c718aa37 100644 --- a/modules/file-system/gke-storage/README.md +++ b/modules/file-system/gke-storage/README.md @@ -5,7 +5,7 @@ to dynamically provision GCP storage resources like Parallelstore. ### Example -The following example uses the `gke-storage` module to creates a Parallelstore Storage Class and Peresistent Volume Claim, +The following example uses the `gke-storage` module to creates a Parallelstore Storage Class and Persistent Volume Claim, then use them in a `gke-job-template` to dynamically provision the resource. ```yaml @@ -119,7 +119,7 @@ No resources. | [sc\_reclaim\_policy](#input\_sc\_reclaim\_policy) | Indicate whether to keep the dynamically provisioned PersistentVolumes of this storage class after the bound PersistentVolumeClaim is deleted.
[More details about reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming)
Supported values:
- Retain
- Delete | `string` | n/a | yes | | [sc\_topology\_zones](#input\_sc\_topology\_zones) | Zone location that allow the volumes to be dynamically provisioned. | `list(string)` | `null` | no | | [sc\_volume\_binding\_mode](#input\_sc\_volume\_binding\_mode) | Indicates when volume binding and dynamic provisioning should occur and how PersistentVolumeClaims should be provisioned and bound.
Supported values:
- Immediate
- WaitForFirstConsumer | `string` | `"WaitForFirstConsumer"` | no | -| [storage\_type](#input\_storage\_type) | The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview)
to be used. This module currently supports dynamic provisioning for the following storage options:
- Parallelstore
- Hyperdisk-balanced
- Hyperdisk-throughput
- Hyperdisk-extreme | `string` | n/a | yes | +| [storage\_type](#input\_storage\_type) | The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview)
to be used. This module currently supports dynamic provisioning for the following storage options:
- Parallelstore | `string` | n/a | yes | ## Outputs diff --git a/modules/file-system/gke-storage/variables.tf b/modules/file-system/gke-storage/variables.tf index 97ff1af21b..9ad3b839d8 100644 --- a/modules/file-system/gke-storage/variables.tf +++ b/modules/file-system/gke-storage/variables.tf @@ -34,15 +34,12 @@ variable "storage_type" { The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview) to used. This module currently support dynamic provisioning for the below storage options - Parallelstore - - Hyperdisk-balanced - - Hyperdisk-throughput - - Hyperdisk-extreme EOT type = string nullable = false validation { - condition = var.storage_type == null ? false : contains(["parallelstore", "hyperdisk-balanced", "hyperdisk-throughput", "hyperdisk-extreme"], lower(var.storage_type)) - error_message = "Allowed string values for var.storage_type are \"Parallelstore\", \"Hyperdisk-balanced\", \"Hyperdisk-throughput\", \"Hyperdisk-extreme\"." + condition = var.storage_type == null ? false : contains(["parallelstore"], lower(var.storage_type)) + error_message = "Allowed string values for var.storage_type are \"Parallelstore\"." } } @@ -110,6 +107,10 @@ variable "pv_mount_path" { description = "Path within the container at which the volume should be mounted. Must not contain ':'." type = string default = "/data" + validation { + condition = var.pv_mount_path == null ? true : !strcontains(var.pv_mount_path, ":") + error_message = "pv_mount_path must not contain ':', please correct it and retry" + } } variable "mount_options" { From b0217dece2627de4d6c28c9162fa2810d1142279 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 9 Oct 2024 20:28:19 +0000 Subject: [PATCH 080/102] Mark slurm-gcp v5 version of blueprints as deprecated --- examples/README.md | 40 ++++++++++++++++++++-------------------- modules/README.md | 22 +++++++++++----------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/examples/README.md b/examples/README.md index 7dec270823..9bd2c60b1c 100644 --- a/examples/README.md +++ b/examples/README.md @@ -17,36 +17,36 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [(Optional) Setting up a remote terraform state](#optional-setting-up-a-remote-terraform-state) * [Completed Migration to Slurm-GCP v6](#completed-migration-to-slurm-gcp-v6) * [Blueprint Descriptions](#blueprint-descriptions) - * [hpc-slurm-v5-legacy.yaml](#hpc-slurm-v5-legacyyaml-) ![core-badge] + * [hpc-slurm-v5-legacy.yaml](#hpc-slurm-v5-legacyyaml--) ![core-badge] ![deprecated-badge] * [hpc-slurm.yaml](#hpc-slurmyaml-) ![core-badge] - * [hpc-enterprise-slurm-v5-legacy.yaml](#hpc-enterprise-slurm-v5-legacyyaml-) ![core-badge] + * [hpc-enterprise-slurm-v5-legacy.yaml](#hpc-enterprise-slurm-v5-legacyyaml--) ![core-badge] ![deprecated-badge] * [hpc-enterprise-slurm.yaml](#hpc-enterprise-slurmyaml-) ![core-badge] * [hpc-slurm-static.yaml](#hpc-slurm-staticyaml-) ![core-badge] * [hpc-slurm6-tpu.yaml](#hpc-slurm6-tpuyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm6-tpu-maxtext.yaml](#hpc-slurm6-tpu-maxtextyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm6-apptainer.yaml](#hpc-slurm6-apptaineryaml--) ![community-badge] ![experimental-badge] - * [ml-slurm-v5-legacy.yaml](#ml-slurm-v5-legacyyaml-) ![core-badge] + * [ml-slurm-v5-legacy.yaml](#ml-slurm-v5-legacyyaml--) ![core-badge] ![deprecated-badge] * [ml-slurm.yaml](#ml-slurmyaml-) ![core-badge] - * [image-builder-v5-legacy.yaml](#image-builder-v5-legacyyaml-) 
![core-badge] + * [image-builder-v5-legacy.yaml](#image-builder-v5-legacyyaml--) ![core-badge] ![deprecated-badge] * [image-builder.yaml](#image-builderyaml--) ![core-badge] * [serverless-batch.yaml](#serverless-batchyaml-) ![core-badge] * [serverless-batch-mpi.yaml](#serverless-batch-mpiyaml-) ![core-badge] * [pfs-lustre.yaml](#pfs-lustreyaml-) ![core-badge] * [ps-slurm.yaml](#ps-slurmyaml--) ![core-badge] ![experimental-badge] * [pfs-parallelstore.yaml](#pfs-parallelstoreyaml--) ![core-badge] ![experimental-badge] - * [cae-slurm-v5-legacy.yaml](#cae-slurm-v5-legacyyaml-) ![core-badge] + * [cae-slurm-v5-legacy.yaml](#cae-slurm-v5-legacyyaml--) ![core-badge] ![deprecated-badge] * [cae-slurm.yaml](#cae-slurmyaml-) ![core-badge] * [hpc-build-slurm-image.yaml](#hpc-build-slurm-imageyaml--) ![community-badge] ![experimental-badge] - * [hpc-slurm-ubuntu2004-v5-legacy.yaml](#hpc-slurm-ubuntu2004-v5-legacyyaml-) ![community-badge] + * [hpc-slurm-ubuntu2004-v5-legacy.yaml](#hpc-slurm-ubuntu2004-v5-legacyyaml--) ![community-badge] ![deprecated-badge] * [hpc-slurm-ubuntu2004.yaml](#hpc-slurm-ubuntu2004yaml--) ![community-badge] * [pfs-daos.yaml](#pfs-daosyaml-) ![community-badge] * [hpc-slurm-daos.yaml](#hpc-slurm-daosyaml-) ![community-badge] - * [hpc-amd-slurm-v5-legacy.yaml](#hpc-amd-slurm-v5-legacyyaml-) ![community-badge] + * [hpc-amd-slurm-v5-legacy.yaml](#hpc-amd-slurm-v5-legacyyaml--) ![community-badge] ![deprecated-badge] * [hpc-amd-slurm.yaml](#hpc-amd-slurmyaml-) ![community-badge] * [hpc-slurm-sharedvpc.yaml](#hpc-slurm-sharedvpcyaml--) ![community-badge] ![experimental-badge] * [client-google-cloud-storage.yaml](#client-google-cloud-storageyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm-gromacs.yaml](#hpc-slurm-gromacsyaml--) ![community-badge] ![experimental-badge] - * [hpc-slurm-local-ssd-v5-legacy.yaml](#hpc-slurm-local-ssd-v5-legacyyaml--) ![community-badge] ![experimental-badge] + * [hpc-slurm-local-ssd-v5-legacy.yaml](#hpc-slurm-local-ssd-v5-legacyyaml---) ![community-badge] ![experimental-badge] ![deprecated-badge] * [hpc-slurm-local-ssd.yaml](#hpc-slurm-local-ssdyaml--) ![community-badge] ![experimental-badge] * [hcls-blueprint.yaml](#hcls-blueprintyaml-) ![core-badge] * [hpc-gke.yaml](#hpc-gkeyaml--) ![core-badge] ![experimental-badge] @@ -54,14 +54,14 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [storage-gke](#storage-gkeyaml--) ![core-badge] ![experimental-badge] * [gke-a3-megagpu](#gke-a3-megagpuyaml--) ![core-badge] ![experimental-badge] * [gke-a3-highgpu](#gke-a3-highgpuyaml--) ![core-badge] ![experimental-badge] - * [htc-slurm-v5-legacy.yaml](#htc-slurm-v5-legacyyaml--) ![community-badge] ![experimental-badge] + * [htc-slurm-v5-legacy.yaml](#htc-slurm-v5-legacyyaml---) ![community-badge] ![experimental-badge] ![deprecated-badge] * [htc-slurm.yaml](#htc-slurmyaml-) ![community-badge] * [htc-htcondor.yaml](#htc-htcondoryaml--) ![community-badge] ![experimental-badge] * [fsi-montecarlo-on-batch.yaml](#fsi-montecarlo-on-batchyaml-) ![community-badge] ![experimental-badge] * [tutorial-starccm-slurm.yaml](#tutorial-starccm-slurmyaml--) ![community-badge] ![experimental-badge] * [tutorial-starccm.yaml](#tutorial-starccmyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm-ramble-gromacs.yaml](#hpc-slurm-ramble-gromacsyaml--) ![community-badge] ![experimental-badge] - * [hpc-slurm-chromedesktop-v5-legacy.yaml](#hpc-slurm-chromedesktop-v5-legacyyaml--) ![community-badge] ![experimental-badge] + * 
[hpc-slurm-chromedesktop-v5-legacy.yaml](#hpc-slurm-chromedesktop-v5-legacyyaml---) ![community-badge] ![experimental-badge] ![deprecated-badge] * [flux-cluster](#flux-clusteryaml--) ![community-badge] ![experimental-badge] * [tutorial-fluent.yaml](#tutorial-fluentyaml--) ![community-badge] ![experimental-badge] * [omnia-cluster.yaml](#omnia-clusteryaml---) ![community-badge] ![experimental-badge] ![deprecated-badge] @@ -210,7 +210,7 @@ Toolkit team, partners, etc.) and are labeled with the community badge Blueprints that are still in development and less stable are also labeled with the experimental badge (![experimental-badge]). -### [hpc-slurm-v5-legacy.yaml] ![core-badge] +### [hpc-slurm-v5-legacy.yaml] ![core-badge] ![deprecated-badge] > **Warning**: The variables `enable_reconfigure`, > `enable_cleanup_compute`, and `enable_cleanup_subscriptions`, if set to @@ -319,7 +319,7 @@ For this example the following is needed in the selected region: * Compute Engine API: Resource policies: **one for each job in parallel** - _only needed for the `compute` partition_ -### [hpc-enterprise-slurm-v5-legacy.yaml] ![core-badge] +### [hpc-enterprise-slurm-v5-legacy.yaml] ![core-badge] ![deprecated-badge] This advanced blueprint creates a cluster with Slurm with several performance tunings enabled, along with tiered file systems for higher performance. Some of @@ -551,7 +551,7 @@ This blueprint creates a custom [Apptainer](https:https://apptainer.org) enabled [hpc-slurm6-apptainer.yaml]: ../community/examples/hpc-slurm6-apptainer.yaml -### [ml-slurm-v5-legacy.yaml] ![core-badge] +### [ml-slurm-v5-legacy.yaml] ![core-badge] ![deprecated-badge] This blueprint provisions an HPC cluster running the Slurm scheduler with the machine learning frameworks PyTorch and TensorFlow pre-installed on every @@ -649,7 +649,7 @@ timestamp for uniqueness. [ml-slurm.yaml]: ../examples/ml-slurm.yaml -### [image-builder-v5-legacy.yaml] ![core-badge] +### [image-builder-v5-legacy.yaml] ![core-badge] ![deprecated-badge] This blueprint uses the [Packer template module][pkr] to create a custom VM image and uses it to provision an HPC cluster using the Slurm scheduler. By @@ -1058,7 +1058,7 @@ For this example the following is needed in the selected region: [pfs-parallelstore.yaml]: ./pfs-parallelstore.yaml [Parallelstore]: ../modules/file-system/parallelstore/README.md -### [cae-slurm-v5-legacy.yaml] ![core-badge] +### [cae-slurm-v5-legacy.yaml] ![core-badge] ![deprecated-badge] The Computer Aided Engineering (CAE) blueprint captures a reference architecture where the right cloud components are assembled to optimally cater to the @@ -1143,7 +1143,7 @@ The blueprint contains 3 groups: [hpc-build-slurm-image.yaml]: ../community/examples/hpc-build-slurm-image.yaml -### [hpc-slurm-ubuntu2004-v5-legacy.yaml] ![community-badge] +### [hpc-slurm-ubuntu2004-v5-legacy.yaml] ![community-badge] ![deprecated-badge] > **Warning**: The variables `enable_reconfigure`, > `enable_cleanup_compute`, and `enable_cleanup_subscriptions`, if set to @@ -1230,7 +1230,7 @@ examples][intel-examples-readme]. [hpc-slurm-daos.yaml]: ../community/examples/intel/hpc-slurm-daos.yaml -### [hpc-amd-slurm-v5-legacy.yaml] ![community-badge] +### [hpc-amd-slurm-v5-legacy.yaml] ![community-badge] ![deprecated-badge] This example provisions a Slurm cluster using AMD VM machine types. It automates the initial setup of Spack, including a script that can be used to @@ -1398,7 +1398,7 @@ the nodes are provisioned. All nodes mount a filestore instance on `/home`. 
[omnia-github]: https://github.com/dellhpc/omnia [omnia-cluster.yaml]: ../community/examples/omnia-cluster.yaml -### [hpc-slurm-local-ssd-v5-legacy.yaml] ![community-badge] ![experimental-badge] +### [hpc-slurm-local-ssd-v5-legacy.yaml] ![community-badge] ![experimental-badge] ![deprecated-badge] This blueprint demonstrates the use of Slurm and Filestore, with the definition of a partition which deploys compute nodes that have local ssd drives deployed. @@ -1594,7 +1594,7 @@ walks through the use of this blueprint. [htc-htcondor.yaml]: ../community/examples/htc-htcondor.yaml [hpcvmimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm -### [htc-slurm-v5-legacy.yaml] ![community-badge] ![experimental-badge] +### [htc-slurm-v5-legacy.yaml] ![community-badge] ![experimental-badge] ![deprecated-badge] This blueprint provisions a cluster using the Slurm scheduler in a configuration tuned for the execution of many short-duration, loosely-coupled (non-MPI) jobs. @@ -1659,7 +1659,7 @@ tutorial. [tutorial-fluent.yaml]: ../community/examples/tutorial-fluent.yaml -### [hpc-slurm-chromedesktop-v5-legacy.yaml] ![community-badge] ![experimental-badge] +### [hpc-slurm-chromedesktop-v5-legacy.yaml] ![community-badge] ![experimental-badge] ![deprecated-badge] This example shows how to use the `chrome-remote-desktop` module with a Slurm partition to be able to `salloc` a GPU accelerated remote desktop. diff --git a/modules/README.md b/modules/README.md index defba11446..c562c111cd 100644 --- a/modules/README.md +++ b/modules/README.md @@ -35,17 +35,17 @@ Modules that are still in development and less stable are labeled with the ### Compute * **[vm-instance]** ![core-badge] : Creates one or more VM instances. -* **[schedmd-slurm-gcp-v5-partition]** ![community-badge] : +* **[schedmd-slurm-gcp-v5-partition]** ![community-badge] ![deprecated-badge] : Creates a partition to be used by a [slurm-controller][schedmd-slurm-gcp-v5-controller]. -* **[schedmd-slurm-gcp-v5-node-group]** ![community-badge] : +* **[schedmd-slurm-gcp-v5-node-group]** ![community-badge] ![deprecated-badge]: Creates a node group to be used by the [schedmd-slurm-gcp-v5-partition] module. -* **[schedmd-slurm-gcp-v6-partition]** ![community-badge] ![experimental-badge]: +* **[schedmd-slurm-gcp-v6-partition]** ![core-badge] : Creates a partition to be used by a [slurm-controller][schedmd-slurm-gcp-v6-controller]. -* **[schedmd-slurm-gcp-v6-nodeset]** ![community-badge] ![experimental-badge]: +* **[schedmd-slurm-gcp-v6-nodeset]** ![core-badge] : Creates a nodeset to be used by the [schedmd-slurm-gcp-v6-partition] module. -* **[schedmd-slurm-gcp-v6-nodeset-tpu]** ![community-badge] ![experimental-badge]: +* **[schedmd-slurm-gcp-v6-nodeset-tpu]** ![core-badge] : Creates a TPU nodeset to be used by the [schedmd-slurm-gcp-v6-partition] module. -* **[schedmd-slurm-gcp-v6-nodeset-dynamic]** ![community-badge] ![experimental-badge]: +* **[schedmd-slurm-gcp-v6-nodeset-dynamic]** ![core-badge] ![experimental-badge]: Creates a dynamic nodeset to be used by the [schedmd-slurm-gcp-v6-partition] module and instance template. * **[gke-node-pool]** ![core-badge] ![experimental-badge] : Creates a Kubernetes node pool using GKE. @@ -194,15 +194,15 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca * **[gke-cluster]** ![core-badge] ![experimental-badge] : Creates a Kubernetes cluster using GKE. * **[pre-existing-gke-cluster]** ![core-badge] ![experimental-badge] : Retrieves an existing GKE cluster. 
Substitute for ([gke-cluster]) module. -* **[schedmd-slurm-gcp-v5-controller]** ![community-badge] : +* **[schedmd-slurm-gcp-v5-controller]** ![community-badge] ![deprecated-badge] : Creates a Slurm controller node using [slurm-gcp-version-5]. -* **[schedmd-slurm-gcp-v5-login]** ![community-badge] : +* **[schedmd-slurm-gcp-v5-login]** ![community-badge] ![deprecated-badge] : Creates a Slurm login node using [slurm-gcp-version-5]. -* **[schedmd-slurm-gcp-v5-hybrid]** ![community-badge] ![experimental-badge] : +* **[schedmd-slurm-gcp-v5-hybrid]** ![community-badge] ![experimental-badge] ![deprecated-badge] : Creates hybrid Slurm partition configuration files using [slurm-gcp-version-5]. -* **[schedmd-slurm-gcp-v6-controller]** ![community-badge] ![experimental-badge]: +* **[schedmd-slurm-gcp-v6-controller]** ![core-badge] : Creates a Slurm controller node using [slurm-gcp-version-6]. -* **[schedmd-slurm-gcp-v6-login]** ![community-badge] ![experimental-badge]: +* **[schedmd-slurm-gcp-v6-login]** ![core-badge] : Creates a Slurm login node using [slurm-gcp-version-6]. * **[htcondor-setup]** ![community-badge] ![experimental-badge] : Creates the base infrastructure for an HTCondor pool (service accounts and Cloud Storage bucket). From 7f21690841f3247d2711e3c9df4f7cbbcc6c2775 Mon Sep 17 00:00:00 2001 From: Akiki Liang Date: Thu, 10 Oct 2024 18:39:29 +0000 Subject: [PATCH 081/102] Update a3-high NeMo version 23.11 to 24.07 --- .../machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile | 4 ++-- .../machine-learning/a3-highgpu-8g/nemo-framework/README.md | 4 ++-- .../a3-highgpu-8g/nemo-framework/setup_nemo.sh | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile b/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile index 3f6196a45e..c693c2d7cb 100644 --- a/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile +++ b/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG NEMOFW_VERSION=23.11 -FROM nvcr.io/nvidia/nemo:${NEMOFW_VERSION}.framework +ARG NEMOFW_VERSION=24.07 +FROM nvcr.io/nvidia/nemo:${NEMOFW_VERSION} ENV USE_TCPX=yes ENV NCCL_NET=GPUDirectTCPX_v7 diff --git a/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md b/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md index 37ecdd9600..4440dff882 100644 --- a/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md +++ b/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md @@ -3,7 +3,7 @@ README 1. Set up NeMo Framework Container - This makes a few environment variable modifications to the [nvcr.io/nvidia/nemo:23.11.framework](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) + This makes a few environment variable modifications to the [nvcr.io/nvidia/nemo:24.07](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) container, and submits a Slurm job to copy the framework launcher scripts and a few other auxiliary files into your working directory. 
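As a rough sketch of that setup step (assuming `setup_nemo.sh` is submitted with `sbatch` from the working directory on the cluster; names follow the script and Dockerfile in this patch):

```bash
# Sketch only: submit the build/import job defined in setup_nemo.sh and wait
# for it to finish. The job builds the NeMo container, imports it with enroot,
# and copies the launcher files into the submission directory.
sbatch setup_nemo.sh
squeue --me   # wait until the job has completed

# Artifacts expected in the working directory afterwards:
#   nemofw+tcpx-24.07.sqsh                 - enroot image imported from the local build
#   launcher_scripts/  auto_configurator/  - copied out of the container
#   requirements.txt
ls nemofw+tcpx-24.07.sqsh requirements.txt launcher_scripts auto_configurator
```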
@@ -45,7 +45,7 @@ README launcher_scripts_path=${PWD} \ stages=[training] \ env_vars.TRANSFORMERS_OFFLINE=0 \ - container=../nemofw+tcpx-23.11.sqsh \ + container=../nemofw+tcpx-24.07.sqsh \ container_mounts=[${HOME}/.cache,"/var/lib/tcpx/lib64","/run/tcpx-\${SLURM_JOB_ID}:/run/tcpx"] \ cluster.srun_args=["--container-writable"] \ training.model.data.data_impl=mock \ diff --git a/examples/machine-learning/a3-highgpu-8g/nemo-framework/setup_nemo.sh b/examples/machine-learning/a3-highgpu-8g/nemo-framework/setup_nemo.sh index 008c0f21c2..5692b0342b 100644 --- a/examples/machine-learning/a3-highgpu-8g/nemo-framework/setup_nemo.sh +++ b/examples/machine-learning/a3-highgpu-8g/nemo-framework/setup_nemo.sh @@ -18,7 +18,7 @@ #SBATCH --partition=a3 #SBATCH --exclusive -: "${NEMOFW_VERSION:=23.11}" +: "${NEMOFW_VERSION:=24.07}" srun docker build --build-arg="NEMOFW_VERSION=${NEMOFW_VERSION}" -t nemofw:tcpx-"${NEMOFW_VERSION}" . srun rm -f nemofw+tcpx-"${NEMOFW_VERSION}".sqsh @@ -27,4 +27,4 @@ srun enroot import dockerd://nemofw:tcpx-"${NEMOFW_VERSION}" srun \ --container-mounts="${PWD}":/workspace/mount_dir,/var/tmp:/var/tmp \ --container-image=./nemofw+tcpx-"${NEMOFW_VERSION}".sqsh \ - bash -c "cp -r /opt/NeMo-Megatron-Launcher/requirements.txt /opt/NeMo-Megatron-Launcher/launcher_scripts /opt/NeMo-Megatron-Launcher/auto_configurator /workspace/mount_dir/" + bash -c "cp -r /opt/NeMo-Framework-Launcher/requirements.txt /opt/NeMo-Framework-Launcher/launcher_scripts /opt/NeMo-Framework-Launcher/auto_configurator /workspace/mount_dir/" From b2c0de6e3af73ab82cd15c7e67bff9bc690ba73e Mon Sep 17 00:00:00 2001 From: Akiki Liang Date: Mon, 14 Oct 2024 19:25:20 +0000 Subject: [PATCH 082/102] update container_mounts to work with hydra quote rules --- .../machine-learning/a3-highgpu-8g/nemo-framework/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md b/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md index 4440dff882..9eb9252106 100644 --- a/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md +++ b/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md @@ -46,7 +46,7 @@ README stages=[training] \ env_vars.TRANSFORMERS_OFFLINE=0 \ container=../nemofw+tcpx-24.07.sqsh \ - container_mounts=[${HOME}/.cache,"/var/lib/tcpx/lib64","/run/tcpx-\${SLURM_JOB_ID}:/run/tcpx"] \ + container_mounts='['${HOME}/.cache',"/var/lib/tcpx/lib64","/run/tcpx-\${SLURM_JOB_ID}:/run/tcpx"]' \ cluster.srun_args=["--container-writable"] \ training.model.data.data_impl=mock \ training.model.data.data_prefix=[] \ From 64646bcd885a659de2db4d2bb02c40fbdd654130 Mon Sep 17 00:00:00 2001 From: Akiki Liang Date: Mon, 14 Oct 2024 19:28:38 +0000 Subject: [PATCH 083/102] update a3-high scripts with latest recommended values --- .../a3-highgpu-8g/nccl-tests/run-nccl-tests.sh | 8 +++----- .../nccl-tests/run-topological-nccl-tests.sh | 8 +++----- .../a3-highgpu-8g/nemo-framework/Dockerfile | 9 ++++----- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/examples/machine-learning/a3-highgpu-8g/nccl-tests/run-nccl-tests.sh b/examples/machine-learning/a3-highgpu-8g/nccl-tests/run-nccl-tests.sh index cbc80a3763..988dc5df3e 100644 --- a/examples/machine-learning/a3-highgpu-8g/nccl-tests/run-nccl-tests.sh +++ b/examples/machine-learning/a3-highgpu-8g/nccl-tests/run-nccl-tests.sh @@ -51,8 +51,6 @@ if [[ ${USE_TCPX} = "yes" ]]; then export NCCL_PROTO=Simple export NCCL_NSOCKS_PERTHREAD=4 export NCCL_SOCKET_NTHREADS=1 - 
export NCCL_MAX_NCHANNELS=12 - export NCCL_MIN_NCHANNELS=12 export NCCL_DYNAMIC_CHUNK_SIZE=524288 export NCCL_P2P_NET_CHUNKSIZE=524288 export NCCL_P2P_PCI_CHUNKSIZE=524288 @@ -62,9 +60,9 @@ if [[ ${USE_TCPX} = "yes" ]]; then export NCCL_NET_GDR_LEVEL=PIX export NCCL_P2P_PXN_LEVEL=0 export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=${UDS_PATH} - export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=1000000 - export NCCL_GPUDIRECTTCPX_FORCE_ACK=0 - export NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000 + export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 + export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177" + export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191" export LD_LIBRARY_PATH=/var/lib/tcpx/lib64:${LD_LIBRARY_PATH} else diff --git a/examples/machine-learning/a3-highgpu-8g/nccl-tests/run-topological-nccl-tests.sh b/examples/machine-learning/a3-highgpu-8g/nccl-tests/run-topological-nccl-tests.sh index 4177a3e184..d42cda9404 100644 --- a/examples/machine-learning/a3-highgpu-8g/nccl-tests/run-topological-nccl-tests.sh +++ b/examples/machine-learning/a3-highgpu-8g/nccl-tests/run-topological-nccl-tests.sh @@ -52,8 +52,6 @@ if [[ ${USE_TCPX} = "yes" ]]; then export NCCL_PROTO=Simple export NCCL_NSOCKS_PERTHREAD=4 export NCCL_SOCKET_NTHREADS=1 - export NCCL_MAX_NCHANNELS=12 - export NCCL_MIN_NCHANNELS=12 export NCCL_DYNAMIC_CHUNK_SIZE=524288 export NCCL_P2P_NET_CHUNKSIZE=524288 export NCCL_P2P_PCI_CHUNKSIZE=524288 @@ -63,9 +61,9 @@ if [[ ${USE_TCPX} = "yes" ]]; then export NCCL_NET_GDR_LEVEL=PIX export NCCL_P2P_PXN_LEVEL=0 export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=${UDS_PATH} - export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=1000000 - export NCCL_GPUDIRECTTCPX_FORCE_ACK=0 - export NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000 + export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 + export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177" + export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191" export LD_LIBRARY_PATH=/var/lib/tcpx/lib64:${LD_LIBRARY_PATH} else diff --git a/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile b/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile index c693c2d7cb..a4264709bc 100644 --- a/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile +++ b/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile @@ -25,8 +25,6 @@ ENV NCCL_ALGO=Ring ENV NCCL_PROTO=Simple ENV NCCL_NSOCKS_PERTHREAD=4 ENV NCCL_SOCKET_NTHREADS=1 -ENV NCCL_MAX_NCHANNELS=12 -ENV NCCL_MIN_NCHANNELS=12 ENV NCCL_DYNAMIC_CHUNK_SIZE=524288 ENV NCCL_P2P_NET_CHUNKSIZE=524288 ENV NCCL_P2P_PCI_CHUNKSIZE=524288 @@ -36,9 +34,10 @@ ENV CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ENV NCCL_NET_GDR_LEVEL=PIX ENV NCCL_P2P_PXN_LEVEL=0 ENV NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=/run/tcpx -ENV NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=1000000 -ENV NCCL_GPUDIRECTTCPX_FORCE_ACK=0 -ENV NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000 +ENV NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 +ENV NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177" +ENV 
NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191" + RUN echo "/var/lib/tcpx/lib64" >> /etc/ld.so.conf.d/tcpx.conf && ldconfig ENV LD_LIBRARY_PATH=/var/lib/tcpx/lib64:$LD_LIBRARY_PATH From d32ca70d4ac176925b7f6642e343dcf411c386d0 Mon Sep 17 00:00:00 2001 From: Akiki Liang Date: Mon, 14 Oct 2024 19:30:00 +0000 Subject: [PATCH 084/102] mount /var/tmp for build nccl tests --- .../a3-highgpu-8g/nccl-tests/build-nccl-tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/machine-learning/a3-highgpu-8g/nccl-tests/build-nccl-tests.sh b/examples/machine-learning/a3-highgpu-8g/nccl-tests/build-nccl-tests.sh index 6d669de4a9..11f18eee21 100644 --- a/examples/machine-learning/a3-highgpu-8g/nccl-tests/build-nccl-tests.sh +++ b/examples/machine-learning/a3-highgpu-8g/nccl-tests/build-nccl-tests.sh @@ -25,7 +25,7 @@ set -x CONTAINER_IMAGE=./nvidia+pytorch+23.10-py3.sqsh # Install nccl-tests using openmpi from within pytorch container -srun --container-mounts="$PWD:/nccl" \ +srun --container-mounts="$PWD:/nccl,/var/tmp:/var/tmp" \ --container-image=${CONTAINER_IMAGE} \ --container-name="nccl" \ bash -c " From 6f2bc8ab63ee3f3c76f44eb10952678a45be883f Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Fri, 11 Oct 2024 18:04:24 +0000 Subject: [PATCH 085/102] Add mount parallelstore service to mount parallelstore for every reboot --- .../parallelstore/scripts/mount-daos.sh | 25 ++++++++++++++++++- .../scripts/mount-daos.sh | 25 ++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/modules/file-system/parallelstore/scripts/mount-daos.sh b/modules/file-system/parallelstore/scripts/mount-daos.sh index 2b09f2e6d4..bb64c9a4d3 100644 --- a/modules/file-system/parallelstore/scripts/mount-daos.sh +++ b/modules/file-system/parallelstore/scripts/mount-daos.sh @@ -48,6 +48,7 @@ if { [ "${OS_ID}" = "rocky" ] || [ "${OS_ID}" = "rhel" ]; } && { [ "${OS_VERSION mkdir -p /var/log/daos_agent chown daos_agent:daos_agent /var/log/daos_agent sed -i "s/#.*log_file:.*/log_file: \/var\/log\/daos_agent\/daos_agent.log/g" $daos_config + systemctl enable daos_agent.service systemctl start daos_agent.service elif { [ "${OS_ID}" = "ubuntu" ] && [ "${OS_VERSION}" = "22.04" ]; } || { [ "${OS_ID}" = "debian" ] && [ "${OS_VERSION_MAJOR}" = "12" ]; }; then mkdir -p /var/run/daos_agent @@ -73,7 +74,7 @@ for i in {1..10}; do # shellcheck disable=SC2086 dfuse -m "$local_mount" --pool default-pool --container default-container --multi-user $mount_options && break - echo "dfuse failed, retrying in 1 seconds (attempt $i/5)..." + echo "dfuse failed, retrying in 1 seconds (attempt $i/10)..." sleep 1 done @@ -81,4 +82,26 @@ if ! 
mountpoint -q "$local_mount"; then exit 1 fi +# Store the mounting logic in a variable +mount_command='for i in {1..10}; do /bin/dfuse -m '$local_mount' --pool default-pool --container default-container --multi-user '$mount_options' --foreground && break; echo \"dfuse, failed, retrying in 1 second (attempt '$i'/10)\"; sleep 1; done' + +# --- Begin: Add systemd service creation --- +cat >/usr/lib/systemd/system/mount_parallelstore.service </usr/lib/systemd/system/mount_parallelstore.service < Date: Mon, 14 Oct 2024 23:35:50 +0000 Subject: [PATCH 086/102] Create and use non-default service accounts in GKE --- examples/gke-a3-highgpu.yaml | 16 ++++- examples/gke-a3-megagpu.yaml | 16 ++++- examples/hpc-gke.yaml | 16 ++++- examples/ml-gke.yaml | 17 +++++- examples/storage-gke.yaml | 30 +++++++++- modules/compute/gke-node-pool/README.md | 7 --- modules/compute/gke-node-pool/main.tf | 45 -------------- modules/scheduler/gke-cluster/README.md | 8 +-- modules/scheduler/gke-cluster/main.tf | 48 ++------------- .../daily-tests/blueprints/ml-gke-e2e.yaml | 58 +++++++++++++++++-- .../daily-tests/builds/gke-a3-megagpu.yaml | 1 + .../daily-tests/builds/gke-storage.yaml | 1 + tools/cloud-build/daily-tests/builds/gke.yaml | 1 + .../daily-tests/builds/ml-gke-e2e.yaml | 1 + .../daily-tests/builds/ml-gke.yaml | 1 + 15 files changed, 148 insertions(+), 118 deletions(-) diff --git a/examples/gke-a3-highgpu.yaml b/examples/gke-a3-highgpu.yaml index 44f5a8ff33..f7f4018b0d 100644 --- a/examples/gke-a3-highgpu.yaml +++ b/examples/gke-a3-highgpu.yaml @@ -40,6 +40,18 @@ deployment_groups: - range_name: services ip_cidr_range: 10.0.32.0/20 + - id: gke_service_account + source: community/modules/project/service-account + settings: + name: gke-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: gpunets source: modules/network/multivpc settings: @@ -50,7 +62,7 @@ deployment_groups: - id: gke_cluster source: modules/scheduler/gke-cluster - use: [network1, gpunets] + use: [network1, gpunets, gke_service_account] settings: enable_private_endpoint: false # Allows for access from authorized public IPs master_authorized_networks: @@ -60,7 +72,7 @@ deployment_groups: - id: a3_highgpu_pool source: modules/compute/gke-node-pool - use: [gke_cluster, gpunets] + use: [gke_cluster, gpunets, gke_service_account] settings: machine_type: a3-highgpu-8g autoscaling_total_min_nodes: 2 diff --git a/examples/gke-a3-megagpu.yaml b/examples/gke-a3-megagpu.yaml index 56ea759b5d..30edb3974c 100644 --- a/examples/gke-a3-megagpu.yaml +++ b/examples/gke-a3-megagpu.yaml @@ -40,6 +40,18 @@ deployment_groups: - range_name: services ip_cidr_range: 10.0.32.0/20 + - id: gke_service_account + source: community/modules/project/service-account + settings: + name: gke-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: gpunets source: modules/network/multivpc settings: @@ -50,7 +62,7 @@ deployment_groups: - id: gke_cluster source: modules/scheduler/gke-cluster - use: [network1, gpunets] + use: [network1, gpunets, gke_service_account] settings: enable_private_endpoint: false # Allows for access from authorized public IPs master_authorized_networks: @@ -60,7 +72,7 @@ deployment_groups: - id: a3_megagpu_pool source: modules/compute/gke-node-pool - use: [gke_cluster, gpunets] + use: 
[gke_cluster, gpunets, gke_service_account] settings: machine_type: a3-megagpu-8g autoscaling_total_min_nodes: 2 diff --git a/examples/hpc-gke.yaml b/examples/hpc-gke.yaml index dccdee033b..f927fd8169 100644 --- a/examples/hpc-gke.yaml +++ b/examples/hpc-gke.yaml @@ -35,16 +35,28 @@ deployment_groups: - range_name: services ip_cidr_range: 10.0.32.0/20 + - id: gke_service_account + source: community/modules/project/service-account + settings: + name: gke-service-account + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: gke_cluster source: modules/scheduler/gke-cluster - use: [network1] + use: [network1, gke_service_account] settings: enable_private_endpoint: false # Allows for access from authorized public IPs outputs: [instructions] - id: compute_pool source: modules/compute/gke-node-pool - use: [gke_cluster] + use: [gke_cluster, gke_service_account] - id: job-template source: modules/compute/gke-job-template diff --git a/examples/ml-gke.yaml b/examples/ml-gke.yaml index 5aedd354fb..cbce0a6c1a 100644 --- a/examples/ml-gke.yaml +++ b/examples/ml-gke.yaml @@ -40,19 +40,32 @@ deployment_groups: - range_name: services ip_cidr_range: 10.0.32.0/20 + - id: gke_service_account + source: community/modules/project/service-account + settings: + name: gke-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: gke_cluster source: modules/scheduler/gke-cluster - use: [network1] + use: [network1, gke_service_account] settings: enable_private_endpoint: false # Allows for access from authorized public IPs master_authorized_networks: - display_name: deployment-machine cidr_block: $(vars.authorized_cidr) + configure_workload_identity_sa: true outputs: [instructions] - id: g2_pool source: modules/compute/gke-node-pool - use: [gke_cluster] + use: [gke_cluster, gke_service_account] settings: disk_type: pd-balanced machine_type: g2-standard-4 diff --git a/examples/storage-gke.yaml b/examples/storage-gke.yaml index cd46c2d9c3..00c3d60290 100644 --- a/examples/storage-gke.yaml +++ b/examples/storage-gke.yaml @@ -38,9 +38,33 @@ deployment_groups: - range_name: services ip_cidr_range: 10.0.32.0/20 + - id: gke_service_account + source: community/modules/project/service-account + settings: + name: gke-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + + - id: local_ssd_pool_service_account + source: community/modules/project/service-account + settings: + name: ssd-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: gke_cluster source: modules/scheduler/gke-cluster - use: [network1] + use: [network1, gke_service_account] settings: enable_filestore_csi: true enable_gcsfuse_csi: true @@ -53,7 +77,7 @@ deployment_groups: - id: debug_pool source: modules/compute/gke-node-pool - use: [gke_cluster] + use: [gke_cluster, gke_service_account] settings: name: debug zones: [$(vars.zone)] @@ -118,7 +142,7 @@ deployment_groups: - id: local-ssd-pool source: modules/compute/gke-node-pool - use: [gke_cluster] + use: [gke_cluster, local_ssd_pool_service_account] settings: name: local-ssd 
machine_type: n2d-standard-2 diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index d15f644b80..880e1834e4 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -268,16 +268,9 @@ limitations under the License. | Name | Type | |------|------| | [google-beta_google_container_node_pool.node_pool](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_container_node_pool) | resource | -| [google_project_iam_member.node_service_account_artifact_registry](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_gcr](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_log_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_metric_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_monitoring_viewer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_resource_metadata_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | | [null_resource.enable_tcpx_in_workload](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [null_resource.enable_tcpxo_in_workload](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [null_resource.install_dependencies](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [google_compute_default_service_account.default_sa](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | | [google_compute_reservation.specific_reservations](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_reservation) | data source | ## Inputs diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 6e16771a3a..d14801ccf3 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -20,8 +20,6 @@ locals { } locals { - sa_email = var.service_account_email != null ? var.service_account_email : data.google_compute_default_service_account.default_sa.email - preattached_gpu_machine_family = contains(["a2", "a3", "g2"], local.machine_family) has_gpu = (local.guest_accelerator != null && length(local.guest_accelerator) > 0) || local.preattached_gpu_machine_family gpu_taint = local.has_gpu ? 
[{ @@ -37,10 +35,6 @@ locals { module_unique_id = replace(lower(var.internal_ghpc_module_id), "/[^a-z0-9\\-]/", "") } -data "google_compute_default_service_account" "default_sa" { - project = var.project_id -} - resource "google_container_node_pool" "node_pool" { provider = google-beta @@ -239,45 +233,6 @@ resource "google_container_node_pool" "node_pool" { } } -# For container logs to show up under Cloud Logging and GKE metrics to show up -# on Cloud Monitoring console, some project level roles are needed for the -# node_service_account -resource "google_project_iam_member" "node_service_account_log_writer" { - project = var.project_id - role = "roles/logging.logWriter" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_metric_writer" { - project = var.project_id - role = "roles/monitoring.metricWriter" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_monitoring_viewer" { - project = var.project_id - role = "roles/monitoring.viewer" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_resource_metadata_writer" { - project = var.project_id - role = "roles/stackdriver.resourceMetadata.writer" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_gcr" { - project = var.project_id - role = "roles/storage.objectViewer" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_artifact_registry" { - project = var.project_id - role = "roles/artifactregistry.reader" - member = "serviceAccount:${local.sa_email}" -} - resource "null_resource" "install_dependencies" { provisioner "local-exec" { command = "pip3 install pyyaml argparse" diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 583af203da..2a96a90c5d 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -131,14 +131,8 @@ limitations under the License. 
|------|------| | [google-beta_google_container_cluster.gke_cluster](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_container_cluster) | resource | | [google-beta_google_container_node_pool.system_node_pools](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_container_node_pool) | resource | -| [google_project_iam_member.node_service_account_artifact_registry](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_gcr](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_log_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_metric_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_monitoring_viewer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_resource_metadata_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | | [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | -| [google_compute_default_service_account.default_sa](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | +| [google_project.project](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/project) | data source | ## Inputs diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 480d5b7d58..ac39cebe73 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -29,7 +29,8 @@ locals { security_group = var.authenticator_security_group }] - sa_email = var.service_account_email != null ? 
var.service_account_email : data.google_compute_default_service_account.default_sa.email + default_sa_email = "${data.google_project.project.number}-compute@developer.gserviceaccount.com" + sa_email = coalesce(var.service_account_email, local.default_sa_email) # additional VPCs enable multi networking derived_enable_multi_networking = coalesce(var.enable_multi_networking, length(var.additional_networks) > 0) @@ -38,8 +39,8 @@ locals { derived_enable_dataplane_v2 = coalesce(var.enable_dataplane_v2, local.derived_enable_multi_networking) } -data "google_compute_default_service_account" "default_sa" { - project = var.project_id +data "google_project" "project" { + project_id = var.project_id } resource "google_container_cluster" "gke_cluster" { @@ -267,45 +268,6 @@ resource "google_container_node_pool" "system_node_pools" { } } -# For container logs to show up under Cloud Logging and GKE metrics to show up -# on Cloud Monitoring console, some project level roles are needed for the -# node_service_account -resource "google_project_iam_member" "node_service_account_log_writer" { - project = var.project_id - role = "roles/logging.logWriter" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_metric_writer" { - project = var.project_id - role = "roles/monitoring.metricWriter" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_monitoring_viewer" { - project = var.project_id - role = "roles/monitoring.viewer" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_resource_metadata_writer" { - project = var.project_id - role = "roles/stackdriver.resourceMetadata.writer" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_gcr" { - project = var.project_id - role = "roles/storage.objectViewer" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_artifact_registry" { - project = var.project_id - role = "roles/artifactregistry.reader" - member = "serviceAccount:${local.sa_email}" -} - data "google_client_config" "default" {} provider "kubernetes" { @@ -327,7 +289,7 @@ module "workload_identity" { # https://github.com/terraform-google-modules/terraform-google-kubernetes-engine/issues/1059 depends_on = [ - data.google_compute_default_service_account.default_sa, + data.google_project.project, google_container_cluster.gke_cluster ] } diff --git a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml index 6e64a667a1..20f5ff19f5 100644 --- a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml +++ b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml @@ -40,9 +40,21 @@ deployment_groups: - range_name: services ip_cidr_range: 10.0.32.0/20 + - id: gke_service_account + source: community/modules/project/service-account + settings: + name: gke-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: gke_cluster source: modules/scheduler/gke-cluster - use: [network1] + use: [network1, gke_service_account] settings: enable_private_endpoint: false # Allows for access from authorized public IPs master_authorized_networks: @@ -52,7 +64,7 @@ deployment_groups: - id: g2_latest_driver source: modules/compute/gke-node-pool - use: 
[gke_cluster] + use: [gke_cluster, gke_service_account] settings: name: g2-latest-driver machine_type: g2-standard-4 @@ -80,9 +92,21 @@ deployment_groups: ] outputs: [instructions] + - id: n1_service_account + source: community/modules/project/service-account + settings: + name: n1-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: n1_pool_default source: modules/compute/gke-node-pool - use: [gke_cluster] + use: [gke_cluster, n1_service_account] settings: name: n1-pool-default disk_type: pd-balanced @@ -108,9 +132,21 @@ deployment_groups: ] outputs: [instructions] + - id: n1_full_service_account + source: community/modules/project/service-account + settings: + name: n1-full-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: n1_pool_full_spec source: modules/compute/gke-node-pool - use: [gke_cluster] + use: [gke_cluster, n1_full_service_account] settings: name: n1-pool-full-spec disk_type: pd-balanced @@ -141,9 +177,21 @@ deployment_groups: ] outputs: [instructions] + - id: default_settings_service_account + source: community/modules/project/service-account + settings: + name: ds-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: default_settings_pool source: modules/compute/gke-node-pool - use: [gke_cluster] + use: [gke_cluster, default_settings_service_account] settings: name: default-settings-pool diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml index 118704e7ea..05c5ce1097 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml @@ -16,6 +16,7 @@ tags: - m.gke-cluster - m.gke-node-pool +- m.service-account - m.vpc - m.multivpc - m.kubectl-apply diff --git a/tools/cloud-build/daily-tests/builds/gke-storage.yaml b/tools/cloud-build/daily-tests/builds/gke-storage.yaml index 16d8b92587..1e4a11998a 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-storage.yaml @@ -17,6 +17,7 @@ tags: - m.cloud-storage-bucket - m.filestore - m.gke-cluster +- m.service-account - m.gke-job-template - m.gke-node-pool - m.gke-persistent-volume diff --git a/tools/cloud-build/daily-tests/builds/gke.yaml b/tools/cloud-build/daily-tests/builds/gke.yaml index 709a2b5c1b..b73409a94f 100644 --- a/tools/cloud-build/daily-tests/builds/gke.yaml +++ b/tools/cloud-build/daily-tests/builds/gke.yaml @@ -17,6 +17,7 @@ tags: - m.gke-cluster - m.gke-job-template - m.gke-node-pool +- m.service-account - m.vpc - gke diff --git a/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml index 4b04ceb7d0..caeeee66fa 100644 --- a/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml @@ -17,6 +17,7 @@ tags: - m.gke-cluster - m.gke-job-template - m.gke-node-pool +- m.service-account - m.vpc - gke diff --git a/tools/cloud-build/daily-tests/builds/ml-gke.yaml b/tools/cloud-build/daily-tests/builds/ml-gke.yaml index c9ae96850f..a3b83c6fa8 100644 --- a/tools/cloud-build/daily-tests/builds/ml-gke.yaml 
+++ b/tools/cloud-build/daily-tests/builds/ml-gke.yaml @@ -17,6 +17,7 @@ tags: - m.gke-cluster - m.gke-job-template - m.gke-node-pool +- m.service-account - m.vpc - gke From 69d67465825647e88b0fc96b9ee4030e314f1166 Mon Sep 17 00:00:00 2001 From: ighosh98 Date: Tue, 15 Oct 2024 07:25:09 +0000 Subject: [PATCH 087/102] GKE A3 high integration test --- .../daily-tests/builds/gke-a3-highgpu.yaml | 66 +++++++++++++++++++ .../daily-tests/tests/gke-a3-highgpu.yml | 43 ++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml create mode 100644 tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml new file mode 100644 index 0000000000..2ad20f6b8d --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml @@ -0,0 +1,66 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +tags: +- m.gke-cluster +- m.gke-node-pool +- m.vpc +- m.multivpc +- m.service-account +- m.kubectl-apply +- gke + +timeout: 14400s # 4hr +steps: +- id: gke-a3-highgpu + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + EXAMPLE_BP=examples/gke-a3-highgpu.yaml + + # Replacing the static subnet name to prevent collisions + sed -i "s/gke-subnet-a3-highgpu/gke-subnet-a3-highgpu-$${BUILD_ID_SHORT}/" $${EXAMPLE_BP} + + # adding vm to act as remote node + echo ' - id: remote-node' >> $${EXAMPLE_BP} + echo ' source: modules/compute/vm-instance' >> $${EXAMPLE_BP} + echo ' use: [network1]' >> $${EXAMPLE_BP} + echo ' settings:' >> $${EXAMPLE_BP} + echo ' machine_type: e2-standard-2' >> $${EXAMPLE_BP} + echo ' name_prefix: remote-node' >> $${EXAMPLE_BP} + echo ' add_deployment_name_before_prefix: true' >> $${EXAMPLE_BP} + echo '' + echo ' - id: job_template_hostname' >> $${EXAMPLE_BP} + echo ' source: modules/compute/gke-job-template' >> $${EXAMPLE_BP} + echo ' use: [a3_highgpu_pool]' >> $${EXAMPLE_BP} + echo ' settings:' >> $${EXAMPLE_BP} + echo ' image: nvidia/cuda:11.0.3-runtime-ubuntu20.04' >> $${EXAMPLE_BP} + echo ' command:' >> $${EXAMPLE_BP} + echo ' - nvidia-smi' >> $${EXAMPLE_BP} + echo ' node_count: 1' >> $${EXAMPLE_BP} + echo ' outputs: [instructions]' >> $${EXAMPLE_BP} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml" diff --git a/tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml b/tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml new file mode 100644 index 0000000000..26b894a6fe --- /dev/null 
+++ b/tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml @@ -0,0 +1,43 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +# region, zone must be defined +# in build file with --extra-vars flag! +test_name: gke-a3high +deployment_name: gke-a3high-{{ build }} +workspace: /workspace +blueprint_yaml: "{{ workspace }}/examples/gke-a3-highgpu.yaml" +network: "gke-a3high-net-{{ build }}" +region: us-west1 +zone: us-west1-a +remote_node: "{{ deployment_name }}-remote-node-0" +reservation_affinity: + consume_reservation_type: SPECIFIC_RESERVATION + specific_reservations: + - name: a3-reservation-0 + project: "{{ project }}" +cli_deployment_vars: + region: "{{ region }}" + zone: "{{ zone }}" + reservation_affinity: "{{ reservation_affinity }}" + autoscaling_total_max_nodes: 2 + authorized_cidr: "{{ build_ip.stdout }}/32" + network_name: "{{ network }}" + local_ssd_count_nvme_block: 16 +custom_vars: + project: "{{ project }}" +post_deploy_tests: +- test-validation/test-gke-job.yml From 31f13b9f13f57004acf157d09703f7484a2788f9 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 15 Oct 2024 09:51:14 -0400 Subject: [PATCH 088/102] Update modules/scripts/startup-script/README.md Co-authored-by: Tom Downes --- modules/scripts/startup-script/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index b9cae7fdee..3d2f7cbcda 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -162,6 +162,7 @@ curl -sSO https://dl.google.com/cloudagents/add-logging-agent-repo.sh sudo bash add-logging-agent-repo.sh --also-install sudo service stackdriver-agent start ``` + #### Cloud Ops Agent Installation If an image or machine already has the Stackdriver Agent installed and you would From 92cddfdf09306f5bd41848d08f14595785cc8bd0 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 15 Oct 2024 09:51:23 -0400 Subject: [PATCH 089/102] Update modules/scripts/startup-script/README.md Co-authored-by: Tom Downes --- modules/scripts/startup-script/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index 3d2f7cbcda..a32c2c5509 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -186,7 +186,7 @@ sudo bash add-google-cloud-ops-agent-repo.sh --also-install sudo service google-cloud-ops-agent start ``` -As a reminder, this should be in a startup script, which should run on all +As a reminder, this should be in a startup script, which should run on all Compute nodes via the `compute_startup_script` on the controller. 
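For illustration, a minimal blueprint sketch of this wiring (the module ID, file name, and controller stanza below are placeholders, not part of this patch; the `startup-script` module renders its runners into a `startup_script` output that the controller can consume):

```yaml
  # Sketch: wrap the agent removal/installation commands in a startup-script
  # runner, then hand the rendered script to the controller's
  # compute_startup_script so it runs on every compute node.
  - id: agent_swap_script                       # hypothetical module ID
    source: modules/scripts/startup-script
    settings:
      runners:
      - type: shell
        destination: swap-monitoring-agent.sh   # hypothetical file name
        content: |
          #!/bin/bash
          # agent removal and installation commands from the sections above

  # On the controller module (exact source depends on your blueprint):
  #   settings:
  #     compute_startup_script: $(agent_swap_script.startup_script)
```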
#### Testing Installation From 8eedf55701fec2bca8533c0210bf0242821ce758 Mon Sep 17 00:00:00 2001 From: abbas1902 Date: Thu, 10 Oct 2024 18:37:14 +0000 Subject: [PATCH 090/102] improve dws_flex ux --- .../schedmd-slurm-gcp-v6-nodeset/README.md | 1 + .../schedmd-slurm-gcp-v6-nodeset/main.tf | 1 + .../schedmd-slurm-gcp-v6-nodeset/outputs.tf | 9 +++++++ .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 27 +++++++++++++++++++ .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/slurm_files/scripts/resume.py | 8 ++++++ .../partition.tf | 1 + .../variables.tf | 4 +++ docs/slurm-dws-flex.md | 18 +++++-------- 9 files changed, 58 insertions(+), 13 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 117e0ca0e5..115ac451e7 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -169,6 +169,7 @@ No modules. | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-standard"` | no | +| [dws\_flex](#input\_dws\_flex) | If set and `enabled = true`, will utilize the DWS Flex Start to provision nodes.
See: https://cloud.google.com/blog/products/compute/introducing-dynamic-workload-scheduler
Options:
- enabled: Enable DWS Flex Start
- max\_run\_duration: Maximum duration in seconds for the job to run; it should not exceed 1,209,600 (2 weeks).

Limitations:
- CAN NOT be used with reservations;
- CAN NOT be used with placement groups; |
object({
enabled = optional(bool, true)
max_run_duration = optional(number, 1209600) # 2 weeks
})
|
{
"enabled": false
}
| no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_maintenance\_reservation](#input\_enable\_maintenance\_reservation) | Enables slurm reservation for scheduled maintenance. | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 224ca76f80..217328277b 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -57,6 +57,7 @@ locals { node_count_dynamic_max = var.node_count_dynamic_max node_conf = var.node_conf nodeset_name = local.name + dws_flex = var.dws_flex disk_auto_delete = var.disk_auto_delete disk_labels = merge(local.labels, var.disk_labels) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf index dc2f3b0c40..671d542584 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf @@ -44,4 +44,13 @@ output "nodeset" { condition = !var.enable_placement || var.node_count_static == 0 || var.node_count_dynamic_max == 0 error_message = "Cannot use placement with static and auto-scaling nodes in the same node set." } + precondition { + condition = var.reservation_name == "" || !var.dws_flex.enabled + error_message = "Cannot use reservations with DWS Flex." + } + + precondition { + condition = !var.enable_placement || !var.dws_flex.enabled + error_message = "Cannot use DWS Flex with `enable_placement`." + } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index aeb2435bd0..536659f136 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -512,3 +512,30 @@ variable "enable_maintenance_reservation" { description = "Enables slurm reservation for scheduled maintenance." default = false } + +variable "dws_flex" { + description = <<-EOD + If set and `enabled = true`, will utilize the DWS Flex Start to provision nodes. + See: https://cloud.google.com/blog/products/compute/introducing-dynamic-workload-scheduler + Options: + - enable: Enable DWS Flex Start + - max_run_duration: Maximum duration in seconds for the job to run, should not exceed 1,209,600 (2 weeks). + + Limitations: + - CAN NOT be used with reservations; + - CAN NOT be used with placement groups; + + EOD + + type = object({ + enabled = optional(bool, true) + max_run_duration = optional(number, 1209600) # 2 weeks + }) + default = { + enabled = false + } + validation { + condition = var.dws_flex.max_run_duration >= 30 && var.dws_flex.max_run_duration <= 1209600 + error_message = "Max duration must be more than 30 seconds, and cannot be more than two weeks." + } +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index a9d801d8c7..1720eb67a5 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -313,7 +313,7 @@ limitations under the License. | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, true)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, true)
gpu = optional(object({
count = number
type = string
}))
dws_flex = object({
enabled = bool
max_run_duration = number
})
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index bad3f662f0..1bc1150c58 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -98,11 +98,19 @@ def instance_properties(nodeset, model, placement_group, labels=None): if nodeset.maintenance_interval: props.scheduling.maintenanceInterval = nodeset.maintenance_interval + if nodeset.dws_flex.enabled: + update_props_dws(props,nodeset.dws_flex) + # Override with properties explicit specified in the nodeset props.update(nodeset.get("instance_properties") or {}) return props +def update_props_dws(props:dict,dws_flex:dict) -> None: + props.scheduling.onHostMaintenance = "TERMINATE" + props.scheduling.instanceTerminationAction = "DELETE" + props.scheduling.maxRunDuration['seconds'] = dws_flex.max_run_duration + props.reservationAffinity['consumeReservationType'] = "NO_RESERVATION" def per_instance_properties(node): props = NSDict() diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 849844808a..7254551072 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -83,6 +83,7 @@ locals { nodesets = [for name, ns in local.nodeset_map : { nodeset_name = ns.nodeset_name node_conf = ns.node_conf + dws_flex = ns.dws_flex instance_template = module.slurm_nodeset_template[ns.nodeset_name].self_link node_count_dynamic_max = ns.node_count_dynamic_max node_count_static = ns.node_count_static diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 2fc7bebb4b..95e5c20d0a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -212,6 +212,10 @@ variable "nodeset" { count = number type = string })) + dws_flex = object({ + enabled = bool + max_run_duration = number + }) labels = optional(map(string), {}) machine_type = optional(string) maintenance_interval = optional(string) diff --git a/docs/slurm-dws-flex.md b/docs/slurm-dws-flex.md index dfa65b6015..8b1c38bb01 100644 --- a/docs/slurm-dws-flex.md +++ b/docs/slurm-dws-flex.md @@ -13,25 +13,19 @@ With Dynamic Workload Scheduler in Flex Start mode, you submit a GPU capacity re > The project needs to be allowlisted for private preview access. > Fill out the [form](https://docs.google.com/forms/d/1etaaXMW9jJUTTxfUC7TIIMttLWT5H-3Q8_3-sG6vwKk/edit). -In order to make use of DWS Flex Start mode with SlurmGCP, you must specify a proper set of `instance_properties` in the `schedmd-slurm-gcp-v6-nodeset` module. See the example below: +In order to make use of DWS Flex Start mode with SlurmGCP, you must use the `dws_flex` variable in the `schedmd-slurm-gcp-v6-nodeset` module. From there you can specify the desired maximum duration (in seconds) with `max_run_duration`. 
See the example below: ```yaml - id: flex_nodeset source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset use: [network] settings: - instance_properties: - reservationAffinity: - consumeReservationType: NO_RESERVATION - scheduling: - maxRunDuration: { seconds: $(2 * 60 * 60) } # 2 hours - onHostMaintenance: TERMINATE - instanceTerminationAction: DELETE + dws_flex: + max_run_duration: 3600 # 1 hour + enable_placement: false # the rest of the settings, e.g. node_count_static, machine_type, additional_disks, etc. ``` -**All** fields in `instance_properties` should match provided values, except for `maxRunDuration`, which should be set to the desired duration in seconds (up to 604800 = 7 days). - > [!WARNING] -> The use of the `instance_properties` setting directly overrides bulkInsert API parameters. While the documented sample -> was tested at the time of publication, it is not regression tested and may cease to work based on changes in the bulkInsert API. +> DWS Flex Start cannot be used in tandem with a reservation or placement policy +> While this feature was tested at the time of publication, it is not regression tested and may cease to work based on changes in the bulkInsert API. From 2fa185ccf920f53017374284599d329778ca0f26 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 15 Oct 2024 13:30:26 -0500 Subject: [PATCH 091/102] Update Slurm-GCP to 6.8.2 Brings in new default NVIDIA driver 550.90.12 which solves several known issues, including NCCL Timeout errors. https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-550-90-12/index.html --- community/examples/hpc-build-slurm-image.yaml | 2 +- .../README.md | 2 +- .../main.tf | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 18 +++++++++--------- .../controller.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- .../partition.tf | 4 ++-- .../schedmd-slurm-gcp-v6-login/README.md | 8 ++++---- .../a3-highgpu-8g/ml-slurm-a3-1-image.yaml | 2 +- .../a3-megagpu-8g/slurm-a3mega-image.yaml | 2 +- modules/README.md | 2 +- 11 files changed, 25 insertions(+), 25 deletions(-) diff --git a/community/examples/hpc-build-slurm-image.yaml b/community/examples/hpc-build-slurm-image.yaml index 45e6bd1612..a1fa81767e 100644 --- a/community/examples/hpc-build-slurm-image.yaml +++ b/community/examples/hpc-build-slurm-image.yaml @@ -23,7 +23,7 @@ vars: image_build_machine_type: n2d-standard-16 build_from_image_family: hpc-rocky-linux-8 build_from_image_project: cloud-hpc-image-public - build_from_git_ref: 6.7.0 + build_from_git_ref: 6.8.2 built_image_family: my-custom-slurm built_instance_image: family: $(vars.built_image_family) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index 4d790fe703..d251dff2af 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -74,7 +74,7 @@ modules. 
For support with the underlying modules, see the instructions in the | Name | Source | Version | |------|--------|---------| -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index 3f0ee54af8..7ca868a049 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -56,7 +56,7 @@ locals { } module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 37b5da93da..9f4933a1fa 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -11,9 +11,9 @@ The [user guide][slurm-ug] provides detailed instructions on customizing and enhancing the Slurm on GCP cluster as well as recommendations on configuring the controller for optimal performance at different scales. -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0 -[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0/terraform/slurm_cluster/modules/slurm_instance_template +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2 +[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions @@ -238,13 +238,13 @@ limitations under the License. 
| [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | | [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.1 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.2 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.1 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.8.1 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.2 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.8.2 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 9b105d7f39..1ce6ed158f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -43,7 +43,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" project_id = var.project_id region = var.region @@ -99,7 +99,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.1" + source = 
"github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.2" access_config = var.enable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index de97810316..998a8e0867 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -56,7 +56,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.2" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 7254551072..0d05c71f91 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local template module module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" for_each = local.nodeset_map project_id = var.project_id @@ -102,7 +102,7 @@ locals { # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.8.2" for_each = local.nodeset_tpu_map project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index 0afd0bfee7..4ad20a6352 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -5,9 +5,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). 
-[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0 -[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0/terraform/slurm_cluster/modules/slurm_instance_template +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2 +[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -53,7 +53,7 @@ modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. [slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/7 -[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0#slurm-on-google-cloud-platform +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2#slurm-on-google-cloud-platform ## Requirements diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml index b817972331..705e1299eb 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -94,7 +94,7 @@ deployment_groups: set -e -o pipefail ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.1 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.2 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index 67f33cde7d..dfc4d4ab4c 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -108,7 +108,7 @@ deployment_groups: apt-get install -y git ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.1 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.2 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/modules/README.md b/modules/README.md index b0575ec3db..722449e6e6 100644 --- a/modules/README.md +++ b/modules/README.md @@ -230,7 +230,7 @@ Pub/Sub subscription. 
Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca [schedmd-slurm-gcp-v5-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md [schedmd-slurm-gcp-v5-hybrid]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md [slurm-gcp-version-5]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 -[slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0 +[slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2 [pbspro-client]: ../community/modules/scheduler/pbspro-client/README.md [pbspro-server]: ../community/modules/scheduler/pbspro-server/README.md From 786b5c27e7286281e501089ed390ffebd920370a Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 15 Oct 2024 18:41:12 +0000 Subject: [PATCH 092/102] Fix deprecation link and add deprecation notice in v5 compute modules --- .../compute/schedmd-slurm-gcp-v5-node-group/README.md | 5 +++++ .../schedmd-slurm-gcp-v5-partition-dynamic/README.md | 5 +++++ .../compute/schedmd-slurm-gcp-v5-partition/README.md | 5 +++++ .../scheduler/schedmd-slurm-gcp-v5-controller/README.md | 6 ++---- .../modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md | 6 ++---- .../modules/scheduler/schedmd-slurm-gcp-v5-login/README.md | 6 ++---- 6 files changed, 21 insertions(+), 12 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md index d4cc3fcda3..bc54d36396 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md @@ -1,5 +1,10 @@ ## Description +> [!NOTE] +> Slurm-gcp-v5-node-group module is deprecated. See +> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) +> for specific recommendations and timelines. + This module creates a node group data structure intended to be input to the [schedmd-slurm-gcp-v5-partition](../schedmd-slurm-gcp-v5-partition/) module. diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md index f7ad53f382..cecea973e1 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md @@ -1,5 +1,10 @@ ## Description +> [!NOTE] +> Slurm-gcp-v5-partition-dynamic module is deprecated. See +> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) +> for specific recommendations and timelines. + This module creates a dynamic compute partition that can be used as input to the [schedmd-slurm-gcp-v5-controller](../../scheduler/schedmd-slurm-gcp-v5-controller/README.md). This will configure the slurm partition to contain nodes with the corresponding feature. diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md index 1ae1d0b50f..f9fcc59bed 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md @@ -1,5 +1,10 @@ ## Description +> [!NOTE] +> Slurm-gcp-v5-partition module is deprecated. See +> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) +> for specific recommendations and timelines. 
+ This module creates a compute partition that can be used as input to the [schedmd-slurm-gcp-v5-controller](../../scheduler/schedmd-slurm-gcp-v5-controller/README.md). diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index b2a1bb503e..b9ae2ce50c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -2,10 +2,8 @@ > [!NOTE] > Slurm-gcp-v5-controller module is deprecated. See -> [this update](#completed-migration-to-slurm-gcp-v6) for specific recommendations -> and timelines. - -* [Completed Migration to Slurm-GCP v6](../../../../modules/README.md#completed-migration-to-slurm-gcp-v6) +> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) +> for specific recommendations and timelines. This module creates a slurm controller node via the [SchedMD/slurm-gcp] [slurm\_controller\_instance] and [slurm\_instance\_template] modules. diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index 82fa0b9771..56cbc33b07 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -2,10 +2,8 @@ > [!NOTE] > Slurm-gcp-v5-hybrid module is deprecated. See -> [this update](#completed-migration-to-slurm-gcp-v6) for specific recommendations -> and timelines. - -* [Completed Migration to Slurm-GCP v6](../../../../modules/README.md#completed-migration-to-slurm-gcp-v6) +> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) +> for specific recommendations and timelines. This module is a wrapper around the [slurm-controller-hybrid] module by SchedMD as part of the [slurm-gcp] github repository. The hybrid module serves to create diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 80d969ade6..44b337ec78 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -2,10 +2,8 @@ > [!NOTE] > Slurm-gcp-v5-login module is deprecated. See -> [this update](#completed-migration-to-slurm-gcp-v6) for specific recommendations -> and timelines. - -* [Completed Migration to Slurm-GCP v6](../../../../modules/README.md#completed-migration-to-slurm-gcp-v6) +> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) +> for specific recommendations and timelines. This module creates a login node for a Slurm cluster based on the [SchedMD/slurm-gcp] [slurm\_instance\_template] and [slurm\_login\_instance] From 0eb20fe431546ba6c208321c8e5fe81cc3b8d88c Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Tue, 15 Oct 2024 22:32:45 +0000 Subject: [PATCH 093/102] fix readme --- modules/compute/gke-node-pool/README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index e6d3bdb42c..880e1834e4 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -277,7 +277,7 @@ limitations under the License. 
| Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | @@ -288,27 +288,27 @@ limitations under the License. | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | | [internal\_ghpc\_module\_id](#input\_internal\_ghpc\_module\_id) | DO NOT SET THIS MANUALLY. Automatically populates with module id (unique blueprint-wide). | `string` | n/a | yes | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If not set, automatically populated by machine type and module id (unique blueprint-wide) as suffix.
If setting manually, ensure a unique value across all gke-node-pools. | `string` | `null` | no | | [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | -| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | -| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled, extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads; therefore, it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | +| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | +| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled, extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads; therefore, it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | | [total\_max\_nodes](#input\_total\_max\_nodes) | DEPRECATED: Use autoscaling\_total\_max\_nodes. | `number` | `null` | no | From d8306aed8bcc96aa0dc4a58425399a6ba9f1d27a Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 15 Oct 2024 19:07:40 -0700 Subject: [PATCH 094/102] Make spack and ramble bucket names look like startup-script bucket names --- community/modules/scripts/ramble-setup/main.tf | 2 +- community/modules/scripts/spack-setup/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scripts/ramble-setup/main.tf b/community/modules/scripts/ramble-setup/main.tf index b1b445fc22..8a5efddfe6 100644 --- a/community/modules/scripts/ramble-setup/main.tf +++ b/community/modules/scripts/ramble-setup/main.tf @@ -73,7 +73,7 @@ locals { } bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}"), 0, 8) - bucket_name = "ramble-scripts-${local.bucket_md5}" + bucket_name = "${var.deployment_name}-ramble-scripts-${local.bucket_md5}" runners = [local.install_ramble_deps_runner, local.install_ramble_runner, local.python_reqs_runner] combined_runner = { diff --git a/community/modules/scripts/spack-setup/main.tf b/community/modules/scripts/spack-setup/main.tf index 6a1eb21312..eff6a8f9b8 100644 --- a/community/modules/scripts/spack-setup/main.tf +++ b/community/modules/scripts/spack-setup/main.tf @@ -80,7 +80,7 @@ locals { } bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}.${local.script_content}"), 0, 8) - bucket_name = "spack-scripts-${local.bucket_md5}" + bucket_name = "${var.deployment_name}-spack-scripts-${local.bucket_md5}" runners = [local.install_spack_deps_runner, local.install_spack_runner] combined_runner = { From 8e57bcc3ca5127c7222934560ee9317927ee3a63 Mon Sep 17 00:00:00 2001 From: annuay Date: Wed, 16 Oct 2024 19:44:05 +0000 Subject: [PATCH 095/102] share SA across node pools --- examples/storage-gke.yaml | 14 +------ .../daily-tests/blueprints/ml-gke-e2e.yaml | 42 ++----------------- 2 files changed, 4 insertions(+), 52 deletions(-) diff --git a/examples/storage-gke.yaml b/examples/storage-gke.yaml index 00c3d60290..a257f97c49 100644 --- a/examples/storage-gke.yaml +++ b/examples/storage-gke.yaml @@ -50,18 +50,6 @@ deployment_groups: - storage.objectViewer - artifactregistry.reader - - id: local_ssd_pool_service_account - source: community/modules/project/service-account - settings: - name: ssd-sa - project_roles: - - logging.logWriter - - monitoring.metricWriter - - monitoring.viewer - - stackdriver.resourceMetadata.writer - - storage.objectViewer - - artifactregistry.reader - - id: gke_cluster source: modules/scheduler/gke-cluster use: [network1, gke_service_account] @@ -142,7 +130,7 @@ deployment_groups: - id: local-ssd-pool source: modules/compute/gke-node-pool - use: [gke_cluster, local_ssd_pool_service_account] + use: [gke_cluster, gke_service_account] settings: name: local-ssd machine_type: n2d-standard-2 diff --git a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml index 20f5ff19f5..d7be384115 100644 --- a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml +++ b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml @@ -92,21 
+92,9 @@ deployment_groups: ] outputs: [instructions] - - id: n1_service_account - source: community/modules/project/service-account - settings: - name: n1-sa - project_roles: - - logging.logWriter - - monitoring.metricWriter - - monitoring.viewer - - stackdriver.resourceMetadata.writer - - storage.objectViewer - - artifactregistry.reader - - id: n1_pool_default source: modules/compute/gke-node-pool - use: [gke_cluster, n1_service_account] + use: [gke_cluster, gke_service_account] settings: name: n1-pool-default disk_type: pd-balanced @@ -132,21 +120,9 @@ deployment_groups: ] outputs: [instructions] - - id: n1_full_service_account - source: community/modules/project/service-account - settings: - name: n1-full-sa - project_roles: - - logging.logWriter - - monitoring.metricWriter - - monitoring.viewer - - stackdriver.resourceMetadata.writer - - storage.objectViewer - - artifactregistry.reader - - id: n1_pool_full_spec source: modules/compute/gke-node-pool - use: [gke_cluster, n1_full_service_account] + use: [gke_cluster, gke_service_account] settings: name: n1-pool-full-spec disk_type: pd-balanced @@ -177,21 +153,9 @@ deployment_groups: ] outputs: [instructions] - - id: default_settings_service_account - source: community/modules/project/service-account - settings: - name: ds-sa - project_roles: - - logging.logWriter - - monitoring.metricWriter - - monitoring.viewer - - stackdriver.resourceMetadata.writer - - storage.objectViewer - - artifactregistry.reader - - id: default_settings_pool source: modules/compute/gke-node-pool - use: [gke_cluster, default_settings_service_account] + use: [gke_cluster, gke_service_account] settings: name: default-settings-pool From 2a488b81521d59dd70e6ad5eb81251535a7b2d09 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 17 Oct 2024 00:11:53 +0000 Subject: [PATCH 096/102] SlurmGCP. Improve reservation_name parsing logic + tests --- .../slurm_files/scripts/tests/common.py | 4 +- .../slurm_files/scripts/tests/test_util.py | 85 ++++++++++++++++++- .../modules/slurm_files/scripts/util.py | 14 +-- 3 files changed, 96 insertions(+), 7 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py index 8db9add6c3..2272aeef99 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py @@ -29,11 +29,13 @@ class Placeholder: @dataclass class TstNodeset: - nodeset_name: str + nodeset_name: str = "cantor" node_count_static: int = 0 node_count_dynamic_max: int = 0 node_conf: dict[str, Any] = field(default_factory=dict) instance_template: Optional[str] = None + reservation_name: Optional[str] = "" + zone_policy_allow: Optional[list[str]] = field(default_factory=list) @dataclass class TstCfg: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py index 4dd3c8a17b..676b3593aa 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -13,7 +13,8 @@ # limitations under the License. 
import pytest -import common # needed to import util +from mock import Mock +from common import TstNodeset, TstCfg # needed to import util import util from google.api_core.client_options import ClientOptions # noqa: E402 @@ -130,3 +131,85 @@ def test_create_client_options( ud_mock.return_value = "googleapis.com" ep_mock.return_value = ep_ver assert util.create_client_options(api).__repr__() == expected.__repr__() + + + +@pytest.mark.parametrize( + "nodeset,err", + [ + (TstNodeset(reservation_name="projects/x/reservations/y"), AssertionError), # no zones + (TstNodeset( + reservation_name="projects/x/reservations/y", + zone_policy_allow=["eine", "zwei"]), AssertionError), # multiples zones + (TstNodeset( + reservation_name="robin", + zone_policy_allow=["eine"]), ValueError), # invalid name + (TstNodeset( + reservation_name="projects/reservations/y", + zone_policy_allow=["eine"]), ValueError), # invalid name + (TstNodeset( + reservation_name="projects/x/zones/z/reservations/y", + zone_policy_allow=["eine"]), ValueError), # invalid name + ] +) +def test_nodeset_reservation_err(nodeset, err): + lkp = util.Lookup(TstCfg()) + lkp._get_reservation = Mock() + with pytest.raises(err): + lkp.nodeset_reservation(nodeset) + lkp._get_reservation.assert_not_called() + +@pytest.mark.parametrize( + "nodeset,policies,expected", + [ + (TstNodeset(), [], None), # no reservation + (TstNodeset( + reservation_name="projects/bobin/reservations/robin", + zone_policy_allow=["eine"]), + [], + util.ReservationDetails( + project="bobin", + zone="eine", + name="robin", + policies=[], + bulk_insert_name="projects/bobin/reservations/robin")), + (TstNodeset( + reservation_name="projects/bobin/reservations/robin", + zone_policy_allow=["eine"]), + ["seven/wanders", "five/red/apples", "yum"], + util.ReservationDetails( + project="bobin", + zone="eine", + name="robin", + policies=["wanders", "apples", "yum"], + bulk_insert_name="projects/bobin/reservations/robin")), + (TstNodeset( + reservation_name="projects/bobin/reservations/robin/reservationBlocks/cheese-brie-6", + zone_policy_allow=["eine"]), + [], + util.ReservationDetails( + project="bobin", + zone="eine", + name="robin", + policies=[], + reservation_block="cheese-brie-6", + bulk_insert_name="projects/bobin/reservations/robin/reservationBlocks/cheese-brie-6")), + + ]) + +def test_nodeset_reservation_ok(nodeset, policies, expected): + lkp = util.Lookup(TstCfg()) + lkp._get_reservation = Mock() + + if not expected: + assert lkp.nodeset_reservation(nodeset) is None + lkp._get_reservation.assert_not_called() + return + + lkp._get_reservation.return_value = { + "resourcePolicies": {i: p for i, p in enumerate(policies)}, + } + assert lkp.nodeset_reservation(nodeset) == expected + lkp._get_reservation.assert_called_once_with(expected.project, expected.zone, expected.name) + + diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index eaf455e8dd..f93c9db4b9 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -1447,8 +1447,10 @@ def delete_node(self, nodename): class ReservationDetails: project: str zone: str + name: str policies: List[str] # names (not URLs) of resource policies - bulk_insert_name: str # name in format suitable for bulk insert (currently identical to 
user supplied name) + bulk_insert_name: str # name in format suitable for bulk insert (currently identical to user supplied name in long format) + reservation_block: Optional[str] = None class Lookup: """Wrapper class for cached data access""" @@ -1754,13 +1756,13 @@ def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]: assert len(zones) == 1, "Only single zone is supported if using a reservation" zone = zones[0] - try: - _, project, _, name = nodeset.reservation_name.split("/") - except ValueError: + regex = re.compile(r'^projects/(?P[^/]+)/reservations/(?P[^/]+)(/reservationBlocks/(?P[^/]+))?$') + if not (match := regex.match(nodeset.reservation_name)): raise ValueError( - f"Invalid reservation name: '{nodeset.reservation_name}', expected format is 'projects/PROJECT/reservations/NAME'" + f"Invalid reservation name: '{nodeset.reservation_name}', expected format is 'projects/PROJECT/reservations/NAME[/reservationBlocks/BLOCK]'" ) + project, name, block = match.group("project", "reservation", "block") reservation = self._get_reservation(project, zone, name) # Converts policy URLs to names, e.g.: @@ -1770,7 +1772,9 @@ def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]: return ReservationDetails( project=project, zone=zone, + name=name, policies=policies, + reservation_block=block, bulk_insert_name=nodeset.reservation_name) @lru_cache(maxsize=1) From 5835429de35c2314ed2b243626cb1d65c1ef5482 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 17 Oct 2024 01:32:49 +0000 Subject: [PATCH 097/102] SlurmGCP. Slurm reservation handling simplification --- .../modules/slurm_files/scripts/tests/test_util.py | 5 ++--- .../modules/slurm_files/scripts/util.py | 8 +++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py index 676b3593aa..14b7a7bf62 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -184,7 +184,7 @@ def test_nodeset_reservation_err(nodeset, err): policies=["wanders", "apples", "yum"], bulk_insert_name="projects/bobin/reservations/robin")), (TstNodeset( - reservation_name="projects/bobin/reservations/robin/reservationBlocks/cheese-brie-6", + reservation_name="projects/bobin/reservations/robin/snek/cheese-brie-6", zone_policy_allow=["eine"]), [], util.ReservationDetails( @@ -192,8 +192,7 @@ def test_nodeset_reservation_err(nodeset, err): zone="eine", name="robin", policies=[], - reservation_block="cheese-brie-6", - bulk_insert_name="projects/bobin/reservations/robin/reservationBlocks/cheese-brie-6")), + bulk_insert_name="projects/bobin/reservations/robin/snek/cheese-brie-6")), ]) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index f93c9db4b9..8467e300e2 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -1450,7 +1450,6 @@ class ReservationDetails: name: str policies: List[str] # names (not URLs) of resource 
policies bulk_insert_name: str # name in format suitable for bulk insert (currently identical to user supplied name in long format) - reservation_block: Optional[str] = None class Lookup: """Wrapper class for cached data access""" @@ -1756,13 +1755,13 @@ def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]: assert len(zones) == 1, "Only single zone is supported if using a reservation" zone = zones[0] - regex = re.compile(r'^projects/(?P[^/]+)/reservations/(?P[^/]+)(/reservationBlocks/(?P[^/]+))?$') + regex = re.compile(r'^projects/(?P[^/]+)/reservations/(?P[^/]+)(/.*)?$') if not (match := regex.match(nodeset.reservation_name)): raise ValueError( - f"Invalid reservation name: '{nodeset.reservation_name}', expected format is 'projects/PROJECT/reservations/NAME[/reservationBlocks/BLOCK]'" + f"Invalid reservation name: '{nodeset.reservation_name}', expected format is 'projects/PROJECT/reservations/NAME'" ) - project, name, block = match.group("project", "reservation", "block") + project, name = match.group("project", "reservation") reservation = self._get_reservation(project, zone, name) # Converts policy URLs to names, e.g.: @@ -1774,7 +1773,6 @@ def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]: zone=zone, name=name, policies=policies, - reservation_block=block, bulk_insert_name=nodeset.reservation_name) @lru_cache(maxsize=1) From d68ee1efbcc08278351044ed2c62b06e7d2f1293 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Wed, 16 Oct 2024 19:30:00 -0700 Subject: [PATCH 098/102] Don't allow bucket_name to exceed 63 chars --- community/modules/scripts/ramble-setup/main.tf | 7 +++++-- community/modules/scripts/spack-setup/main.tf | 8 ++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/community/modules/scripts/ramble-setup/main.tf b/community/modules/scripts/ramble-setup/main.tf index 8a5efddfe6..205c980c03 100644 --- a/community/modules/scripts/ramble-setup/main.tf +++ b/community/modules/scripts/ramble-setup/main.tf @@ -72,8 +72,11 @@ locals { "destination" = "install_ramble.yml" } - bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}"), 0, 8) - bucket_name = "${var.deployment_name}-ramble-scripts-${local.bucket_md5}" + bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}"), 0, 8) + # Max bucket name length is 63, so truncate deployment_name if necessary. + # The string "-ramble-scripts-" is 16 characters and bucket_md5 is 8 characters, + # leaving 63-16-8=39 chars for deployment_name. + bucket_name = "${substr(var.deployment_name, 0, 39)}-ramble-scripts-${local.bucket_md5}" runners = [local.install_ramble_deps_runner, local.install_ramble_runner, local.python_reqs_runner] combined_runner = { diff --git a/community/modules/scripts/spack-setup/main.tf b/community/modules/scripts/spack-setup/main.tf index eff6a8f9b8..b705ccc06c 100644 --- a/community/modules/scripts/spack-setup/main.tf +++ b/community/modules/scripts/spack-setup/main.tf @@ -79,8 +79,12 @@ locals { "destination" = "install_spack.yml" } - bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}.${local.script_content}"), 0, 8) - bucket_name = "${var.deployment_name}-spack-scripts-${local.bucket_md5}" + bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}.${local.script_content}"), 0, 8) + # Max bucket name length is 63, so truncate deployment_name if necessary. + # The string "-spack-scripts-" is 15 characters and bucket_md5 is 8 characters, + # leaving 63-15-8=40 chars for deployment_name. 
Using 39 so it has the same prefix as the + # ramble-setup module's GCS bucket. + bucket_name = "${substr(var.deployment_name, 0, 39)}-spack-scripts-${local.bucket_md5}" runners = [local.install_spack_deps_runner, local.install_spack_runner] combined_runner = { From 6d412e7f6ce23e3e4cb8d5af0ea21b9c72ea7d62 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Tue, 15 Oct 2024 18:49:38 +0000 Subject: [PATCH 099/102] updating version constraint --- pkg/config/expand.go | 4 ++-- pkg/config/expand_test.go | 4 ++-- .../igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../golden_copies/expectations/igc_pkr/zero/versions.tf | 4 ++-- .../igc_tf/.ghpc/artifacts/expanded_blueprint.yaml | 8 ++++---- .../golden_copies/expectations/igc_tf/one/versions.tf | 4 ++-- .../golden_copies/expectations/igc_tf/zero/versions.tf | 4 ++-- .../merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/merge_flatten/zero/versions.tf | 4 ++-- .../.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/versioned_blueprint/primary/versions.tf | 4 ++-- 11 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index a58ce74a41..b79babfbe5 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -199,11 +199,11 @@ func getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { return map[string]TerraformProvider{ "google": { Source: "hashicorp/google", - Version: ">= 4.84.0, < 6.7.0", + Version: ">= 4.84.0, < 6.8.0", Configuration: gglConf}, "google-beta": { Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 6.7.0", + Version: ">= 4.84.0, < 6.8.0", Configuration: gglConf}} } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 59495832d4..5abdd6620d 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -93,10 +93,10 @@ func (s *zeroSuite) TestExpandProviders(c *C) { c.Check(g.TerraformProviders, DeepEquals, map[string]PR{ "google": TerraformProvider{ Source: "hashicorp/google", - Version: ">= 4.84.0, < 6.7.0"}, + Version: ">= 4.84.0, < 6.8.0"}, "google-beta": TerraformProvider{ Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 6.7.0"}}) + Version: ">= 4.84.0, < 6.8.0"}}) } { // no def PR, group PR diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml index ba265ba2ee..c3f9926b11 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml @@ -38,14 +38,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf index 3534fd124e..3dd3e12681 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { 
google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml index 5736fbba16..d9c215a457 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml @@ -44,14 +44,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) @@ -79,14 +79,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index 3534fd124e..3dd3e12681 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index 3534fd124e..3dd3e12681 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index c21a1bb32f..46614b02e6 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -39,14 +39,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - 
version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf index 3534fd124e..3dd3e12681 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml index ad79aee614..2c5e9ca64a 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml @@ -47,14 +47,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf index 3534fd124e..3dd3e12681 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } } } From e3293120867aad46b961fd0328dd2adf96a40616 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 17 Oct 2024 17:10:14 -0500 Subject: [PATCH 100/102] Refactor default value for mountpoint in local SSD solution --- modules/scripts/startup-script/files/setup-raid.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/scripts/startup-script/files/setup-raid.yml b/modules/scripts/startup-script/files/setup-raid.yml index 429d2b5594..585c6023f3 100644 --- a/modules/scripts/startup-script/files/setup-raid.yml +++ b/modules/scripts/startup-script/files/setup-raid.yml @@ -23,6 +23,7 @@ fstype: ext4 interface: nvme mode: '0755' + mountpoint: /mnt/{{ raid_name }} tasks: - name: Get local SSD devices ansible.builtin.find: @@ -61,7 +62,7 @@ - name: Mount RAID array ansible.posix.mount: src: "{{ array_dev }}" - path: '{{ mountpoint | default("/mnt/" + raid_name) }}' + path: "{{ mountpoint }}" fstype: "{{ fstype }}" # the nofail option is critical as it enables the boot process to complete on machines # that were powered off and had local SSD contents discarded; without this option @@ -71,6 +72,6 @@ - 
name: Set mount permissions ansible.builtin.file: - path: '{{ mountpoint | default("/mnt/" + raid_name) }}' + path: "{{ mountpoint }}" state: directory mode: "{{ mode }}" From fa3f3a6271fa4b7126f4b783fb34eae6cbb5456d Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 17 Oct 2024 17:10:14 -0500 Subject: [PATCH 101/102] Ensure local SSD solutions works upon reboot of Slurm nodes When the local SSD mountpoint has not been mounted use SystemD to create the RAID array and format it. This addresses the known behavior of the Slurm-GCP solution in which it does not re-run startup-scripts upon a power off/on (or reboot) cycle. During a typical power off/on cycle, the local SSD contents are discarded and the disks must be re-assembled and formatted. --- .../startup-script/files/setup-raid.yml | 41 ++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/modules/scripts/startup-script/files/setup-raid.yml b/modules/scripts/startup-script/files/setup-raid.yml index 585c6023f3..d7590069a8 100644 --- a/modules/scripts/startup-script/files/setup-raid.yml +++ b/modules/scripts/startup-script/files/setup-raid.yml @@ -41,23 +41,34 @@ name: mdadm state: present - - name: Force RAID array if only 1 local SSD - ansible.builtin.shell: mdadm --create {{ array_dev }} --name={{ raid_name }} --homehost=any --level=0 --raid-devices=1 /dev/disk/by-id/google-local-nvme-ssd-0 --force - args: - creates: "{{ array_dev }}" - when: local_ssd_devices.files | length == 1 + # this service will act during the play and upon reboots to ensure that local + # SSD volumes are always assembled into a RAID and re-formatted if necessary; + # there are many scenarios where a VM can be stopped or migrated during + # maintenance and the contents of local SSD will be discarded + - name: Install service to create local SSD RAID and format it + ansible.builtin.copy: + dest: /etc/systemd/system/create-localssd-raid.service + mode: 0644 + content: | + [Unit] + After=local-fs.target + Before=slurmd.service + ConditionPathIsMountPoint=!{{ mountpoint }} - - name: Create RAID array - ansible.builtin.shell: mdadm --create {{ array_dev }} --name={{ raid_name }} --homehost=any --level=0 --raid-devices={{ local_ssd_devices.files | length }} /dev/disk/by-id/google-local-nvme-ssd-* - args: - creates: "{{ array_dev }}" - when: local_ssd_devices.files | length >= 2 + [Service] + Type=oneshot + ExecStart=/usr/bin/bash -c "/usr/sbin/mdadm --create {{ array_dev }} --name={{ raid_name }} --homehost=any --level=0 --raid-devices={{ local_ssd_devices.files | length }} /dev/disk/by-id/google-local-nvme-ssd-*{{ " --force" if local_ssd_devices.files | length == 1 else "" }}" + ExecStartPost=/usr/sbin/mkfs -t {{ fstype }}{{ " -m 0" if fstype == "ext4" else "" }} {{ array_dev }} - - name: Format filesystem - community.general.filesystem: - fstype: "{{ fstype }}" - device: "{{ array_dev }}" - opts: '{{ "-m 0" if fstype == "ext4" else "" }}' + [Install] + WantedBy=slurmd.service + + - name: Create RAID array and format + ansible.builtin.systemd: + name: create-localssd-raid.service + state: started + enabled: true + daemon_reload: true - name: Mount RAID array ansible.posix.mount: From 06a71f424dba5e6944383411751869bbbacce8c1 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 23 Oct 2024 20:51:54 +0000 Subject: [PATCH 102/102] Update version number to v1.41.0 as part of release process --- cmd/root.go | 2 +- community/examples/tutorial-starccm-slurm.yaml | 2 +- community/modules/compute/htcondor-execute-point/versions.tf | 2 +- 
community/modules/compute/mig/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-node-group/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-partition/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf | 2 +- .../modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-partition/versions.tf | 2 +- .../modules/database/slurm-cloudsql-federation/versions.tf | 4 ++-- .../modules/file-system/cloud-storage-bucket/versions.tf | 2 +- community/modules/file-system/nfs-server/versions.tf | 2 +- community/modules/files/fsi-montecarlo-on-batch/versions.tf | 4 ++-- community/modules/network/private-service-access/versions.tf | 4 ++-- community/modules/project/service-enablement/versions.tf | 2 +- community/modules/pubsub/bigquery-sub/versions.tf | 4 ++-- community/modules/pubsub/topic/versions.tf | 2 +- community/modules/scheduler/htcondor-access-point/versions.tf | 2 +- .../modules/scheduler/htcondor-central-manager/versions.tf | 2 +- community/modules/scheduler/htcondor-pool-secrets/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v5-controller/versions.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v6-controller/versions.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf | 2 +- community/modules/scripts/wait-for-startup/versions.tf | 2 +- community/modules/scripts/windows-startup-script/versions.tf | 2 +- .../hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml | 2 +- modules/compute/gke-node-pool/versions.tf | 2 +- modules/compute/vm-instance/versions.tf | 4 ++-- modules/file-system/filestore/versions.tf | 4 ++-- modules/file-system/gke-persistent-volume/versions.tf | 2 +- modules/file-system/gke-storage/versions.tf | 2 +- modules/monitoring/dashboard/versions.tf | 2 +- modules/network/firewall-rules/versions.tf | 2 +- modules/network/pre-existing-subnetwork/versions.tf | 2 +- modules/network/pre-existing-vpc/versions.tf | 2 +- modules/scheduler/batch-login-node/versions.tf | 2 +- modules/scheduler/gke-cluster/versions.tf | 2 +- modules/scheduler/pre-existing-gke-cluster/versions.tf | 2 +- modules/scripts/startup-script/versions.tf | 2 +- 41 files changed, 47 insertions(+), 47 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index 03717b99d7..e58b8a743d 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -53,7 +53,7 @@ HPC deployments on the Google Cloud Platform.`, logging.Fatal("cmd.Help function failed: %s", err) } }, - Version: "v1.40.0", + Version: "v1.41.0", Annotations: annotation, } ) diff --git a/community/examples/tutorial-starccm-slurm.yaml b/community/examples/tutorial-starccm-slurm.yaml index b74eb44d33..9e64014ea7 100644 --- a/community/examples/tutorial-starccm-slurm.yaml +++ b/community/examples/tutorial-starccm-slurm.yaml @@ -16,7 +16,7 @@ blueprint_name: starccm-on-slurm toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit -toolkit_modules_version: v1.40.0 +toolkit_modules_version: v1.41.0 vars: project_id: ## Set GCP Project ID Here ## diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index efd3cab932..3f320827a1 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = 
"blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.41.0" } } diff --git a/community/modules/compute/mig/versions.tf b/community/modules/compute/mig/versions.tf index a6e80e8b0e..8e5b3caa45 100644 --- a/community/modules/compute/mig/versions.tf +++ b/community/modules/compute/mig/versions.tf @@ -22,6 +22,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:mig/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:mig/v1.41.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index 1b9cd77ff6..51f49882a1 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.41.0" } required_version = ">= 1.1" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf index 6265b12713..4f00828f19 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.41.0" } required_version = ">= 0.13.0" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf index 4e98f061c8..9e7273093a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.41.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf index 66b6296071..f519a18161 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.41.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index eef0010b85..242244c5f7 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.40.0" + module_name = 
"blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.41.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf index 25ec7739f0..17489d3f93 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.41.0" } } diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index 7c6a50bb46..1e92271e3a 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.41.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.41.0" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index e3b1236384..0a6664171a 100644 --- a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.41.0" } required_version = ">= 0.14.0" } diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index d58278f078..5251b527b0 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.41.0" } required_version = ">= 0.14.0" diff --git a/community/modules/files/fsi-montecarlo-on-batch/versions.tf b/community/modules/files/fsi-montecarlo-on-batch/versions.tf index 93e7b8b841..469e310bc0 100644 --- a/community/modules/files/fsi-montecarlo-on-batch/versions.tf +++ b/community/modules/files/fsi-montecarlo-on-batch/versions.tf @@ -35,9 +35,9 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.41.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.41.0" } } diff --git a/community/modules/network/private-service-access/versions.tf b/community/modules/network/private-service-access/versions.tf index 8b7f0fb043..efb0f8f2d1 100644 --- a/community/modules/network/private-service-access/versions.tf +++ 
b/community/modules/network/private-service-access/versions.tf @@ -30,11 +30,11 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.41.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.41.0" } required_version = ">= 1.2" diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index 25f653ee43..974520409d 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.41.0" } required_version = ">= 0.14.0" diff --git a/community/modules/pubsub/bigquery-sub/versions.tf b/community/modules/pubsub/bigquery-sub/versions.tf index 66bc2f104c..5597272dca 100644 --- a/community/modules/pubsub/bigquery-sub/versions.tf +++ b/community/modules/pubsub/bigquery-sub/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.41.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.41.0" } required_version = ">= 1.0" } diff --git a/community/modules/pubsub/topic/versions.tf b/community/modules/pubsub/topic/versions.tf index 40c5aedf9d..2a3e2fb59b 100644 --- a/community/modules/pubsub/topic/versions.tf +++ b/community/modules/pubsub/topic/versions.tf @@ -27,6 +27,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:topic/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:topic/v1.41.0" } } diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index cf4956236b..3d452c24bb 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.41.0" } required_version = ">= 1.1" diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index 8b3a1fc44c..432b506666 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.41.0" } required_version = ">= 1.1.0" diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 5ba656f88c..103fe43a30 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ 
b/community/modules/scheduler/htcondor-pool-secrets/versions.tf
@@ -26,7 +26,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.41.0"
   }
 
   required_version = ">= 1.3.0"
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf
index f6581de261..d9e1f9b600 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.41.0"
   }
   required_version = ">= 1.1"
 }
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf
index 91831924a1..c52321d462 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.41.0"
   }
   required_version = ">= 1.1"
 }
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf
index 3314b7828f..c1fc007bf0 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf
@@ -24,6 +24,6 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.41.0"
   }
 }
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf
index 37229bb041..dbcebd21c1 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf
@@ -24,6 +24,6 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.41.0"
   }
 }
diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf
index 3e9954b5ee..e60ec22c3c 100644
--- a/community/modules/scripts/wait-for-startup/versions.tf
+++ b/community/modules/scripts/wait-for-startup/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.41.0"
   }
 
   required_version = ">= 0.14.0"
diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf
index 777a8e68ca..1a2aa18a3b 100644
--- a/community/modules/scripts/windows-startup-script/versions.tf
+++ b/community/modules/scripts/windows-startup-script/versions.tf
@@ -16,7 +16,7 @@
 
 terraform {
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.41.0"
   }
 
   required_version = ">= 0.14.0"
diff --git a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml
index 0220352d35..813a90f0b6 100644
--- a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml
+++ b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml
@@ -16,7 +16,7 @@
 blueprint_name: hpc-cluster-hybrid-v5
 
 toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
-toolkit_modules_version: v1.40.0
+toolkit_modules_version: v1.41.0
 
 vars:
   project_id: ## <>
diff --git a/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf
index 2a27bfc342..0f4cb13c2f 100644
--- a/modules/compute/gke-node-pool/versions.tf
+++ b/modules/compute/gke-node-pool/versions.tf
@@ -30,6 +30,6 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.41.0"
   }
 }
diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf
index 228e58fe84..2d35e5c50e 100644
--- a/modules/compute/vm-instance/versions.tf
+++ b/modules/compute/vm-instance/versions.tf
@@ -31,10 +31,10 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.41.0"
   }
   provider_meta "google-beta" {
-    module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.41.0"
   }
 
   required_version = ">= 1.3.0"
diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf
index 593345e994..3454ca00c6 100644
--- a/modules/file-system/filestore/versions.tf
+++ b/modules/file-system/filestore/versions.tf
@@ -26,10 +26,10 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.41.0"
   }
   provider_meta "google-beta" {
-    module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.41.0"
   }
 
   required_version = ">= 0.14.0"
diff --git a/modules/file-system/gke-persistent-volume/versions.tf b/modules/file-system/gke-persistent-volume/versions.tf
index c0f5298369..b87efd8a16 100644
--- a/modules/file-system/gke-persistent-volume/versions.tf
+++ b/modules/file-system/gke-persistent-volume/versions.tf
@@ -29,6 +29,6 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.41.0"
   }
 }
diff --git a/modules/file-system/gke-storage/versions.tf b/modules/file-system/gke-storage/versions.tf
index 78d62b235d..27f82792ab 100644
--- a/modules/file-system/gke-storage/versions.tf
+++ b/modules/file-system/gke-storage/versions.tf
@@ -16,6 +16,6 @@ terraform {
   required_version = ">= 1.0"
 
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.41.0"
   }
 }
diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf
index 1db6bd5151..dbf59fa86f 100644
--- a/modules/monitoring/dashboard/versions.tf
+++ b/modules/monitoring/dashboard/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.41.0"
   }
 
   required_version = ">= 0.14.0"
diff --git a/modules/network/firewall-rules/versions.tf b/modules/network/firewall-rules/versions.tf
index 2daef71bf4..5312b04355 100644
--- a/modules/network/firewall-rules/versions.tf
+++ b/modules/network/firewall-rules/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.41.0"
   }
 
   required_version = ">= 1.3"
diff --git a/modules/network/pre-existing-subnetwork/versions.tf b/modules/network/pre-existing-subnetwork/versions.tf
index 8d9b9f0578..7a38f30404 100644
--- a/modules/network/pre-existing-subnetwork/versions.tf
+++ b/modules/network/pre-existing-subnetwork/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.41.0"
   }
 
   required_version = ">= 0.14.0"
diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf
index 2794d4d0b0..c9f1ec5992 100644
--- a/modules/network/pre-existing-vpc/versions.tf
+++ b/modules/network/pre-existing-vpc/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.41.0"
   }
 
   required_version = ">= 0.14.0"
diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf
index b58cb8fb08..599294a84e 100644
--- a/modules/scheduler/batch-login-node/versions.tf
+++ b/modules/scheduler/batch-login-node/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.41.0"
   }
 
   required_version = ">= 0.14.0"
diff --git a/modules/scheduler/gke-cluster/versions.tf b/modules/scheduler/gke-cluster/versions.tf
index ad17fe1c43..67c30a9e84 100644
--- a/modules/scheduler/gke-cluster/versions.tf
+++ b/modules/scheduler/gke-cluster/versions.tf
@@ -34,6 +34,6 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.41.0"
   }
 }
diff --git a/modules/scheduler/pre-existing-gke-cluster/versions.tf b/modules/scheduler/pre-existing-gke-cluster/versions.tf
index 3ad8745340..328bdda8e1 100644
--- a/modules/scheduler/pre-existing-gke-cluster/versions.tf
+++ b/modules/scheduler/pre-existing-gke-cluster/versions.tf
@@ -23,7 +23,7 @@ terraform {
   }
 
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.41.0"
   }
 
   required_version = ">= 1.3"
diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf
index 0d44be1243..c954c7e6fa 100644
--- a/modules/scripts/startup-script/versions.tf
+++ b/modules/scripts/startup-script/versions.tf
@@ -30,7 +30,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.41.0"
   }
 
   required_version = ">= 1.3"