From c2ab706524e6a319ed94e4e75f9049e1e7db17db Mon Sep 17 00:00:00 2001
From: jrossthomson
Date: Thu, 12 Sep 2024 17:53:25 -0400
Subject: [PATCH 001/102] Added documentation on cloud-ops-agent installation and stackdriver removal

---
 modules/scripts/startup-script/README.md | 29 ++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md
index dbfb3f8e4c..b9cae7fdee 100644
--- a/modules/scripts/startup-script/README.md
+++ b/modules/scripts/startup-script/README.md
@@ -141,6 +141,8 @@ better performance under some HPC workloads.
While official documentation recommends using the _Cloud Ops Agent_, it is
recommended to use `install_stackdriver_agent` when performance is important.
+#### Stackdriver Agent Installation
+
If an image or machine already has Cloud Ops Agent installed and you would
like to instead use the Stackdriver Agent, the following script will remove the
Cloud Ops Agent and install the Stackdriver Agent.
@@ -160,6 +162,33 @@ curl -sSO https://dl.google.com/cloudagents/add-logging-agent-repo.sh
sudo bash add-logging-agent-repo.sh --also-install
sudo service stackdriver-agent start
```
+#### Cloud Ops Agent Installation
+
+If an image or machine already has the Stackdriver Agent installed and you would
+like to instead use the Cloud Ops Agent, the following script will remove the
+Stackdriver Agent and install the Cloud Ops Agent.
+
+```bash
+# Uninstall Stackdriver Agent
+
+sudo systemctl stop stackdriver-agent.service
+sudo systemctl disable stackdriver-agent.service
+curl -sSO https://dl.google.com/cloudagents/add-monitoring-agent-repo.sh
+sudo dpkg --configure -a
+sudo bash add-monitoring-agent-repo.sh --uninstall
+sudo bash add-monitoring-agent-repo.sh --remove-repo
+
+# Install ops-agent
+
+curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
+sudo bash add-google-cloud-ops-agent-repo.sh --also-install
+sudo service google-cloud-ops-agent start
+```
+
+As a reminder, this script should be part of a startup script that runs on all
+compute nodes via the `compute_startup_script` setting on the controller.
+
+#### Testing Installation

You can test if one of the agents is running using the following commands:

From 71c5b497af0dfc4f2594eb638e219d62b1a63a5e Mon Sep 17 00:00:00 2001
From: Ivan Orlov
Date: Thu, 12 Sep 2024 23:20:49 +0000
Subject: [PATCH 002/102] SlurmGCP.
Refactor reservation fetching logic --- .../modules/slurm_files/scripts/resume.py | 31 ++++------- .../modules/slurm_files/scripts/util.py | 51 ++++++++++++------- 2 files changed, 42 insertions(+), 40 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 7856a5cada..4426d402a4 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -73,43 +73,30 @@ def instance_properties(nodeset, model, placement_group, labels=None): props.disks = template_info.disks if placement_group: - props.scheduling = { - "onHostMaintenance": "TERMINATE", - } + props.scheduling.onHostMaintenance = "TERMINATE" props.resourcePolicies = [placement_group] - if nodeset.reservation_name: - reservation_name = nodeset.reservation_name - - zones = list(nodeset.zone_policy_allow or []) - assert len(zones) == 1, "Only single zone is supported if using a reservation" - - reservation = lookup().reservation(reservation_name, zones[0]) - + if reservation := lookup().nodeset_reservation(nodeset): props.reservationAffinity = { "consumeReservationType": "SPECIFIC_RESERVATION", "key": f"compute.{util.universe_domain()}/reservation-name", - "values": [reservation_name], + "values": [reservation.bulk_insert_name], } - policies = util.reservation_resource_policies(reservation) - if policies: - props.scheduling = { - "onHostMaintenance": "TERMINATE", - } - props.resourcePolicies = policies + if reservation.policies: + props.scheduling.onHostMaintenance = "TERMINATE" + props.resourcePolicies = reservation.policies log.info( - f"reservation {reservation_name} is being used with policies {props.resourcePolicies}" + f"reservation {reservation.bulk_insert_name} is being used with policies {props.resourcePolicies}" ) else: props.resourcePolicies = [] log.info( - f"reservation {reservation_name} is being used without any policies" + f"reservation {reservation.bulk_insert_name} is being used without any policies" ) if nodeset.maintenance_interval: - props.scheduling = props.scheduling or {} - props.scheduling["maintenanceInterval"] = nodeset.maintenance_interval + props.scheduling.maintenanceInterval = nodeset.maintenance_interval # Override with properties explicit specified in the nodeset props.update(nodeset.get("instance_properties") or {}) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index cb17500d90..eaf455e8dd 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -18,6 +18,7 @@ import argparse import base64 import collections +from dataclasses import dataclass import hashlib import inspect import json @@ -346,16 +347,6 @@ def install_custom_scripts(check_hash=False): blob.download_to_file(f) chown_slurm(fullpath, mode=0o755) - -def reservation_resource_policies(reservation): - """ - Inspects reservation object, returns list of resource policies names. 
- Converts policy URLs to names, e.g.: - projects/111111/regions/us-central1/resourcePolicies/zebra -> zebra - """ - return [u.split("/")[-1] for u in reservation.get("resourcePolicies", {}).values()] - - def compute_service(version="beta"): """Make thread-safe compute service handle creates a new Http for each request @@ -1452,6 +1443,13 @@ def delete_node(self, nodename): return True +@dataclass(frozen=True) +class ReservationDetails: + project: str + zone: str + policies: List[str] # names (not URLs) of resource policies + bulk_insert_name: str # name in format suitable for bulk insert (currently identical to user supplied name) + class Lookup: """Wrapper class for cached data access""" @@ -1743,20 +1741,37 @@ def instance(self, instance_name: str) -> Optional[object]: return self.instances().get(instance_name) @lru_cache() - def reservation(self, name: str, zone: str) -> object: + def _get_reservation(self, project: str, zone: str, name: str) -> object: """See https://cloud.google.com/compute/docs/reference/rest/v1/reservations""" + return self.compute.reservations().get( + project=project, zone=zone, reservation=name).execute() + + def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]: + if not nodeset.reservation_name: + return None + + zones = list(nodeset.zone_policy_allow or []) + assert len(zones) == 1, "Only single zone is supported if using a reservation" + zone = zones[0] + try: - _, project, _, short_name = name.split("/") + _, project, _, name = nodeset.reservation_name.split("/") except ValueError: raise ValueError( - f"Invalid reservation name: '{name}', expected format is 'projects/PROJECT/reservations/NAME'" + f"Invalid reservation name: '{nodeset.reservation_name}', expected format is 'projects/PROJECT/reservations/NAME'" ) + + reservation = self._get_reservation(project, zone, name) - return ( - self.compute.reservations() - .get(project=project, zone=zone, reservation=short_name) - .execute() - ) + # Converts policy URLs to names, e.g.: + # projects/111111/regions/us-central1/resourcePolicies/zebra -> zebra + policies = [u.split("/")[-1] for u in reservation.get("resourcePolicies", {}).values()] + + return ReservationDetails( + project=project, + zone=zone, + policies=policies, + bulk_insert_name=nodeset.reservation_name) @lru_cache(maxsize=1) def machine_types(self): From 2f9667f939efcee183a98608d73413e450432936 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 21 Sep 2024 04:53:14 +0000 Subject: [PATCH 003/102] Extract `cty.Value` YAML logic from `Dict`. --- pkg/config/yaml.go | 59 +++++++++++++++++++++++++---------------- pkg/config/yaml_test.go | 19 ++++++++++--- 2 files changed, 52 insertions(+), 26 deletions(-) diff --git a/pkg/config/yaml.go b/pkg/config/yaml.go index c23a7bd6ce..855aa8f3db 100644 --- a/pkg/config/yaml.go +++ b/pkg/config/yaml.go @@ -313,29 +313,9 @@ func (y *YamlValue) unmarshalTuple(n *yaml.Node) error { return nil } -// UnmarshalYAML implements custom YAML unmarshaling. -func (d *Dict) UnmarshalYAML(n *yaml.Node) error { - var v YamlValue - if err := n.Decode(&v); err != nil { - return err - } - ty := v.Unwrap().Type() - if !ty.IsObjectType() { - return nodeToPosErr(n, fmt.Errorf("must be a mapping, got %s", ty.FriendlyName())) - } - - for k, w := range v.Unwrap().AsValueMap() { - if d.m == nil { - d.m = map[string]cty.Value{} - } - d.m[k] = w - } - return nil -} - // MarshalYAML implements custom YAML marshaling. 
-func (d Dict) MarshalYAML() (interface{}, error) { - o, _ := cty.Transform(d.AsObject(), func(p cty.Path, v cty.Value) (cty.Value, error) { +func (y YamlValue) MarshalYAML() (interface{}, error) { + m, err := cty.Transform(y.Unwrap(), func(p cty.Path, v cty.Value) (cty.Value, error) { if v.IsNull() { return v, nil } @@ -358,7 +338,11 @@ func (d Dict) MarshalYAML() (interface{}, error) { return v, nil }) - j := ctyJson.SimpleJSONValue{Value: o} + if err != nil { + return nil, err + } + + j := ctyJson.SimpleJSONValue{Value: m} b, err := j.MarshalJSON() if err != nil { return nil, fmt.Errorf("failed to marshal JSON: %v", err) @@ -371,6 +355,35 @@ func (d Dict) MarshalYAML() (interface{}, error) { return g, nil } +// UnmarshalYAML implements custom YAML unmarshaling. +func (d *Dict) UnmarshalYAML(n *yaml.Node) error { + var vm map[string]YamlValue + if err := n.Decode(&vm); err != nil { + return err + } + + for k, v := range vm { + if d.m == nil { + d.m = map[string]cty.Value{} + } + d.m[k] = v.Unwrap() + } + return nil +} + +// MarshalYAML implements custom YAML marshaling. +func (d Dict) MarshalYAML() (interface{}, error) { + m := map[string]interface{}{} + for k, v := range d.m { + y, err := YamlValue{v}.MarshalYAML() + if err != nil { + return nil, err + } + m[k] = y + } + return m, nil +} + // yaml.v3 errors are either TypeError - collection of error message or single error message. // Parse error messages to extract short error message and position. func parseYamlV3Error(err error) error { diff --git a/pkg/config/yaml_test.go b/pkg/config/yaml_test.go index a6ae3cc6d7..fabad8ca24 100644 --- a/pkg/config/yaml_test.go +++ b/pkg/config/yaml_test.go @@ -218,6 +218,7 @@ func TestDictUnmarshalYAML(t *testing.T) { yml := ` s1: "red" s2: pink +nl: m1: {} m2: m2f1: green @@ -229,6 +230,7 @@ m2: want := Dict{}. With("s1", cty.StringVal("red")). With("s2", cty.StringVal("pink")). + With("nl", cty.NullVal(cty.DynamicPseudoType)). With("m1", cty.EmptyObjectVal). With("m2", cty.ObjectVal(map[string]cty.Value{ "m2f1": cty.StringVal("green"), @@ -259,14 +261,12 @@ func TestDictWrongTypeUnmarshalYAML(t *testing.T) { if err == nil { t.Errorf("expected error, got nil") } - if diff := cmp.Diff(err.Error(), "line 2 column 1: must be a mapping, got number"); diff != "" { - t.Errorf("diff (-want +got):\n%s", diff) - } } func TestDictMarshalYAML(t *testing.T) { d := Dict{}. With("s1", cty.StringVal("red")). + With("nl", cty.NullVal(cty.DynamicPseudoType)). With("m1", cty.EmptyObjectVal). 
With("m2", cty.ObjectVal(map[string]cty.Value{ "m2f1": cty.StringVal("green"), @@ -280,6 +280,7 @@ func TestDictMarshalYAML(t *testing.T) { })) want := map[string]interface{}{ "s1": "red", + "nl": nil, "m1": map[string]interface{}{}, "m2": map[string]interface{}{ "m2f1": "green", @@ -295,6 +296,18 @@ func TestDictMarshalYAML(t *testing.T) { } } +func TestEmptyDictMarshalYAML(t *testing.T) { + d := Dict{} + want := map[string]interface{}{} + got, err := d.MarshalYAML() + if err != nil { + t.Fatalf("failed to marshal: %v", err) + } + if diff := cmp.Diff(want, got); diff != "" { + t.Errorf("diff (-want +got):\n%s", diff) + } +} + func TestYAMLValueMarshalIntAsInt(t *testing.T) { d := Dict{}.With("zebra", cty.NumberIntVal(5)) want := "zebra: 5\n" From 2514f670bddb6a9a940a210e8911d5e6f7866abf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Sat, 21 Sep 2024 12:44:15 +0000 Subject: [PATCH 004/102] Add explicit project references --- .../modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf | 1 + community/modules/network/private-service-access/README.md | 1 + community/modules/network/private-service-access/main.tf | 1 + .../modules/network/private-service-access/variables.tf | 5 +++++ modules/file-system/parallelstore/main.tf | 1 + 5 files changed, 9 insertions(+) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index e9ef538f67..224ca76f80 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -174,6 +174,7 @@ data "google_compute_reservation" "reservation" { data "google_compute_machine_types" "machine_types_by_zone" { for_each = local.zones + project = var.project_id filter = format("name = \"%s\"", var.machine_type) zone = each.value } diff --git a/community/modules/network/private-service-access/README.md b/community/modules/network/private-service-access/README.md index 50fbd42235..52ef2205c5 100644 --- a/community/modules/network/private-service-access/README.md +++ b/community/modules/network/private-service-access/README.md @@ -82,6 +82,7 @@ No modules. | [labels](#input\_labels) | Labels to add to supporting resources. Key-value pairs. | `map(string)` | n/a | yes | | [network\_id](#input\_network\_id) | The ID of the GCE VPC network to configure private service Access.:
`projects//global/networks/`" | `string` | n/a | yes | | [prefix\_length](#input\_prefix\_length) | The prefix length of the IP range allocated for the private service access. | `number` | `16` | no | +| [project\_id](#input\_project\_id) | ID of project in which Private Service Access will be created. | `string` | n/a | yes | ## Outputs diff --git a/community/modules/network/private-service-access/main.tf b/community/modules/network/private-service-access/main.tf index 706fe1cdf7..4bb54821c3 100644 --- a/community/modules/network/private-service-access/main.tf +++ b/community/modules/network/private-service-access/main.tf @@ -26,6 +26,7 @@ resource "random_id" "resource_name_suffix" { resource "google_compute_global_address" "private_ip_alloc" { provider = google-beta name = "global-psconnect-ip-${random_id.resource_name_suffix.hex}" + project = var.project_id purpose = "VPC_PEERING" address_type = "INTERNAL" network = var.network_id diff --git a/community/modules/network/private-service-access/variables.tf b/community/modules/network/private-service-access/variables.tf index e600463e3e..18b73ac2d9 100644 --- a/community/modules/network/private-service-access/variables.tf +++ b/community/modules/network/private-service-access/variables.tf @@ -40,3 +40,8 @@ variable "prefix_length" { type = number default = 16 } + +variable "project_id" { + description = "ID of project in which Private Service Access will be created." + type = string +} diff --git a/modules/file-system/parallelstore/main.tf b/modules/file-system/parallelstore/main.tf index 56a4069342..3de3b94f3a 100644 --- a/modules/file-system/parallelstore/main.tf +++ b/modules/file-system/parallelstore/main.tf @@ -46,6 +46,7 @@ resource "random_id" "resource_name_suffix" { } resource "google_parallelstore_instance" "instance" { + project = var.project_id instance_id = local.id location = var.zone capacity_gib = var.size_gb From 6fd3bc3698d6f13f26959a8c95598ef6ddc31ac2 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 25 Sep 2024 18:26:00 +0000 Subject: [PATCH 005/102] Update reservation for maintenance document for API support details --- .../schedmd-slurm-gcp-v6-controller/README.md | 11 +++++++++++ modules/compute/gke-node-pool/README.md | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index f9f3edc6b2..b9cb9d6d95 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -105,6 +105,17 @@ run the job outside of the maintenance window. srun -n1 -pcompute -t 10:00 ``` +Currently upcoming maintenance notification is supported in ALPHA version of +compute API. You can update the API version from your blueprint, + +```yaml + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller + settings: + endpoint_versions: + compute: "alpha" +``` + ## Placement Max Distance When using diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 1ba0757eee..fcf7414af6 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -29,7 +29,7 @@ can be overridden using the `taints` setting. See more info. ### Local SSD Storage -GKE offers two options for managing locally attached SSDs. +GKE offers two options for managing locally attached SSDs. 
The first, and recommended, option is for GKE to manage the ephemeral storage space on the node, which will then be automatically attached to pods which From c8b0c00b9a7020de768006415196a1f49ccc3f0c Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Mon, 23 Sep 2024 21:57:32 +0000 Subject: [PATCH 006/102] Changed exact number to minimum for additional vpcs in gpu_direct --- modules/compute/gke-node-pool/README.md | 20 ++++++++++---------- modules/compute/gke-node-pool/gpu_direct.tf | 20 ++++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index fcf7414af6..38815c06dd 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -284,7 +284,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | @@ -294,26 +294,26 @@ limitations under the License. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | +| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | -| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | -| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | +| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | +| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | | [total\_max\_nodes](#input\_total\_max\_nodes) | DEPRECATED: Use autoscaling\_total\_max\_nodes. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index e72d85fd3c..774f1ad12b 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -30,8 +30,9 @@ locals { "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/gpudirect-tcpx/nccl-config.yaml", # nccl_configmap "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/nri_device_injector/nri-device-injector.yaml", # nri_plugin ] - updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml") - rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9 + updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml") + rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9 + required_additional_networks = 4 } "a3-megagpu-8g" = { # Manifest to be installed for enabling TCPXO on a3-megagpu-8g machines @@ -39,19 +40,18 @@ locals { "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/gpudirect-tcpxo/nccl-tcpxo-installer.yaml", # nccl_plugin v1.0.4 for tcpxo "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/nri_device_injector/nri-device-injector.yaml", # nri_plugin ] - updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") - rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 + updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") + rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 + required_additional_networks = 8 } } + + min_additional_networks = try(local.gpu_direct_setting[var.machine_type].min_additional_networks, 0) } check "gpu_direct_check_multi_vpc" { assert { - condition = !(var.machine_type == "a3-highgpu-8g" && length(var.additional_networks) != 4) - error_message = "To achieve optimal performance for ${var.machine_type} machine, 4 additional vpc is recommended. You could configure it in the blueprint through modules/network/multivpc with network_count set as 4" - } - assert { - condition = !(var.machine_type == "a3-megagpu-8g" && length(var.additional_networks) != 8) - error_message = "To achieve optimal performance for ${var.machine_type} machine, 8 additional vpc is recommended. You could configure it in the blueprint through modules/network/multivpc with network_count set as 8" + condition = length(var.additional_networks) >= local.min_additional_networks + error_message = "To achieve optimal performance for ${var.machine_type} machine, at least ${local.min_additional_networks} additional vpc is recommended. 
You could configure it in the blueprint through modules/network/multivpc with network_count set as ${local.min_additional_networks}" } } From 9b3a60518b07a84fb4ce8cb6a6423683d3eb3d90 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Mon, 23 Sep 2024 23:59:34 +0000 Subject: [PATCH 007/102] Revert auto-updated doc --- modules/compute/gke-node-pool/README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 38815c06dd..fcf7414af6 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -284,7 +284,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | @@ -294,26 +294,26 @@ limitations under the License. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | +| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | -| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | -| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | +| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | +| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | | [total\_max\_nodes](#input\_total\_max\_nodes) | DEPRECATED: Use autoscaling\_total\_max\_nodes. | `number` | `null` | no | From 7688f49671c066832b9382478eea63048e829cf0 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Thu, 26 Sep 2024 13:17:12 +0000 Subject: [PATCH 008/102] property name fixed --- modules/compute/gke-node-pool/gpu_direct.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index 774f1ad12b..45ab72e6e0 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -46,7 +46,7 @@ locals { } } - min_additional_networks = try(local.gpu_direct_setting[var.machine_type].min_additional_networks, 0) + min_additional_networks = try(local.gpu_direct_setting[var.machine_type].required_additional_networks, 0) } check "gpu_direct_check_multi_vpc" { From c627c559a606b3684df37cc4132643f3eeba484b Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Thu, 26 Sep 2024 13:18:50 +0000 Subject: [PATCH 009/102] object name fixed --- modules/compute/gke-node-pool/gpu_direct.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index 45ab72e6e0..fd266c7754 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -46,7 +46,7 @@ locals { } } - min_additional_networks = try(local.gpu_direct_setting[var.machine_type].required_additional_networks, 0) + min_additional_networks = try(local.gpu_direct_settings[var.machine_type].required_additional_networks, 0) } check "gpu_direct_check_multi_vpc" { From 62b5c803a8f78403e2e4c7b93d21c87ed3060731 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 27 Sep 2024 08:18:15 +0000 Subject: [PATCH 010/102] variable name updated --- modules/compute/gke-node-pool/gpu_direct.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index fd266c7754..23c370edf2 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -32,7 +32,7 @@ locals { ] updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml") rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9 - required_additional_networks = 4 + min_additional_networks = 4 } "a3-megagpu-8g" = { # Manifest to be installed for enabling TCPXO on a3-megagpu-8g machines @@ -42,11 +42,11 @@ locals { ] updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 - required_additional_networks = 8 + min_additional_networks = 8 } } - min_additional_networks = try(local.gpu_direct_settings[var.machine_type].required_additional_networks, 0) + min_additional_networks = try(local.gpu_direct_settings[var.machine_type].min_additional_networks, 0) } check "gpu_direct_check_multi_vpc" { From 362dbb939bbe50a02cfb6565639a1df02570b201 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 27 Sep 2024 10:18:29 +0000 Subject: 
[PATCH 011/102] style modification --- modules/compute/gke-node-pool/gpu_direct.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index 23c370edf2..b22c353f69 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -30,8 +30,8 @@ locals { "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/gpudirect-tcpx/nccl-config.yaml", # nccl_configmap "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/nri_device_injector/nri-device-injector.yaml", # nri_plugin ] - updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml") - rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9 + updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml") + rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9 min_additional_networks = 4 } "a3-megagpu-8g" = { @@ -40,8 +40,8 @@ locals { "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/gpudirect-tcpxo/nccl-tcpxo-installer.yaml", # nccl_plugin v1.0.4 for tcpxo "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/fee883360a660f71ba07478db95d5c1325322f77/nri_device_injector/nri-device-injector.yaml", # nri_plugin ] - updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") - rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 + updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") + rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 min_additional_networks = 8 } } From 07d04bafa778174bc126352bfea7873006ca4d0d Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Mon, 23 Sep 2024 22:16:13 +0000 Subject: [PATCH 012/102] resource-policy module implemented --- modules/README.md | 2 + .../resource-policy/resource-policy/README.md | 59 +++++++++++++++++++ .../resource-policy/resource-policy/main.tf | 31 ++++++++++ .../resource-policy/metadata.yaml | 19 ++++++ .../resource-policy/outputs.tf | 28 +++++++++ .../resource-policy/variables.tf | 40 +++++++++++++ .../resource-policy/versions.tf | 30 ++++++++++ 7 files changed, 209 insertions(+) create mode 100644 modules/compute/resource-policy/resource-policy/README.md create mode 100644 modules/compute/resource-policy/resource-policy/main.tf create mode 100644 modules/compute/resource-policy/resource-policy/metadata.yaml create mode 100644 modules/compute/resource-policy/resource-policy/outputs.tf create mode 100644 modules/compute/resource-policy/resource-policy/variables.tf create mode 100644 modules/compute/resource-policy/resource-policy/versions.tf diff --git a/modules/README.md b/modules/README.md index d9ba636393..defba11446 100644 --- a/modules/README.md +++ b/modules/README.md @@ -49,6 +49,7 @@ Modules that are still in development and less stable are labeled with the Creates a dynamic nodeset to be used by the [schedmd-slurm-gcp-v6-partition] module and instance template. * **[gke-node-pool]** ![core-badge] ![experimental-badge] : Creates a Kubernetes node pool using GKE. +* **[resource-policy]** ![core-badge] ![experimental-badge] : Create a resource policy for compute engines that can be applied to gke-node-pool's nodes. 
* **[gke-job-template]** ![core-badge] ![experimental-badge] : Creates a Kubernetes job file to be used with a [gke-node-pool]. * **[htcondor-execute-point]** ![community-badge] ![experimental-badge] : @@ -62,6 +63,7 @@ Modules that are still in development and less stable are labeled with the [vm-instance]: compute/vm-instance/README.md [gke-node-pool]: ../modules/compute/gke-node-pool/README.md +[resource-policy]: ../modules/compute/resource-policy/README.md [gke-job-template]: ../modules/compute/gke-job-template/README.md [schedmd-slurm-gcp-v5-partition]: ../community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md [schedmd-slurm-gcp-v5-node-group]: ../community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md diff --git a/modules/compute/resource-policy/resource-policy/README.md b/modules/compute/resource-policy/resource-policy/README.md new file mode 100644 index 0000000000..7f558f9696 --- /dev/null +++ b/modules/compute/resource-policy/resource-policy/README.md @@ -0,0 +1,59 @@ +## Description + +This modules create a resource policy for compute engines. This policy can be passed to a gke-node-pool module to apply the policy on the node-pool's nodes. + +Note: By default, you can't apply compact placement policies with a max distance value to A3 VMs. To request access to this feature, contact your Technical Account Manager (TAM) or the Sales team. + + +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [google-beta](#requirement\_google-beta) | ~> 5.0 | + +## Providers + +| Name | Version | +|------|---------| +| [google-beta](#provider\_google-beta) | ~> 5.0 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [google-beta_google_compute_resource_policy.policy](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_compute_resource_policy) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [group\_placement\_max\_distance](#input\_group\_placement\_max\_distance) | The max distance for group placement policy to use for the node pool's nodes. If set it will add a compact group placement policy.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. | `number` | `0` | no |
+| [name](#input\_name) | The resource policy's name. | `string` | n/a | yes |
+| [project\_id](#input\_project\_id) | The project ID for the resource policy. | `string` | n/a | yes |
+| [region](#input\_region) | The region for the resource policy. | `string` | n/a | yes |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| [placement\_policy](#output\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. | + diff --git a/modules/compute/resource-policy/resource-policy/main.tf b/modules/compute/resource-policy/resource-policy/main.tf new file mode 100644 index 0000000000..5adce37f0f --- /dev/null +++ b/modules/compute/resource-policy/resource-policy/main.tf @@ -0,0 +1,31 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +resource "google_compute_resource_policy" "policy" { + name = var.name + region = var.region + project = var.project_id + provider = google-beta + + dynamic "group_placement_policy" { + for_each = var.group_placement_max_distance > 0 ? [1] : [] + + content { + collocation = "COLLOCATED" + max_distance = var.group_placement_max_distance + } + } +} diff --git a/modules/compute/resource-policy/resource-policy/metadata.yaml b/modules/compute/resource-policy/resource-policy/metadata.yaml new file mode 100644 index 0000000000..4c2f23a8d7 --- /dev/null +++ b/modules/compute/resource-policy/resource-policy/metadata.yaml @@ -0,0 +1,19 @@ +# Copyright 2023 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +spec: + requirements: + services: + - compute.googleapis.com diff --git a/modules/compute/resource-policy/resource-policy/outputs.tf b/modules/compute/resource-policy/resource-policy/outputs.tf new file mode 100644 index 0000000000..78872433d8 --- /dev/null +++ b/modules/compute/resource-policy/resource-policy/outputs.tf @@ -0,0 +1,28 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +output "placement_policy" { + description = <<-EOT + Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy. + It is assumed that the specified policy exists. 
To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement. + Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. + EOT + + value = { + type = var.group_placement_max_distance > 0 ? "COMPACT" : null + name = var.group_placement_max_distance > 0 ? var.name : null + } +} diff --git a/modules/compute/resource-policy/resource-policy/variables.tf b/modules/compute/resource-policy/resource-policy/variables.tf new file mode 100644 index 0000000000..b2841394d1 --- /dev/null +++ b/modules/compute/resource-policy/resource-policy/variables.tf @@ -0,0 +1,40 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + description = "The project ID for the resource policy." + type = string +} + +variable "region" { + description = "The region for the the resource policy." + type = string +} + +variable "name" { + description = "The resource policy's name." + type = string +} + +variable "group_placement_max_distance" { + description = <<-EOT + The max distance for group placement policy to use for the node pool's nodes. If set it will add a compact group placement policy. + Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. + EOT + + type = number + default = 0 +} diff --git a/modules/compute/resource-policy/resource-policy/versions.tf b/modules/compute/resource-policy/resource-policy/versions.tf new file mode 100644 index 0000000000..4b7b6158c9 --- /dev/null +++ b/modules/compute/resource-policy/resource-policy/versions.tf @@ -0,0 +1,30 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +terraform { + required_providers { + google-beta = { + source = "hashicorp/google-beta" + version = "~> 5.0" + } + } + + provider_meta "google" { + module_name = "blueprints/terraform/hpc-toolkit:resource-policy/v1.37.2" + } + + required_version = ">= 1.3" +} From 688e8ee697afdb21189cf5fdcab7175d6596e8a0 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 24 Sep 2024 00:22:33 +0000 Subject: [PATCH 013/102] fix doc br tags --- modules/compute/resource-policy/resource-policy/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/compute/resource-policy/resource-policy/README.md b/modules/compute/resource-policy/resource-policy/README.md index 7f558f9696..1a4bf79823 100644 --- a/modules/compute/resource-policy/resource-policy/README.md +++ b/modules/compute/resource-policy/resource-policy/README.md @@ -46,7 +46,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [group\_placement\_max\_distance](#input\_group\_placement\_max\_distance) | The max distance for group placement policy to use for the node pool's nodes. If set it will add a compact group placement policy.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. | `number` | `0` | no | +| [group\_placement\_max\_distance](#input\_group\_placement\_max\_distance) | The max distance for group placement policy to use for the node pool's nodes. If set it will add a compact group placement policy.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. | `number` | `0` | no | | [name](#input\_name) | The resource policy's name. | `string` | n/a | yes | | [project\_id](#input\_project\_id) | The project ID for the resource policy. | `string` | n/a | yes | | [region](#input\_region) | The region for the the resource policy. | `string` | n/a | yes | @@ -55,5 +55,5 @@ No modules. | Name | Description | |------|-------------| -| [placement\_policy](#output\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. | +| [placement\_policy](#output\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. | From 7759b51e97171c6d555466ac5116056b857f873b Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Thu, 26 Sep 2024 14:05:55 +0000 Subject: [PATCH 014/102] module doc updated --- .../resource-policy/resource-policy/README.md | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/modules/compute/resource-policy/resource-policy/README.md b/modules/compute/resource-policy/resource-policy/README.md index 1a4bf79823..e1c5222adb 100644 --- a/modules/compute/resource-policy/resource-policy/README.md +++ b/modules/compute/resource-policy/resource-policy/README.md @@ -1,8 +1,28 @@ ## Description -This modules create a resource policy for compute engines. This policy can be passed to a gke-node-pool module to apply the policy on the node-pool's nodes. +This modules create a [resource policy for compute engines](https://cloud.google.com/compute/docs/instances/placement-policies-overview). This policy can be passed to a gke-node-pool module to apply the policy on the node-pool's nodes. + +Note: By default, you can't apply compact placement policies with a max distance value to A3 VMs. To request access to this feature, contact your [Technical Account Manager (TAM)](https://cloud.google.com/tam) or the [Sales team](https://cloud.google.com/contact). + +### Example + +The following example creates a group placement resource policy and applies it to a gke-node-pool. + +```yaml + - id: group_placement_1 + source: modules/compute/resource-policy + settings: + name: gp-np-1 + group_placement_max_distance: 2 + + - id: node_pool_1 + source: modules/compute/gke-node-pool + use: [group_placement_1] + settings: + machine_type: e2-standard-8 + outputs: [instructions] +``` -Note: By default, you can't apply compact placement policies with a max distance value to A3 VMs. To request access to this feature, contact your Technical Account Manager (TAM) or the Sales team. Copyright 2024 Google LLC From a6701560a7efac5797c933714ffd3ee63c5aca69 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Thu, 26 Sep 2024 14:08:15 +0000 Subject: [PATCH 015/102] minor doc updated --- modules/compute/resource-policy/resource-policy/README.md | 2 +- modules/compute/resource-policy/resource-policy/outputs.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/compute/resource-policy/resource-policy/README.md b/modules/compute/resource-policy/resource-policy/README.md index e1c5222adb..d01fc6f944 100644 --- a/modules/compute/resource-policy/resource-policy/README.md +++ b/modules/compute/resource-policy/resource-policy/README.md @@ -75,5 +75,5 @@ No modules. | Name | Description | |------|-------------| -| [placement\_policy](#output\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
+| [placement\_policy](#output\_placement\_policy) | Group placement policy to use for placing VMs or GKE nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. | diff --git a/modules/compute/resource-policy/resource-policy/outputs.tf b/modules/compute/resource-policy/resource-policy/outputs.tf index 78872433d8..64e4275c91 100644 --- a/modules/compute/resource-policy/resource-policy/outputs.tf +++ b/modules/compute/resource-policy/resource-policy/outputs.tf @@ -16,7 +16,7 @@ output "placement_policy" { description = <<-EOT - Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy. + Group placement policy to use for placing VMs or GKE nodes placement. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy. It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement. Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. EOT From a1836bbb957880ee48daadab2f2f4084489bd881 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 27 Sep 2024 11:21:11 +0000 Subject: [PATCH 016/102] style fix --- modules/compute/resource-policy/resource-policy/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/compute/resource-policy/resource-policy/README.md b/modules/compute/resource-policy/resource-policy/README.md index d01fc6f944..f3f00e3437 100644 --- a/modules/compute/resource-policy/resource-policy/README.md +++ b/modules/compute/resource-policy/resource-policy/README.md @@ -23,7 +23,6 @@ The following example creates a group placement resource policy and applies it t outputs: [instructions] ``` - Copyright 2024 Google LLC From 85e3ce834297860abf0faeba0231e3a27e4cd2d0 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 27 Sep 2024 11:52:11 +0000 Subject: [PATCH 017/102] Added compatibility check for GPUDirect and GKE version --- modules/compute/gke-node-pool/README.md | 1 + modules/compute/gke-node-pool/gpu_direct.tf | 28 +++++++++++++++++++ modules/compute/gke-node-pool/variables.tf | 5 ++++ modules/scheduler/gke-cluster/README.md | 1 + modules/scheduler/gke-cluster/outputs.tf | 5 ++++ .../pre-existing-gke-cluster/README.md | 1 + .../pre-existing-gke-cluster/outputs.tf | 5 ++++ 7 files changed, 46 insertions(+) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index fcf7414af6..7b1cffbf68 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -294,6 +294,7 @@ limitations under the License. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. 
| `bool` | `true` | no | +| [gke\_master\_version](#input\_gke\_master\_version) | GKE master version | `string` | n/a | yes | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index b22c353f69..4fef57e914 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -33,6 +33,12 @@ locals { updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml") rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9 min_additional_networks = 4 + min_gke_versions = { + "1.27" = "1.27.7-gke.1121000" + "1.28" = "1.28.8-gke.1095000" + "1.29" = "1.29.3-gke.1093000" + "1.30" = "1.30.2-gke.1023000" + } } "a3-megagpu-8g" = { # Manifest to be installed for enabling TCPXO on a3-megagpu-8g machines @@ -43,10 +49,25 @@ locals { updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 min_additional_networks = 8 + min_gke_versions = { + "1.28" = "1.28.9-gke.1250000" + "1.29" = "1.29.4-gke.1542000" + "1.30" = "1.30.4-gke.1129000" + } } } min_additional_networks = try(local.gpu_direct_settings[var.machine_type].min_additional_networks, 0) + + gke_version_regex = "(\\d+\\.\\d+)\\.(\\d+)-gke\\.(\\d+)" # GKE version format: 1.X.Y-gke.Z , regex output: ["1.X" , "Y", "Z"] + + gke_version_parts = regex(local.gke_version_regex, var.gke_master_version) + gke_version_major = local.gke_version_parts[0] + + min_gke_versions = try(local.gpu_direct_setting[var.machine_type].min_gke_versions, null) + min_version = try(contains(keys(local.min_gke_versions), local.gke_version_major), false) ? local.min_gke_versions[local.gke_version_major] : "1.0.0-gke.0" + min_version_parts = regex(local.gke_version_regex, local.min_version) + gke_gpudirect_compatible = local.gke_version_parts[1] > local.min_version_parts[1] || (local.gke_version_parts[1] == local.min_version_parts[1] && local.gke_version_parts[2] >= local.min_version_parts[2]) } check "gpu_direct_check_multi_vpc" { @@ -55,3 +76,10 @@ check "gpu_direct_check_multi_vpc" { error_message = "To achieve optimal performance for ${var.machine_type} machine, at least ${local.min_additional_networks} additional vpc is recommended. You could configure it in the blueprint through modules/network/multivpc with network_count set as ${local.min_additional_networks}" } } + +check "gke_master_version_requirements" { + assert { + condition = local.gke_gpudirect_compatible + error_message = "GPUDirect is not supported on GKE master version ${var.gke_master_version} for ${var.machine_type} machine. 
For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements" + } +} diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index ef1277744f..62160a2448 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -360,3 +360,8 @@ variable "initial_node_count" { type = number default = null } + +variable "gke_master_version" { + description = "GKE master version" + type = string +} diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 3a72e1149b..4548db2fc9 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -194,6 +194,7 @@ limitations under the License. |------|-------------| | [cluster\_id](#output\_cluster\_id) | An identifier for the resource with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster has been created. Needed by community/modules/scripts/kubernetes-operations. | +| [gke\_master\_version](#output\_gke\_master\_version) | GKE cluster's master version. | | [instructions](#output\_instructions) | Instructions on how to connect to the created cluster. | | [k8s\_service\_account\_name](#output\_k8s\_service\_account\_name) | Name of k8s service account. | diff --git a/modules/scheduler/gke-cluster/outputs.tf b/modules/scheduler/gke-cluster/outputs.tf index 53ee068ca2..4daed8ee25 100644 --- a/modules/scheduler/gke-cluster/outputs.tf +++ b/modules/scheduler/gke-cluster/outputs.tf @@ -74,3 +74,8 @@ output "k8s_service_account_name" { description = "Name of k8s service account." value = one(module.workload_identity[*].k8s_service_account_name) } + +output "gke_master_version" { + description = "GKE cluster's master version." + value = google_container_cluster.gke_cluster.master_version +} diff --git a/modules/scheduler/pre-existing-gke-cluster/README.md b/modules/scheduler/pre-existing-gke-cluster/README.md index 519715480d..1f2904d889 100644 --- a/modules/scheduler/pre-existing-gke-cluster/README.md +++ b/modules/scheduler/pre-existing-gke-cluster/README.md @@ -111,4 +111,5 @@ limitations under the License. |------|-------------| | [cluster\_id](#output\_cluster\_id) | An identifier for the gke cluster with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster exists. | +| [gke\_master\_version](#output\_gke\_master\_version) | GKE cluster's master version. | diff --git a/modules/scheduler/pre-existing-gke-cluster/outputs.tf b/modules/scheduler/pre-existing-gke-cluster/outputs.tf index 9bfd571b61..90772d3dae 100644 --- a/modules/scheduler/pre-existing-gke-cluster/outputs.tf +++ b/modules/scheduler/pre-existing-gke-cluster/outputs.tf @@ -26,3 +26,8 @@ output "gke_cluster_exists" { data.google_container_cluster.existing_gke_cluster ] } + +output "gke_master_version" { + description = "GKE cluster's master version." 
+ value = data.google_container_cluster.existing_gke_cluster.master_version +} From 2709bde8b252959f1768e6074c9f1aca2fd68a97 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 27 Sep 2024 13:29:52 +0000 Subject: [PATCH 018/102] gke-topology-scheduler module implemented --- .../compute/gke-topology-scheduler/README.md | 54 ++ .../compute/gke-topology-scheduler/main.tf | 27 + .../manifests/label-nodes-daemon.yaml | 49 ++ .../manifests/schedule-daemon.yaml | 48 ++ .../manifests/service-account.yaml | 47 ++ .../manifests/topology-scheduler-scripts.yaml | 546 ++++++++++++++++++ .../gke-topology-scheduler/variables.tf | 23 + .../gke-topology-scheduler/versions.tf | 21 + 8 files changed, 815 insertions(+) create mode 100644 community/modules/compute/gke-topology-scheduler/README.md create mode 100644 community/modules/compute/gke-topology-scheduler/main.tf create mode 100644 community/modules/compute/gke-topology-scheduler/manifests/label-nodes-daemon.yaml create mode 100644 community/modules/compute/gke-topology-scheduler/manifests/schedule-daemon.yaml create mode 100644 community/modules/compute/gke-topology-scheduler/manifests/service-account.yaml create mode 100644 community/modules/compute/gke-topology-scheduler/manifests/topology-scheduler-scripts.yaml create mode 100644 community/modules/compute/gke-topology-scheduler/variables.tf create mode 100644 community/modules/compute/gke-topology-scheduler/versions.tf diff --git a/community/modules/compute/gke-topology-scheduler/README.md b/community/modules/compute/gke-topology-scheduler/README.md new file mode 100644 index 0000000000..ad4ea32cbd --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/README.md @@ -0,0 +1,54 @@ +## Description + +This module enables topology on a Google Kubernetes Engine cluster. +This is implemented based on sources and instructions explained [here](https://github.com/GoogleCloudPlatform/container-engine-accelerators/tree/master/gpudirect-tcpxo/topology-scheduler). + +## Prerequisites + +For topology awareness to be enabled, a GKE node pool has to be created with +compact placement. Specifically, the `physical_host` attribute +[ref](https://cloud.google.com/compute/docs/instances/use-compact-placement-policies#verify-vm-location) +should be present for each GPU node in the cluster. + +### Example + +The following example installs topology scheduler on a GKE cluster. + +```yaml + - id: topology_aware_scheduler_install + source: community/modules/compute/gke-topology-scheduler + use: [gke_cluster] +``` + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | + +## Providers + +No providers. + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [kubectl\_apply](#module\_kubectl\_apply) | ../../../../modules/management/kubectl-apply | n/a | + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes | +| [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | + +## Outputs + +No outputs. 
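+
+### Example workload (illustrative)
+
+Pods opt in to topology-aware placement by carrying a scheduling gate whose
+name starts with the `gke.io/topology-aware-auto-` prefix that the bundled
+`schedule-daemon.py` watches for. The sketch below is a minimal, assumed
+example and not part of this module: the job name, container image, and GPU
+count are placeholders that should be adapted to the actual workload.
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: topology-test # placeholder name
+spec:
+  completionMode: Indexed # gives each pod the job-completion-index label used for ordering
+  completions: 2
+  parallelism: 2
+  template:
+    spec:
+      schedulingGates:
+      - name: gke.io/topology-aware-auto-topology-test # prefix watched by schedule-daemon.py
+      restartPolicy: Never
+      containers:
+      - name: worker
+        image: busybox # placeholder image
+        command: ["sh", "-c", "echo hello && sleep 30"]
+        resources:
+          limits:
+            nvidia.com/gpu: 8 # placeholder; match the GPUs per node in the pool
+```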
+ diff --git a/community/modules/compute/gke-topology-scheduler/main.tf b/community/modules/compute/gke-topology-scheduler/main.tf new file mode 100644 index 0000000000..677595632b --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/main.tf @@ -0,0 +1,27 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +module "kubectl_apply" { + source = "../../../../modules/management/kubectl-apply" + + cluster_id = var.cluster_id + project_id = var.project_id + + apply_manifests = [ + { source = "${path.module}/manifests/topology-scheduler-scripts.yaml" }, + { source = "${path.module}/manifests/service-account.yaml" }, + { source = "${path.module}/manifests/label-nodes-daemon.yaml" }, + { source = "${path.module}/manifests/schedule-daemon.yaml" } + ] +} diff --git a/community/modules/compute/gke-topology-scheduler/manifests/label-nodes-daemon.yaml b/community/modules/compute/gke-topology-scheduler/manifests/label-nodes-daemon.yaml new file mode 100644 index 0000000000..fe49c607a6 --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/manifests/label-nodes-daemon.yaml @@ -0,0 +1,49 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: label-nodes-daemon + namespace: kube-system +spec: + selector: + matchLabels: + name: label-nodes-daemon + template: + metadata: + labels: + name: label-nodes-daemon + spec: + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + hostNetwork: true + containers: + - name: label-nodes-daemon + image: python:3.9 + command: + - bash + - -c + - | + pip install kubernetes + python -u /scripts/label-nodes-daemon.py + volumeMounts: + - name: scripts-volume + mountPath: /scripts + volumes: + - name: scripts-volume + configMap: + name: topology-scheduler-scripts + serviceAccount: topology-scheduler diff --git a/community/modules/compute/gke-topology-scheduler/manifests/schedule-daemon.yaml b/community/modules/compute/gke-topology-scheduler/manifests/schedule-daemon.yaml new file mode 100644 index 0000000000..b412f936e9 --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/manifests/schedule-daemon.yaml @@ -0,0 +1,48 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: topology-scheduler + labels: + app: topology-scheduler + namespace: kube-system +spec: + replicas: 1 + selector: + matchLabels: + app: topology-scheduler + template: + metadata: + labels: + app: topology-scheduler + spec: + tolerations: + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + effect: "NoSchedule" + containers: + - name: topology-scheduler-container + image: python:3.9 + command: ["/bin/sh", "-c", "pip install google-auth google-api-python-client kubernetes; python /scripts/schedule-daemon.py --ignored-namespace kube-system gmp-public gmp-system"] + volumeMounts: + - name: scripts-volume + mountPath: /scripts + volumes: + - name: scripts-volume + configMap: + name: topology-scheduler-scripts + serviceAccount: topology-scheduler + restartPolicy: Always diff --git a/community/modules/compute/gke-topology-scheduler/manifests/service-account.yaml b/community/modules/compute/gke-topology-scheduler/manifests/service-account.yaml new file mode 100644 index 0000000000..61834ced8f --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/manifests/service-account.yaml @@ -0,0 +1,47 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: topology-scheduler + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: topology-scheduler +rules: +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "watch", "list", "update", "patch"] +- apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "watch", "list"] +- apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch", "update", "patch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: topology-scheduler +subjects: +- kind: ServiceAccount + name: topology-scheduler + namespace: kube-system +roleRef: + kind: ClusterRole + name: topology-scheduler + apiGroup: rbac.authorization.k8s.io diff --git a/community/modules/compute/gke-topology-scheduler/manifests/topology-scheduler-scripts.yaml b/community/modules/compute/gke-topology-scheduler/manifests/topology-scheduler-scripts.yaml new file mode 100644 index 0000000000..96b4a89b34 --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/manifests/topology-scheduler-scripts.yaml @@ -0,0 +1,546 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: topology-scheduler-scripts + namespace: kube-system +data: + schedule-daemon.py: | + #!/usr/bin/env python + + # Copyright 2024 Google Inc. 
All Rights Reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + + import argparse + from itertools import groupby + import time + import kubernetes + import kubernetes.client + from kubernetes.client.rest import ApiException + from kubernetes.utils.quantity import parse_quantity + + + def split_pods_based_on_jobs(pods): + """Splits pending pods into groups based on jobs.""" + return [ + list(job_group) + for _, job_group in groupby(pods, lambda pod: pod.get('job_name')) + ] + + + def sort_jobs_by_time(job): + """Return the key to be used for sorting jobs which is by creation time.""" + # All the pods in the job should have the same creation time. + return job[0].get('creation_time') + + + def pod_sorting_key(pod): + """Returns key to be used for sorting pods. + Given that numbers is often suffixed for multi-node deployments, + here we use a (prefix, number) tuple for the sorting key. + This means "xxx-pod2" should appear before "xxx-pod10" + """ + + if pod['index'] is not None: + return int(pod['index']) + + # if the suffix is a number, extract it + idx = 0 + suffix = "" + name = pod['name'] + while name[-1 - len(suffix)].isdigit(): + suffix = name[-1 - len(suffix)] + suffix + + if suffix != "": + idx = int(suffix) + + return (name[:len(name) - len(suffix)], idx) + + + def node_topology_distance(node1, node2): + node1_key = node_topology_key(node1) + node2_key = node_topology_key(node2) + result = 1000000 + for i in range(len(node1_key)): + if node1_key[i] != node2_key[i]: + return result + result /= 100 + return 0 + + + def node_topology_key(node): + """Builds a key to be used to sort nodes.""" + node_labels = node['node_labels'] + + if ( + 'cloud.google.com/gke-placement-group' in node_labels + and 'topology.gke.io/cluster' in node_labels + and 'topology.gke.io/rack' in node_labels + and 'topology.gke.io/host' in node_labels + ): + return ( + node_labels['cloud.google.com/gke-placement-group'], + node_labels['topology.gke.io/cluster'], + node_labels['topology.gke.io/rack'], + node_labels['topology.gke.io/host'], + ) + + return () + + + def get_pod_used_resources(pod): + """Get the resources used by this pod""" + used_cpu = 0 + used_memory = 0 + used_gpu = 0 + if pod.status is None or pod.status.container_statuses is None: + return used_cpu, used_memory, used_gpu + for container, container_status in zip(pod.spec.containers, pod.status.container_statuses): + if container_status.state.terminated is not None: + # terminated pods don't use resources + continue + requests = container.resources.requests or {} + used_cpu += parse_quantity(requests.get('cpu', 0)) + used_memory += parse_quantity(requests.get('memory', 0)) + used_gpu += int(requests.get('nvidia.com/gpu', 0)) + return used_cpu, used_memory, used_gpu + + + def get_pods_taint_toleration(pods): + """Get the taint tolerations of the pods. + For simplicity, we assume that the pods are homogeneous and + all have the same tolerations. 
+ """ + ts = None + for pod in pods: + tolerations = pod['spec'].tolerations + if ts is None: + ts = tolerations + else: + assert(ts == tolerations) + return ts if ts is not None else [] + + + def find_schedulable_nodes(nodes, pods, tolerated_taints): + """Finds nodes that can be scheduled.""" + nodes_info = {} + + if tolerated_taints is not None: + tolerated_taint_dict = {t.key: t for t in tolerated_taints} + else: + tolerated_taint_dict = {} + + for node in nodes: + node_name = node.metadata.name + node_labels = node.metadata.labels + + if 'cloud.google.com/gke-placement-group' not in node_labels: + print( + f'Skipping node {node_name} because it does not have topology' + ' metadata' + ) + continue + + skip_node = False + if node.spec.taints is not None: + for t in node.spec.taints: + if t.key not in tolerated_taint_dict: + print(f'Skipping node {node_name} because it is tainted with key {t.key}') + skip_node = True + break + else: + tol = tolerated_taint_dict[t.key] + if tol.operator == "Equal" and tol.value != t.value: + skip_node = True + break + + if skip_node: + continue + + allocatable = node.status.allocatable + + used_cpu = 0 + used_memory = 0 + used_gpu = 0 + + for pod in pods: + if pod.spec.node_name == node_name: + cpu, mem, gpu = get_pod_used_resources(pod) + used_cpu += cpu + used_memory += mem + used_gpu += gpu + + free_cpu = parse_quantity(allocatable['cpu']) - used_cpu + free_memory = parse_quantity(allocatable['memory']) - used_memory + free_gpu = int(allocatable.get('nvidia.com/gpu', 0)) - used_gpu + + node_info = { + 'name': node_name, + 'cpu': free_cpu, + 'memory': free_memory, + 'gpu': free_gpu, + 'node_labels': node_labels, + } + nodes_info[node_name] = node_info + + print( + f'Node: {node_name}, CPU: {free_cpu}, Memory: {free_memory}, GPU:' + f' {free_gpu}, Topology: {node_topology_key(node_info)}' + ) + + return nodes_info + + + def find_pod_gates(pods, prefix): + """Finds pods with scheduling gates that starts with the prefix""" + s = set() + for pod in pods: + if pod.spec.scheduling_gates: + for g in pod.spec.scheduling_gates: + if g.name.startswith(prefix): + s.add(g.name) + return s + + + def find_schedulable_pods(pods, gate_name): + """Finds pods that can be scheduled.""" + pods_to_schedule = {} + + for pod in pods: + if pod.spec.scheduling_gates: + gates = pod.spec.scheduling_gates + for gate in gates: + if gate.name == gate_name: + pod_name = pod.metadata.name + pod_namespace = pod.metadata.namespace + + pod_index = None + job_name = None + if pod.metadata.labels is not None: + if ( + 'batch.kubernetes.io/job-completion-index' + in pod.metadata.labels + ): + pod_index = pod.metadata.labels[ + 'batch.kubernetes.io/job-completion-index' + ] + else: + print('Unable to find index in metadata. Can not queue jobs') + + if 'job-name' in pod.metadata.labels: + job_name = pod.metadata.labels['job-name'] + else: + print('Unable to find job_name in metadata. Can not queue jobs') + else: + print('No labels on pod to extract job metadata from.') + + creation_time = None + if pod.metadata.creation_timestamp is not None: + creation_time = pod.metadata.creation_timestamp + else: + print( + 'Unable to find creation_time in metadata. 
Can not queue jobs' + ) + + used_cpu = 0 + used_memory = 0 + used_gpu = 0 + + for container in pod.spec.containers: + requests = container.resources.requests or {} + used_cpu += parse_quantity(requests.get('cpu', 0)) + used_memory += parse_quantity(requests.get('memory', 0)) + used_gpu += int(requests.get('nvidia.com/gpu', 0)) + + pods_to_schedule[pod_name] = { + 'name': pod_name, + 'namespace': pod_namespace, + 'index': pod_index, + 'cpu': used_cpu, + 'memory': used_memory, + 'gpu': used_gpu, + 'node_selector': pod.spec.node_selector, + 'spec': pod.spec, + 'metadata': pod.metadata, + 'job_name': job_name, + 'creation_time': creation_time + } + + print( + f'Found schedulable pod: {pod_namespace}/{pod_name}, CPU:' + f' {used_cpu}, Memory: {used_memory}, GPU: {used_gpu}' + f' Index: {pod_index}' + ) + + return pods_to_schedule + + + def can_schedule(node, pod): + """Checks if a given pod can be scheduled on a given node.""" + node_selector = pod['node_selector'] + node_labels = node['node_labels'] + + if node_selector: + for key, value in node_selector.items(): + if key not in node_labels or node_labels[key] != value: + return False + + return ( + node['cpu'] >= pod['cpu'] + and node['memory'] >= pod['memory'] + and node['gpu'] >= pod['gpu'] + ) + + + def schedule_pod_on_node(v1, pod_name, pod_namespace, node_name, gate_name): + """Schedules a pod on a given node.""" + try: + pod = v1.read_namespaced_pod(pod_name, pod_namespace) + + if any(gate.name == gate_name for gate in pod.spec.scheduling_gates): + new_gates = [ + gate for gate in pod.spec.scheduling_gates if gate.name != gate_name + ] + pod.spec.affinity = { + 'nodeAffinity': { + 'requiredDuringSchedulingIgnoredDuringExecution': { + 'nodeSelectorTerms': [{ + 'matchExpressions': [{ + 'key': 'kubernetes.io/hostname', + 'operator': 'In', + 'values': [node_name], + }] + }] + } + } + } + pod.spec.scheduling_gates = new_gates + + v1.replace_namespaced_pod(pod_name, pod_namespace, pod) + + print(f'Pod {pod_namespace}/{pod_name} scheduled on {node_name}') + except ApiException as e: + print(f'Exception when removing scheduling gate: {e}') + + + def calculate_pods_assignment(sorted_nodes, sorted_pods): + """Calculates the best assignment for pods.""" + assignment = [-i for i in reversed(range(1, len(sorted_pods) + 1))] + best_assignment = [] + minimum_distance = 1000000000 + + while True: + all_ok = True + i = len(assignment) - 1 + while i >= 0 and all_ok: + assignment[i] += 1 + if assignment[i] == len(sorted_nodes): + break + if assignment[i] >= 0 and can_schedule( + sorted_nodes[assignment[i]], sorted_pods[i] + ): + i -= 1 + elif i < len(assignment) - 1 and assignment[i] == assignment[i + 1] - 1: + all_ok = False + if assignment[-1] == len(sorted_nodes): + break + if all_ok: + new_distance = 0 + for i in range(1, len(sorted_pods)): + new_distance += node_topology_distance( + sorted_nodes[assignment[i]], sorted_nodes[assignment[i - 1]] + ) + if new_distance < minimum_distance: + best_assignment = assignment.copy() + minimum_distance = new_distance + + return best_assignment + + + def schedule_pod_with_gate(v1, pods, gate): + pods_to_schedule = find_schedulable_pods(pods, gate) + + nodes = v1.list_node().items + print(f'Pods to schedule: {len(pods_to_schedule)}') + jobs = split_pods_based_on_jobs(pods_to_schedule.values()) + sorted_jobs = sorted(jobs, key=sort_jobs_by_time) + for job in sorted_jobs: + job_name = job[0].get('job_name') + creation_time = job[0].get('creation_time') + print(f'Attempting to schedule job: {job_name} created: 
{creation_time}') + + tolerated_taints = get_pods_taint_toleration(job) + nodes_to_schedule = find_schedulable_nodes(nodes, pods, tolerated_taints) + + sorted_pods = sorted(job, key=pod_sorting_key) + sorted_nodes = sorted(nodes_to_schedule.values(), key=node_topology_key) + + print(f'Nodes to schedule: {len(nodes_to_schedule)}') + + best_assignment = calculate_pods_assignment(sorted_nodes, sorted_pods) + + if not best_assignment: + print( + f'No scheduling for job: {job_name} with gate {gate} has been found.' + ' Skipping job.' + ) + continue + else: + print(f'Assignment found, scheduling {job_name} with {len(jobs)} pods.') + + for i in range(0, len(sorted_pods)): + pod = sorted_pods[i] + node = sorted_nodes[best_assignment[i]] + schedule_pod_on_node( + v1, pod['name'], pod['namespace'], node['name'], gate + ) + + + def run_scheduling_loop(): + """Runs scheduling.""" + parser = argparse.ArgumentParser( + prog='schedule-workload.py') + + parser.add_argument( + '-g', '--gate', + default='gke.io/topology-aware-auto-') # prefix of the schedule gate + parser.add_argument( + '-i', '--interval', + default=1.0) # intervals (in seconds) between scheduling + parser.add_argument( + '--ignored-namespace', + nargs='*', + default=[]) # namespace to search for pods + args = parser.parse_args() + + try: + kubernetes.config.load_incluster_config() + except kubernetes.config.ConfigException: + kubernetes.config.load_kube_config() + v1 = kubernetes.client.CoreV1Api() + + def list_pods(): + # filtering of namespace is not cached as namespaces could be + # created and deleted + namespaces = v1.list_namespace().items + filtered_namespace_names = [] + for n in namespaces: + if n.metadata.name not in args.ignored_namespace: + filtered_namespace_names.append(n.metadata.name) + pods = [] + for n in filtered_namespace_names: + pods += v1.list_namespaced_pod(n).items + return pods + + try: + t0 = time.time() + while True: + interval = time.time() - t0 + if interval < args.interval: + time.sleep(args.interval - interval) + t0 = time.time() + + pods = list_pods() + + gates = find_pod_gates(pods, args.gate) + print(f"Found {len(pods)} pods and {len(gates)} gates") + + if len(gates) == 0: + # No pods to be scheduled + continue + + # sleep for one seconds, assuming that all pods within one group would be + # all visible by then + time.sleep(5.0) + + for g in gates: + print(f"scheduling pods with gate {g}") + # query the pods again after the sleep, just in case not all gated pods + # are returned from previous query + pods = list_pods() + schedule_pod_with_gate(v1, pods, g) + + except ApiException as e: + print(f'Exception when listing Kubernetes nodes or pods: {e}') + + + if __name__ == '__main__': + run_scheduling_loop() + label-nodes-daemon.py: | + #!/usr/bin/env python + + # Copyright 2024 Google Inc. All Rights Reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. 
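+
+    # Summary of this script: each pod of the label-nodes-daemon DaemonSet reads
+    # the node name and the `physical_host` attribute from the GCE metadata
+    # server, splits the attribute into cluster/rack/host, and patches the
+    # topology.gke.io/cluster, topology.gke.io/rack and topology.gke.io/host
+    # labels onto its node every 600 seconds. schedule-daemon.py uses these
+    # labels to sort nodes by physical proximity when placing gated pods.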
+ + import time + + from kubernetes import client + from kubernetes import config + import requests + + + def update_node_labels(kube): + """Updates Kubernetes node labels based on GCE VM metadata.""" + node_name_url = "http://metadata.google.internal/computeMetadata/v1/instance/name" + metadata_url = "http://metadata.google.internal/computeMetadata/v1/instance/attributes/physical_host" + headers = {"Metadata-Flavor": "Google"} + + response = requests.get(node_name_url, headers=headers) + + if response.status_code == 200: + node_name = response.text + else: + print("Node name not found") + return + + response = requests.get(metadata_url, headers=headers) + + if response.status_code == 200: + physical_host = response.text + else: + print("physical host not found") + return + + cluster, rack, host = physical_host.split("/")[1:] + + node_labels = { + "topology.gke.io/cluster": cluster, + "topology.gke.io/rack": rack, + "topology.gke.io/host": host, + } + + kube.patch_node(node_name, {"metadata": {"labels": node_labels}}) + print(f"Updated labels on node {node_name}: {node_labels}") + + + if __name__ == "__main__": + # Kubernetes configuration + config.load_incluster_config() + kube = client.CoreV1Api() + + while True: + print("Starting node update") + # Update node labels + update_node_labels(kube) + time.sleep(600) diff --git a/community/modules/compute/gke-topology-scheduler/variables.tf b/community/modules/compute/gke-topology-scheduler/variables.tf new file mode 100644 index 0000000000..0766091223 --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/variables.tf @@ -0,0 +1,23 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "project_id" { + description = "The project ID to host the cluster in." + type = string +} + +variable "cluster_id" { + description = "projects/{{project}}/locations/{{location}}/clusters/{{cluster}}" + type = string +} diff --git a/community/modules/compute/gke-topology-scheduler/versions.tf b/community/modules/compute/gke-topology-scheduler/versions.tf new file mode 100644 index 0000000000..6c94438518 --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/versions.tf @@ -0,0 +1,21 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +terraform { + required_version = ">= 1.3" + + provider_meta "google" { + module_name = "blueprints/terraform/hpc-toolkit:gke-topology-scheduler/v1.39.0" + } +} From 7595cb0232c75461df2416f58e3ffa9ec07d7dc4 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 27 Sep 2024 14:08:26 +0000 Subject: [PATCH 019/102] metadata file added --- .../compute/gke-topology-scheduler/README.md | 2 +- .../gke-topology-scheduler/metadata.yaml | 19 +++++++++++++++++++ .../gke-topology-scheduler/versions.tf | 4 ---- 3 files changed, 20 insertions(+), 5 deletions(-) create mode 100644 community/modules/compute/gke-topology-scheduler/metadata.yaml diff --git a/community/modules/compute/gke-topology-scheduler/README.md b/community/modules/compute/gke-topology-scheduler/README.md index ad4ea32cbd..5aaa4fca98 100644 --- a/community/modules/compute/gke-topology-scheduler/README.md +++ b/community/modules/compute/gke-topology-scheduler/README.md @@ -15,7 +15,7 @@ should be present for each GPU node in the cluster. The following example installs topology scheduler on a GKE cluster. ```yaml - - id: topology_aware_scheduler_install +- id: topology_aware_scheduler_install source: community/modules/compute/gke-topology-scheduler use: [gke_cluster] ``` diff --git a/community/modules/compute/gke-topology-scheduler/metadata.yaml b/community/modules/compute/gke-topology-scheduler/metadata.yaml new file mode 100644 index 0000000000..17bedb471b --- /dev/null +++ b/community/modules/compute/gke-topology-scheduler/metadata.yaml @@ -0,0 +1,19 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +spec: + requirements: + services: + - container.googleapis.com diff --git a/community/modules/compute/gke-topology-scheduler/versions.tf b/community/modules/compute/gke-topology-scheduler/versions.tf index 6c94438518..adcbea8ca2 100644 --- a/community/modules/compute/gke-topology-scheduler/versions.tf +++ b/community/modules/compute/gke-topology-scheduler/versions.tf @@ -14,8 +14,4 @@ terraform { required_version = ">= 1.3" - - provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-topology-scheduler/v1.39.0" - } } From 6ee205bf48ec5a23c62af2349ee15818e2b22046 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Mon, 30 Sep 2024 17:16:28 +0000 Subject: [PATCH 020/102] Ensure enough open files limit for Parallelstore --- modules/file-system/parallelstore/scripts/mount-daos.sh | 3 +++ .../pre-existing-network-storage/scripts/mount-daos.sh | 3 +++ 2 files changed, 6 insertions(+) diff --git a/modules/file-system/parallelstore/scripts/mount-daos.sh b/modules/file-system/parallelstore/scripts/mount-daos.sh index e2500c93a5..2b09f2e6d4 100644 --- a/modules/file-system/parallelstore/scripts/mount-daos.sh +++ b/modules/file-system/parallelstore/scripts/mount-daos.sh @@ -65,6 +65,9 @@ chmod 777 "$local_mount" fuse_config=/etc/fuse.conf sed -i "s/#.*user_allow_other/user_allow_other/g" $fuse_config +# make sure limit of open files is high enough for dfuse (1M of open files) +ulimit -n 1048576 + for i in {1..10}; do # To parse mount_options as --disable-wb-cache --eq-count=8. # shellcheck disable=SC2086 diff --git a/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh b/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh index e2500c93a5..2b09f2e6d4 100644 --- a/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh +++ b/modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh @@ -65,6 +65,9 @@ chmod 777 "$local_mount" fuse_config=/etc/fuse.conf sed -i "s/#.*user_allow_other/user_allow_other/g" $fuse_config +# make sure limit of open files is high enough for dfuse (1M of open files) +ulimit -n 1048576 + for i in {1..10}; do # To parse mount_options as --disable-wb-cache --eq-count=8. # shellcheck disable=SC2086 From ed78494d00bfd1561ef25c45a07ae8bdb0b6442a Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Mon, 30 Sep 2024 19:13:42 +0000 Subject: [PATCH 021/102] inlude nccl test instruction in output for sample workload --- modules/compute/gke-node-pool/README.md | 22 +++++++++++----------- modules/compute/gke-node-pool/outputs.tf | 8 ++++++++ 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index fcf7414af6..78c4dd1dd7 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -284,7 +284,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | @@ -294,26 +294,26 @@ limitations under the License. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | -| [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | +| [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | +| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | -| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | -| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | +| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | +| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | | [total\_max\_nodes](#input\_total\_max\_nodes) | DEPRECATED: Use autoscaling\_total\_max\_nodes. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/outputs.tf b/modules/compute/gke-node-pool/outputs.tf index 8be6a2772a..58216e957f 100644 --- a/modules/compute/gke-node-pool/outputs.tf +++ b/modules/compute/gke-node-pool/outputs.tf @@ -80,6 +80,14 @@ locals { You can use the following commands to submit the sample job: kubectl create -f ${abspath(local.gpu_direct_setting.updated_workload_path)} + After submitting the sample job, you can validate the GPU performance by initiating NCCL test included in the sample workload: + NCCL test can be initiated from any one of the sample job Pods and coordinate with the peer Pods: + export POD_NAME=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}' | head -n 1) + export PEER_POD_IPS=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.status.podIP}}{{" "}}{{end}}') + kubectl exec --stdin --tty --container=nccl-test $POD_NAME -- /scripts/allgather.sh $PEER_POD_IPS + Depends on the Msg size used for transmission in the test, the busbw would different a bit. + For a3-highgpu machines, the expected busbw for MsgSize of 8G data should be around 80 GB/s + For a3-megagpu machines, the expected busbw for MsgSize of 8G data should be around 160 GB/s If you would like to enable GPUDirect for your own workload, please follow the below steps: export WORKLOAD_PATH=<> From fcd6eb3348fb6dde411dd3a080eff9bf5e28eb54 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 30 Sep 2024 17:29:58 -0500 Subject: [PATCH 022/102] Move a3-megagpu-8g tests to us-west4-a due to available capacity for testing --- .../machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml | 1 + tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm.yaml | 4 ++-- .../daily-tests/tests/ml-a3-megagpu-slurm-cluster.yml | 3 +-- .../daily-tests/tests/ml-a3-megagpu-slurm-image.yml | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml index 5b39c04792..8d46b10c40 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml @@ -20,6 +20,7 @@ blueprint_name: a3mega-cluster vars: deployment_name: a3mega-cluster a3mega_partition_name: a3mega + a3mega_maintenance_interval: "" enable_placement: false remote_mount_homefs: /nfsshare local_mount_homefs: /home diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm.yaml index e066cbff27..f24fb0ffd5 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm.yaml @@ -70,8 +70,8 @@ steps: cat /persistent_volume/image_name | xargs -L1 gcloud compute images delete --project "${PROJECT_ID}" --quiet } - REGION=australia-southeast1 - ZONE=australia-southeast1-c + REGION=us-west4 + ZONE=us-west4-a trap 'destroy_on_exit' EXIT ./gcluster deploy \ diff --git 
a/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-cluster.yml b/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-cluster.yml index 00cfcf0c76..ef60518e9f 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-cluster.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-cluster.yml @@ -46,6 +46,5 @@ cli_deployment_vars: a3mega_cluster_size: 2 enable_ops_agent: "true" enable_nvidia_dcgm: "true" - a3mega_reservation_name: a3mega-reservation-australia-southeast1-c - a3mega_maintenance_interval: PERIODIC + a3mega_reservation_name: a3mega-reservation-0 final_image_family: "{{ final_image_family }}" diff --git a/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-image.yml b/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-image.yml index 4c5c9175e5..d4c4e31c18 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-image.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-image.yml @@ -24,8 +24,8 @@ delete_image: false cli_deployment_vars: network_name_system: default subnetwork_name_system: default - region: us-west1 - zone: us-west1-a + region: us-west4 + zone: us-west4-a enable_ops_agent: true enable_nvidia_dcgm: true slurm_cluster_name: a3mc{{ build[0:4] }} From ab41e09cba54d3055a29a354629a3a594d582e4b Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 28 Sep 2024 00:58:09 +0000 Subject: [PATCH 023/102] Add clean up for TPUs --- .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/cleanup_tpu/README.md | 79 +++++++++++++++++++ .../modules/cleanup_tpu/main.tf | 32 ++++++++ .../cleanup_tpu/scripts/cleanup_tpu.sh | 63 +++++++++++++++ .../modules/cleanup_tpu/variables.tf | 60 ++++++++++++++ .../modules/cleanup_tpu/versions.tf | 27 +++++++ .../partition.tf | 15 ++-- 7 files changed, 272 insertions(+), 6 deletions(-) create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/README.md create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/main.tf create mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/scripts/cleanup_tpu.sh create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/variables.tf create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/versions.tf diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index b9cb9d6d95..40d24732a6 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -237,7 +237,7 @@ limitations under the License. 
| [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | | [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | -| [nodeset\_tpu\_cleanup](#module\_nodeset\_tpu\_cleanup) | ./modules/cleanup_compute | n/a | +| [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | | [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.7.0 | | [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.7.0 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/README.md new file mode 100644 index 0000000000..61a08c700f --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/README.md @@ -0,0 +1,79 @@ +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [null](#requirement\_null) | >= 3.0 | + +## Providers + +| Name | Version | +|------|---------| +| [null](#provider\_null) | 3.2.3 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [null_resource.script](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of TPU nodes managed by this module, when cluster is destroyed.

*WARNING*: Toggling this off will impact the running workload.
Deployed TPU nodes will be destroyed. | `bool` | n/a | yes | +| [endpoint\_versions](#input\_endpoint\_versions) | Version of the API to use (The compute service is the only API currently supported) |
object({
compute = string
})
| n/a | yes | +| [gcloud\_path\_override](#input\_gcloud\_path\_override) | Directory of the gcloud executable to be used during cleanup | `string` | n/a | yes | +| [nodeset](#input\_nodeset) | Nodeset to cleanup |
object({
nodeset_name = string
zone = string
})
| n/a | yes | +| [project\_id](#input\_project\_id) | Project ID | `string` | n/a | yes | +| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Name of the Slurm cluster | `string` | n/a | yes | +| [universe\_domain](#input\_universe\_domain) | Domain address for alternate API universe | `string` | n/a | yes | + +## Outputs + +No outputs. + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [null](#requirement\_null) | >= 3.0 | + +## Providers + +| Name | Version | +|------|---------| +| [null](#provider\_null) | >= 3.0 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [null_resource.script](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of TPU nodes managed by this module, when cluster is destroyed.

*WARNING*: Toggling this off will impact the running workload.
Deployed TPU nodes will be destroyed. | `bool` | n/a | yes | +| [endpoint\_versions](#input\_endpoint\_versions) | Version of the API to use (The compute service is the only API currently supported) |
object({
compute = string
})
| n/a | yes | +| [gcloud\_path\_override](#input\_gcloud\_path\_override) | Directory of the gcloud executable to be used during cleanup | `string` | n/a | yes | +| [nodeset](#input\_nodeset) | Nodeset to cleanup |
object({
nodeset_name = string
zone = string
})
| n/a | yes | +| [project\_id](#input\_project\_id) | Project ID | `string` | n/a | yes | +| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Name of the Slurm cluster | `string` | n/a | yes | +| [universe\_domain](#input\_universe\_domain) | Domain address for alternate API universe | `string` | n/a | yes | + +## Outputs + +No outputs. + diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/main.tf new file mode 100644 index 0000000000..ec86a03a24 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/main.tf @@ -0,0 +1,32 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +resource "null_resource" "script" { + count = var.enable_cleanup_compute ? 1 : 0 + + triggers = { + project_id = var.project_id + cluster_name = var.slurm_cluster_name + nodeset_name = var.nodeset.nodeset_name + zone = var.nodeset.zone + universe_domain = var.universe_domain + compute_endpoint_version = var.endpoint_versions.compute + gcloud_path_override = var.gcloud_path_override + } + + provisioner "local-exec" { + command = "/bin/bash ${path.module}/scripts/cleanup_tpu.sh ${self.triggers.project_id} ${self.triggers.cluster_name} ${self.triggers.nodeset_name} ${self.triggers.zone} ${self.triggers.universe_domain} ${self.triggers.compute_endpoint_version} ${self.triggers.gcloud_path_override}" + when = destroy + } +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/scripts/cleanup_tpu.sh b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/scripts/cleanup_tpu.sh new file mode 100755 index 0000000000..c724e342c3 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/scripts/cleanup_tpu.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e -o pipefail + +project="$1" +cluster_name="$2" +nodeset_name="$3" +zone="$4" +universe_domain="$5" +compute_endpoint_version="$6" +gcloud_dir="$7" + +if [[ $# -ne 6 ]] && [[ $# -ne 7 ]]; then + echo "Usage: $0 []" + exit 1 +fi + +if [[ -n "${gcloud_dir}" ]]; then + export PATH="$gcloud_dir:$PATH" +fi + +export CLOUDSDK_API_ENDPOINT_OVERRIDES_COMPUTE="https://www.${universe_domain}/compute/${compute_endpoint_version}/" +export CLOUDSDK_CORE_PROJECT="${project}" + +if ! 
type -P gcloud 1>/dev/null; then + echo "gcloud is not available and your compute resources are not being cleaned up" + echo "https://console.cloud.google.com/compute/instances?project=${project}" + exit 1 +fi + +echo "Deleting TPU nodes" +node_filter="name~${cluster_name}-${nodeset_name}" +running_nodes_filter="${node_filter} AND state!=DELETING" + +# List all currently running nodes and attempt to delete them +gcloud compute tpus tpu-vm list --zone="${zone}" --format="value(name)" --filter="${running_nodes_filter}" | while read -r name; do + echo "Deleting TPU node: $name" + gcloud compute tpus tpu-vm delete --async --zone="${zone}" --quiet "${name}" || echo "Failed to delete $name" +done + +# Wait until nodes in DELETING state are deleted, before deleting the resource policies +deleting_nodes_filter="${node_filter} AND state=DELETING" +while true; do + node=$(gcloud compute tpus tpu-vm list --zone="${zone}" --format="value(name)" --filter="${deleting_nodes_filter}" --limit=1) + if [[ -z "${node}" ]]; then + break + fi + echo "Waiting for nodes to be deleted: ${node}" + sleep 5 +done diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/variables.tf new file mode 100644 index 0000000000..1ac6f64b75 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_tpu/variables.tf @@ -0,0 +1,60 @@ +/** + * Copyright (C) Google LLC. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +variable "project_id" { + type = string + description = "Project ID" +} + +variable "slurm_cluster_name" { + type = string + description = "Name of the Slurm cluster" +} + +variable "enable_cleanup_compute" { + description = < Date: Thu, 19 Sep 2024 20:14:43 +0000 Subject: [PATCH 024/102] Move to SlurmGCP image 6.7 --- community/examples/AMD/hpc-amd-slurm.yaml | 2 +- community/examples/hpc-slurm-ubuntu2004.yaml | 2 +- community/examples/hpc-slurm6-apptainer.yaml | 2 +- .../schedmd-slurm-gcp-v6-nodeset-dynamic/README.md | 4 ++-- .../schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf | 2 +- .../source_image_logic.tf | 8 ++++---- .../variables.tf | 2 +- .../schedmd-slurm-gcp-v6-nodeset-tpu/README.md | 2 +- .../schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset/README.md | 2 +- .../source_image_logic.tf | 8 ++++---- .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 14 +++++++------- .../schedmd-slurm-gcp-v6-controller/controller.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/partition.tf | 4 ++-- .../source_image_logic.tf | 8 ++++---- .../variables_controller_instance.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v6-login/README.md | 2 +- .../source_image_logic.tf | 8 ++++---- .../schedmd-slurm-gcp-v6-login/variables.tf | 2 +- examples/cae/cae-slurm.yaml | 2 +- examples/hpc-enterprise-slurm.yaml | 2 +- examples/hpc-slurm-static.yaml | 2 +- examples/image-builder.yaml | 2 +- .../a3-highgpu-8g/ml-slurm-a3-1-image.yaml | 2 +- .../a3-megagpu-8g/slurm-a3mega-image.yaml | 2 +- examples/ml-slurm.yaml | 2 +- .../daily-tests/blueprints/lustre-slurm.yaml | 4 ++-- .../daily-tests/tests/slurm-v6-debian.yml | 2 +- .../golden_copies/configs/versioned_blueprint.yaml | 2 +- .../.ghpc/artifacts/expanded_blueprint.yaml | 2 +- .../versioned_blueprint/primary/terraform.tfvars | 2 +- .../validate_configs/test_configs/node-groups.yaml | 6 +++--- 34 files changed, 59 insertions(+), 59 deletions(-) diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml index 5decf96a2d..282d5b7816 100644 --- a/community/examples/AMD/hpc-amd-slurm.yaml +++ b/community/examples/AMD/hpc-amd-slurm.yaml @@ -168,7 +168,7 @@ deployment_groups: # these images must match the images used by Slurm modules below because # we are building OpenMPI with PMI support in libraries contained in # Slurm installation - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public - id: low_cost_nodeset diff --git a/community/examples/hpc-slurm-ubuntu2004.yaml b/community/examples/hpc-slurm-ubuntu2004.yaml index ed3a587fb9..7e89520c05 100644 --- a/community/examples/hpc-slurm-ubuntu2004.yaml +++ b/community/examples/hpc-slurm-ubuntu2004.yaml @@ -24,7 +24,7 @@ vars: slurm_image: # Please refer to the following link for the latest images: # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - family: slurm-gcp-6-6-ubuntu-2004-lts + family: slurm-gcp-6-7-ubuntu-2004-lts project: schedmd-slurm-public instance_image_custom: true diff --git a/community/examples/hpc-slurm6-apptainer.yaml b/community/examples/hpc-slurm6-apptainer.yaml index 6848b1b4f0..47e9c267aa 100644 --- a/community/examples/hpc-slurm6-apptainer.yaml +++ b/community/examples/hpc-slurm6-apptainer.yaml @@ -60,7 +60,7 @@ deployment_groups: settings: source_image_project_id: [schedmd-slurm-public] # see 
latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-6-6-hpc-rocky-linux-8 + source_image_family: slurm-gcp-6-7-hpc-rocky-linux-8 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index 86fcc2e9e7..72f4fccb9f 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -74,7 +74,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Source | Version | |------|--------|---------| -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.7.0 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.0 | ## Resources @@ -104,7 +104,7 @@ modules. For support with the underlying modules, see the instructions in the | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [feature](#input\_feature) | The node feature, used to bind nodes to the nodeset. If not set, the nodeset name will be used. | `string` | `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-7-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index 2314471ac9..7e547c3d5f 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -56,7 +56,7 @@ locals { } module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.7.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.0" project_id = var.project_id region = var.region diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf index 57e909b9a5..a86c28ffc2 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf @@ -18,10 +18,10 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-6-debian-11", - "slurm-gcp-6-6-hpc-rocky-linux-8", - "slurm-gcp-6-6-ubuntu-2004-lts", - "slurm-gcp-6-6-ubuntu-2204-lts-arm64" + "slurm-gcp-6-7-debian-11", + "slurm-gcp-6-7-hpc-rocky-linux-8", + "slurm-gcp-6-7-ubuntu-2004-lts", + "slurm-gcp-6-7-ubuntu-2204-lts-arm64" ] } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf index 3c8b0743dd..5d5f71c9c0 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf @@ -68,7 +68,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-6-hpc-rocky-linux-8" + family = "slurm-gcp-6-7-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md index 14d945c9a5..fac8a63d44 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md @@ -59,7 +59,7 @@ No resources. | [accelerator\_config](#input\_accelerator\_config) | Nodeset accelerator config, see https://cloud.google.com/tpu/docs/supported-tpu-configurations for details. |
object({
topology = string
version = string
})
|
{
"topology": "",
"version": ""
}
| no | | [data\_disks](#input\_data\_disks) | The data disks to include in the TPU node | `list(string)` | `[]` | no | | [disable\_public\_ips](#input\_disable\_public\_ips) | DEPRECATED: Use `enable_public_ips` instead. | `bool` | `null` | no | -| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-6-tf- | `string` | `null` | no | +| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-7-tf- | `string` | `null` | no | | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | | [name](#input\_name) | Name of the nodeset. Automatically populated by the module id if not set.
If setting manually, ensure a unique value across all nodesets. | `string` | n/a | yes | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf index 3761707b3e..30e8d5c177 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf @@ -103,7 +103,7 @@ variable "data_disks" { } variable "docker_image" { - description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-6-tf-" + description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-7-tf-" type = string default = null } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 5eb8ba6665..117e0ca0e5 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -178,7 +178,7 @@ No modules. | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-7-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_properties](#input\_instance\_properties) | Override the instance properties. Used to test features not supported by Slurm GCP,
recommended for advanced usage only.
See https://cloud.google.com/compute/docs/reference/rest/v1/regionInstances/bulkInsert
If any sub-field (e.g. scheduling) is set, it will override the values computed by
SlurmGCP and ignoring values of provided vars. | `any` | `null` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for compute nodes. | `string` | `null` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf index 57e909b9a5..a86c28ffc2 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf @@ -18,10 +18,10 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-6-debian-11", - "slurm-gcp-6-6-hpc-rocky-linux-8", - "slurm-gcp-6-6-ubuntu-2004-lts", - "slurm-gcp-6-6-ubuntu-2204-lts-arm64" + "slurm-gcp-6-7-debian-11", + "slurm-gcp-6-7-hpc-rocky-linux-8", + "slurm-gcp-6-7-ubuntu-2004-lts", + "slurm-gcp-6-7-ubuntu-2204-lts-arm64" ] } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 9609725952..aeb2435bd0 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -88,7 +88,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-6-hpc-rocky-linux-8" + family = "slurm-gcp-6-7-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 40d24732a6..30f002d68f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -238,13 +238,13 @@ limitations under the License. 
| [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | | [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.7.0 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.7.0 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.0 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.0 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.7.0 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.7.0 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.7.0 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.7.0 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.0 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.0 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.0 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.8.0 | ## Resources @@ -301,7 +301,7 @@ limitations under the License. | [extra\_logging\_flags](#input\_extra\_logging\_flags) | The only available flag is `trace_api` | `map(bool)` | `{}` | no | | [gcloud\_path\_override](#input\_gcloud\_path\_override) | Directory of the gcloud executable to be used during cleanup | `string` | `""` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-7-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for controller. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 49bc366f21..0148323597 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -43,7 +43,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.7.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.0" project_id = var.project_id region = var.region @@ -99,7 +99,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.7.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.0" access_config = var.enable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index ab1123ad19..d9cb38ff07 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.7.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.0" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -56,7 +56,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.7.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.0" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 753fe6512c..9be62f82f7 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local template module module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.7.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.0" for_each = local.nodeset_map project_id = var.project_id @@ -101,7 +101,7 @@ locals { # NODESET TPU module "slurm_nodeset_tpu" { - source = 
"github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.7.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.8.0" for_each = local.nodeset_tpu_map project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf index 57e909b9a5..a86c28ffc2 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf @@ -18,10 +18,10 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-6-debian-11", - "slurm-gcp-6-6-hpc-rocky-linux-8", - "slurm-gcp-6-6-ubuntu-2004-lts", - "slurm-gcp-6-6-ubuntu-2204-lts-arm64" + "slurm-gcp-6-7-debian-11", + "slurm-gcp-6-7-hpc-rocky-linux-8", + "slurm-gcp-6-7-ubuntu-2004-lts", + "slurm-gcp-6-7-ubuntu-2204-lts-arm64" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf index 69eea81844..0df835e322 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf @@ -267,7 +267,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-6-hpc-rocky-linux-8" + family = "slurm-gcp-6-7-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index ee6fd367c6..0afd0bfee7 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -100,7 +100,7 @@ No modules. | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-7-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for login nodes. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf index 57e909b9a5..a86c28ffc2 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf @@ -18,10 +18,10 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-6-debian-11", - "slurm-gcp-6-6-hpc-rocky-linux-8", - "slurm-gcp-6-6-ubuntu-2004-lts", - "slurm-gcp-6-6-ubuntu-2204-lts-arm64" + "slurm-gcp-6-7-debian-11", + "slurm-gcp-6-7-hpc-rocky-linux-8", + "slurm-gcp-6-7-ubuntu-2004-lts", + "slurm-gcp-6-7-ubuntu-2204-lts-arm64" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf index f7d4cacd85..2b53c8f9e5 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf @@ -325,7 +325,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-6-hpc-rocky-linux-8" + family = "slurm-gcp-6-7-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/examples/cae/cae-slurm.yaml b/examples/cae/cae-slurm.yaml index a3e9820ab9..34096a7080 100644 --- a/examples/cae/cae-slurm.yaml +++ b/examples/cae/cae-slurm.yaml @@ -40,7 +40,7 @@ vars: # for a list of valid family options with Slurm; note: the image types for the compute nodes # and the Chrome Remote Desktop (CRD) need to have the same Slurm base. 
instance_image: - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public # Documentation for each of the modules used below can be found at diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index 3ef0ba990f..d7520d3b85 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -25,7 +25,7 @@ vars: slurm_image: # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family # for a list of valid family options with Slurm - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public # If image above is changed to use custom image, then setting below must be set to true instance_image_custom: false diff --git a/examples/hpc-slurm-static.yaml b/examples/hpc-slurm-static.yaml index fff15e07dc..07ed2a4690 100644 --- a/examples/hpc-slurm-static.yaml +++ b/examples/hpc-slurm-static.yaml @@ -29,7 +29,7 @@ vars: static_node_count: 2 ## Must be <= number of reserved machines ## slurm_instance_image: - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public instance_image_custom: false # true if using custom image in lines above bandwidth_tier: gvnic_enabled diff --git a/examples/image-builder.yaml b/examples/image-builder.yaml index 63f5d89fbd..715948b0dd 100644 --- a/examples/image-builder.yaml +++ b/examples/image-builder.yaml @@ -59,7 +59,7 @@ deployment_groups: settings: source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-6-6-hpc-rocky-linux-8 + source_image_family: slurm-gcp-6-7-hpc-rocky-linux-8 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size) diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml index 6540c18954..c50454739e 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -94,7 +94,7 @@ deployment_groups: set -e -o pipefail ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.7.0 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.0 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index b3c44273fa..6ba58f0308 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -108,7 +108,7 @@ deployment_groups: apt-get install -y git ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.7.0 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.0 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/examples/ml-slurm.yaml b/examples/ml-slurm.yaml index 81a78b59a1..4baaaf07ce 100644 --- a/examples/ml-slurm.yaml +++ 
b/examples/ml-slurm.yaml @@ -139,7 +139,7 @@ deployment_groups: omit_external_ip: false source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-6-6-debian-11 + source_image_family: slurm-gcp-6-7-debian-11 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size_gb) diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml index 2ee69cf821..44900430a7 100644 --- a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml +++ b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml @@ -27,7 +27,7 @@ vars: # on_host_maintenance: MIGRATE num_nodes: 1 rocky_image: - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public deployment_groups: @@ -79,7 +79,7 @@ deployment_groups: # settings: # node_count_dynamic_max: $(vars.num_nodes) # instance_image: - # family: slurm-gcp-6-6-ubuntu-2004-lts + # family: slurm-gcp-6-7-ubuntu-2004-lts # project: schedmd-slurm-public # - id: ubuntu_partition diff --git a/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml b/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml index 77bbea5edc..8d5e724b0b 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml @@ -22,7 +22,7 @@ slurm_cluster_name: "debiv6{{ build[0:4] }}" cli_deployment_vars: network_name: "{{ network }}" - slurm_image: "{family: slurm-gcp-6-6-debian-11, project: schedmd-slurm-public}" + slurm_image: "{family: slurm-gcp-6-7-debian-11, project: schedmd-slurm-public}" region: us-west4 zone: us-west4-c diff --git a/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml b/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml index db6c920704..6344dd8d76 100644 --- a/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml +++ b/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml @@ -27,7 +27,7 @@ vars: slurm_image: # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family # for a list of valid family options with Slurm - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public # If image above is changed to use custom image, then setting below must be set to true instance_image_custom: false diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml index cff3ce442f..ba7ec541b3 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml @@ -39,7 +39,7 @@ vars: project_id: invalid-project region: us-central1 slurm_image: - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public zone: us-central1-a deployment_groups: diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/terraform.tfvars 
b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/terraform.tfvars index 0e31c36a07..39fad882b4 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/terraform.tfvars +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/terraform.tfvars @@ -30,7 +30,7 @@ project_id = "invalid-project" region = "us-central1" slurm_image = { - family = "slurm-gcp-6-6-hpc-rocky-linux-8" + family = "slurm-gcp-6-7-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/tools/validate_configs/test_configs/node-groups.yaml b/tools/validate_configs/test_configs/node-groups.yaml index ca6a7b9c89..cfb166cbb5 100644 --- a/tools/validate_configs/test_configs/node-groups.yaml +++ b/tools/validate_configs/test_configs/node-groups.yaml @@ -64,7 +64,7 @@ deployment_groups: name: c30 machine_type: c2-standard-30 instance_image: - family: slurm-gcp-6-6-debian-11 + family: slurm-gcp-6-7-debian-11 project: schedmd-slurm-public instance_image_custom: true @@ -75,7 +75,7 @@ deployment_groups: name: c60 machine_type: c2-standard-60 instance_image: - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public - id: nodeset_3 @@ -85,7 +85,7 @@ deployment_groups: name: cd112 machine_type: c2d-standard-112 instance_image: - family: slurm-gcp-6-6-hpc-rocky-linux-8 + family: slurm-gcp-6-7-hpc-rocky-linux-8 project: schedmd-slurm-public instance_image_custom: true enable_smt: true From f06758ab02732d022bfe57dea81b930dd88b38e4 Mon Sep 17 00:00:00 2001 From: ChengcongDu Date: Tue, 1 Oct 2024 16:25:54 +0000 Subject: [PATCH 025/102] remove expected performanc note --- modules/compute/gke-node-pool/outputs.tf | 3 --- 1 file changed, 3 deletions(-) diff --git a/modules/compute/gke-node-pool/outputs.tf b/modules/compute/gke-node-pool/outputs.tf index 58216e957f..7bcd0c6361 100644 --- a/modules/compute/gke-node-pool/outputs.tf +++ b/modules/compute/gke-node-pool/outputs.tf @@ -85,9 +85,6 @@ locals { export POD_NAME=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}' | head -n 1) export PEER_POD_IPS=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.status.podIP}}{{" "}}{{end}}') kubectl exec --stdin --tty --container=nccl-test $POD_NAME -- /scripts/allgather.sh $PEER_POD_IPS - Depends on the Msg size used for transmission in the test, the busbw would different a bit. 
- For a3-highgpu machines, the expected busbw for MsgSize of 8G data should be around 80 GB/s - For a3-megagpu machines, the expected busbw for MsgSize of 8G data should be around 160 GB/s If you would like to enable GPUDirect for your own workload, please follow the below steps: export WORKLOAD_PATH=<> From eaf1af7a5d4712e2ba9e712eb97729de39c65b38 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 1 Oct 2024 22:01:33 -0500 Subject: [PATCH 026/102] Modify triggers that run pre-commit validation - do not run when user labels the PR (does not change code) - do not run when PR title/description or base branch is edited - run when PR is re-opened after having been closed --- .github/workflows/pr-precommit.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/pr-precommit.yml b/.github/workflows/pr-precommit.yml index 37234d2a0e..5b1b5091cf 100644 --- a/.github/workflows/pr-precommit.yml +++ b/.github/workflows/pr-precommit.yml @@ -19,9 +19,8 @@ name: 'Use pre-commit to validate Pull Request' on: pull_request: types: - - edited - opened - - labeled + - reopened - synchronize branches: - main From 35cdab35ca4f690172b7f5dbd9377a19bcb26c68 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 1 Oct 2024 22:06:40 -0500 Subject: [PATCH 027/102] Require labels on pull requests directly to main and release-candidate branches --- .github/workflows/pr-label-validation.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/pr-label-validation.yml b/.github/workflows/pr-label-validation.yml index 9fe508fadf..df54a6e150 100644 --- a/.github/workflows/pr-label-validation.yml +++ b/.github/workflows/pr-label-validation.yml @@ -28,7 +28,9 @@ on: - ready_for_review - unlocked branches: + - main - develop + - release-candidate jobs: pr-label-validation: From 69521206fce98b306b64e93adc2853c36add8ec1 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 2 Oct 2024 09:33:56 -0500 Subject: [PATCH 028/102] Chrome Remote Desktop: update apt cache only if stale Update the behavior of Ansible to update the apt cache only if it is stale (more than 1 hour old). In practice, the apt cache is unlikely to be stale because Ansible was just installed by pip, which requires several packages to be installed. --- .../scripts/configure-chrome-desktop.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-chrome-desktop.yml b/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-chrome-desktop.yml index 41928f9294..2daea9cd28 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-chrome-desktop.yml +++ b/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-chrome-desktop.yml @@ -16,6 +16,10 @@ - name: Ensure Desktop OS and Chrome Remote Desktop is installed hosts: localhost become: true + module_defaults: + ansible.builtin.apt: + update_cache: true + cache_valid_time: 3600 tasks: - name: Install desktop packages ansible.builtin.apt: @@ -23,7 +27,6 @@ - xfce4 - xfce4-goodies state: present - update_cache: true register: apt_result retries: 6 delay: 10 From 4bb630fef34adb38444b1c8386067827ce0dc01b Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 2 Oct 2024 09:35:07 -0500 Subject: [PATCH 029/102] Chrome Remote Desktop: increase retry time for apt We have observed failures of this module when unattended-upgrades is running simultaneously to the installation of xfce4. 
This increases the retry duration from 1 minute to 5 minutes and the number of retries from 6 to 10 (a total of 11 attempts). When we adopt ansible-core 2.12 or later, we should use the lock_timeout feature more directly: https://docs.ansible.com/ansible/latest/collections/ansible/builtin/apt_module.html#parameter-lock_timeout --- .../scripts/configure-chrome-desktop.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-chrome-desktop.yml b/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-chrome-desktop.yml index 2daea9cd28..391aa86433 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-chrome-desktop.yml +++ b/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-chrome-desktop.yml @@ -28,8 +28,8 @@ - xfce4-goodies state: present register: apt_result - retries: 6 - delay: 10 + retries: 10 + delay: 30 until: apt_result is success - name: Download and configure CRD @@ -45,8 +45,8 @@ environment: DEBIAN_FRONTEND: noninteractive register: apt_result - retries: 6 - delay: 10 + retries: 10 + delay: 30 until: apt_result is success - name: Configure CRD to use Xfce by default From 7780f46e729c237e197947b5c6324257734bee20 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Wed, 2 Oct 2024 19:53:33 +0000 Subject: [PATCH 030/102] wordings updated --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/gpu_direct.tf | 18 +++++++++--------- modules/compute/gke-node-pool/variables.tf | 4 ++-- modules/scheduler/gke-cluster/README.md | 2 +- modules/scheduler/gke-cluster/outputs.tf | 4 ++-- .../pre-existing-gke-cluster/README.md | 2 +- .../pre-existing-gke-cluster/outputs.tf | 4 ++-- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 7b1cffbf68..03652cf29e 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -294,7 +294,7 @@ limitations under the License. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | -| [gke\_master\_version](#input\_gke\_master\_version) | GKE master version | `string` | n/a | yes | +| [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index 4fef57e914..27c61f0256 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -33,7 +33,7 @@ locals { updated_workload_path = replace(local.workload_path_tcpx, ".yaml", "-tcpx.yaml") rxdm_version = "v2.0.12" # matching nccl-tcpx-installer version v3.1.9 min_additional_networks = 4 - min_gke_versions = { + major_minor_version_acceptable_map = { "1.27" = "1.27.7-gke.1121000" "1.28" = "1.28.8-gke.1095000" "1.29" = "1.29.3-gke.1093000" @@ -49,7 +49,7 @@ locals { updated_workload_path = replace(local.workload_path_tcpxo, ".yaml", "-tcpxo.yaml") rxdm_version = "v1.0.10" # matching nccl-tcpxo-installer version v1.0.4 min_additional_networks = 8 - min_gke_versions = { + major_minor_version_acceptable_map = { "1.28" = "1.28.9-gke.1250000" "1.29" = "1.29.4-gke.1542000" "1.30" = "1.30.4-gke.1129000" @@ -61,13 +61,13 @@ locals { gke_version_regex = "(\\d+\\.\\d+)\\.(\\d+)-gke\\.(\\d+)" # GKE version format: 1.X.Y-gke.Z , regex output: ["1.X" , "Y", "Z"] - gke_version_parts = regex(local.gke_version_regex, var.gke_master_version) + gke_version_parts = regex(local.gke_version_regex, var.gke_version) gke_version_major = local.gke_version_parts[0] - min_gke_versions = try(local.gpu_direct_setting[var.machine_type].min_gke_versions, null) - min_version = try(contains(keys(local.min_gke_versions), local.gke_version_major), false) ? local.min_gke_versions[local.gke_version_major] : "1.0.0-gke.0" - min_version_parts = regex(local.gke_version_regex, local.min_version) - gke_gpudirect_compatible = local.gke_version_parts[1] > local.min_version_parts[1] || (local.gke_version_parts[1] == local.min_version_parts[1] && local.gke_version_parts[2] >= local.min_version_parts[2]) + major_minor_version_acceptable_map = try(local.gpu_direct_setting[var.machine_type].major_minor_version_acceptable_map, null) + minor_version_acceptable = try(contains(keys(local.major_minor_version_acceptable_map), local.gke_version_major), false) ? local.major_minor_version_acceptable_map[local.gke_version_major] : "1.0.0-gke.0" + minor_version_acceptable_parts = regex(local.gke_version_regex, local.minor_version_acceptable) + gke_gpudirect_compatible = local.gke_version_parts[1] > local.minor_version_acceptable_parts[1] || (local.gke_version_parts[1] == local.minor_version_acceptable_parts[1] && local.gke_version_parts[2] >= local.minor_version_acceptable_parts[2]) } check "gpu_direct_check_multi_vpc" { @@ -77,9 +77,9 @@ check "gpu_direct_check_multi_vpc" { } } -check "gke_master_version_requirements" { +check "gke_version_requirements" { assert { condition = local.gke_gpudirect_compatible - error_message = "GPUDirect is not supported on GKE master version ${var.gke_master_version} for ${var.machine_type} machine. For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements" + error_message = "GPUDirect is not supported on GKE master version ${var.gke_version} for ${var.machine_type} machine. 
For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements" } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 62160a2448..b24aef91df 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -361,7 +361,7 @@ variable "initial_node_count" { default = null } -variable "gke_master_version" { - description = "GKE master version" +variable "gke_version" { + description = "GKE version" type = string } diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 4548db2fc9..583af203da 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -194,7 +194,7 @@ limitations under the License. |------|-------------| | [cluster\_id](#output\_cluster\_id) | An identifier for the resource with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster has been created. Needed by community/modules/scripts/kubernetes-operations. | -| [gke\_master\_version](#output\_gke\_master\_version) | GKE cluster's master version. | +| [gke\_version](#output\_gke\_version) | GKE cluster's version. | | [instructions](#output\_instructions) | Instructions on how to connect to the created cluster. | | [k8s\_service\_account\_name](#output\_k8s\_service\_account\_name) | Name of k8s service account. | diff --git a/modules/scheduler/gke-cluster/outputs.tf b/modules/scheduler/gke-cluster/outputs.tf index 4daed8ee25..28e00171ff 100644 --- a/modules/scheduler/gke-cluster/outputs.tf +++ b/modules/scheduler/gke-cluster/outputs.tf @@ -75,7 +75,7 @@ output "k8s_service_account_name" { value = one(module.workload_identity[*].k8s_service_account_name) } -output "gke_master_version" { - description = "GKE cluster's master version." +output "gke_version" { + description = "GKE cluster's version." value = google_container_cluster.gke_cluster.master_version } diff --git a/modules/scheduler/pre-existing-gke-cluster/README.md b/modules/scheduler/pre-existing-gke-cluster/README.md index 1f2904d889..4caf7ff258 100644 --- a/modules/scheduler/pre-existing-gke-cluster/README.md +++ b/modules/scheduler/pre-existing-gke-cluster/README.md @@ -111,5 +111,5 @@ limitations under the License. |------|-------------| | [cluster\_id](#output\_cluster\_id) | An identifier for the gke cluster with format projects/{{project\_id}}/locations/{{region}}/clusters/{{name}}. | | [gke\_cluster\_exists](#output\_gke\_cluster\_exists) | A static flag that signals to downstream modules that a cluster exists. | -| [gke\_master\_version](#output\_gke\_master\_version) | GKE cluster's master version. | +| [gke\_version](#output\_gke\_version) | GKE cluster's version. | diff --git a/modules/scheduler/pre-existing-gke-cluster/outputs.tf b/modules/scheduler/pre-existing-gke-cluster/outputs.tf index 90772d3dae..8884ee30b0 100644 --- a/modules/scheduler/pre-existing-gke-cluster/outputs.tf +++ b/modules/scheduler/pre-existing-gke-cluster/outputs.tf @@ -27,7 +27,7 @@ output "gke_cluster_exists" { ] } -output "gke_master_version" { - description = "GKE cluster's master version." +output "gke_version" { + description = "GKE cluster's version." 
value = data.google_container_cluster.existing_gke_cluster.master_version } From 14864db2a7e9c8587cd8caeb5d288d0ba0266a34 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Wed, 2 Oct 2024 21:45:48 +0000 Subject: [PATCH 031/102] minor wording update --- modules/compute/gke-node-pool/gpu_direct.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/gpu_direct.tf b/modules/compute/gke-node-pool/gpu_direct.tf index 27c61f0256..00dd298971 100644 --- a/modules/compute/gke-node-pool/gpu_direct.tf +++ b/modules/compute/gke-node-pool/gpu_direct.tf @@ -80,6 +80,6 @@ check "gpu_direct_check_multi_vpc" { check "gke_version_requirements" { assert { condition = local.gke_gpudirect_compatible - error_message = "GPUDirect is not supported on GKE master version ${var.gke_version} for ${var.machine_type} machine. For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements" + error_message = "GPUDirect is not supported on GKE version ${var.gke_version} for ${var.machine_type} machine. For supported version details visit https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#requirements" } } From 1e9caf1644b0a7bea4e2f44fe1d0f9d0c4ee44af Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 3 Oct 2024 16:26:09 -0500 Subject: [PATCH 032/102] Adopt Google terraform provider plugin 5.44.x --- pkg/config/expand.go | 4 ++-- pkg/config/expand_test.go | 4 ++-- .../igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../golden_copies/expectations/igc_pkr/zero/versions.tf | 4 ++-- .../igc_tf/.ghpc/artifacts/expanded_blueprint.yaml | 8 ++++---- .../golden_copies/expectations/igc_tf/one/versions.tf | 4 ++-- .../golden_copies/expectations/igc_tf/zero/versions.tf | 4 ++-- .../merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/merge_flatten/zero/versions.tf | 4 ++-- .../.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/versioned_blueprint/primary/versions.tf | 4 ++-- 11 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 795343a3b1..3a8898306d 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -199,11 +199,11 @@ func getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { return map[string]TerraformProvider{ "google": { Source: "hashicorp/google", - Version: ">= 4.84.0, < 5.39.0", + Version: ">= 4.84.0, < 5.45.0", Configuration: gglConf}, "google-beta": { Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 5.39.0", + Version: ">= 4.84.0, < 5.45.0", Configuration: gglConf}} } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 6c347aceb9..40fc192175 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -93,10 +93,10 @@ func (s *zeroSuite) TestExpandProviders(c *C) { c.Check(g.TerraformProviders, DeepEquals, map[string]PR{ "google": TerraformProvider{ Source: "hashicorp/google", - Version: ">= 4.84.0, < 5.39.0"}, + Version: ">= 4.84.0, < 5.45.0"}, "google-beta": TerraformProvider{ Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 5.39.0"}}) + Version: ">= 4.84.0, < 5.45.0"}}) } { // no def PR, group PR diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml index efa8f25bfb..32d7d818a8 100644 --- 
a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml @@ -38,14 +38,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf index 792917c317..6630b9b8c6 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml index b25ddd135b..8a160967a2 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml @@ -44,14 +44,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) @@ -79,14 +79,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index 792917c317..6630b9b8c6 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index 792917c317..6630b9b8c6 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ 
b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index 71103dd046..9c97a650eb 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -39,14 +39,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf index 792917c317..6630b9b8c6 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml index ba7ec541b3..4e74f8d305 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml @@ -47,14 +47,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.39.0' + version: '>= 4.84.0, < 5.45.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf index 792917c317..6630b9b8c6 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.39.0" + version = ">= 4.84.0, < 5.45.0" 
} } } From 9cb2ebe84267ae75fd09a267f860b81085b1842a Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 04:54:44 +0000 Subject: [PATCH 033/102] Chunk BigQuery sacct row inserts --- .../modules/slurm_files/scripts/load_bq.py | 18 ++++++--- .../slurm_files/scripts/tests/test_load_bq.py | 39 +++++++++++++++++++ 2 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_load_bq.py diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index 800202d2ea..9967069212 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -19,23 +19,23 @@ import shelve import uuid from collections import namedtuple -from datetime import datetime, timezone, timedelta +from datetime import datetime, timedelta, timezone from pathlib import Path from pprint import pprint -from google.cloud.bigquery import SchemaField +from google.api_core import exceptions, retry from google.cloud import bigquery as bq -from google.api_core import retry, exceptions +from google.cloud.bigquery import SchemaField import util from util import lookup, run - SACCT = "sacct" script = Path(__file__).resolve() DEFAULT_TIMESTAMP_FILE = script.parent / "bq_timestamp" timestamp_file = Path(os.environ.get("TIMESTAMP_FILE", DEFAULT_TIMESTAMP_FILE)) +BQ_MAX_ROW_LOAD_SIZE = 10000 # cluster_id_file = script.parent / 'cluster_uuid' # try: @@ -321,8 +321,16 @@ def main(): # on failure, an exception will cause the timestamp not to be rewritten. So # it will try again next time. If some writes succeed, we don't currently # have a way to not submit duplicates next time. + print(f"loading BigQuery data in batches of size : {BQ_MAX_ROW_LOAD_SIZE}") + num_batches = (len(jobs) // BQ_MAX_ROW_LOAD_SIZE) + 1 + print(f"Number of batches: {num_batches}") if jobs: - bq_submit(jobs) + start_job_idx = 0 + end_job_idx = BQ_MAX_ROW_LOAD_SIZE + for _ in range(num_batches): + bq_submit(jobs[start_job_idx:end_job_idx]) + start_job_idx = end_job_idx + end_job_idx += BQ_MAX_ROW_LOAD_SIZE write_timestamp(end) update_job_idx_cache(jobs, end) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_load_bq.py new file mode 100644 index 0000000000..ebe45008a0 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_load_bq.py @@ -0,0 +1,39 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + + +# test the chunking logic if not the BigQuery loads themselves +@pytest.mark.parametrize("num_jobs_to_load", (0, 11, 10001, 51131, 104321)) +def test_chunked_bq_load(num_jobs_to_load: int): + BQ_MAX_ROW_LOAD_SIZE = 10000 + jobs = [i + 1 for i in range(num_jobs_to_load)] + num_batches = (len(jobs) // BQ_MAX_ROW_LOAD_SIZE) + 1 + print(num_batches) + load_cache = [] + if jobs: + start_job_idx = 0 + end_job_idx = BQ_MAX_ROW_LOAD_SIZE + for _ in range(num_batches): + load_cache.append(jobs[start_job_idx:end_job_idx]) + start_job_idx = end_job_idx + end_job_idx += BQ_MAX_ROW_LOAD_SIZE + if jobs: + assert ( + sum([sum(x) for x in load_cache]) + == num_jobs_to_load * (num_jobs_to_load + 1) // 2 + ) + else: + assert sum([sum(x) for x in load_cache]) == 0 From 902ffd0ae29247e877f442f2b19273bab5c3c54b Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 7 Oct 2024 05:34:01 +0000 Subject: [PATCH 034/102] SlurmGCP. Do not put job_submit script into config. * Make `job_submit` a part of "devel" zip; * Move from `etc` to `scripts`; * Remove `slurm_files` variable; * Apply auto-formating, no other changes to the `job_submit.lua.tpl`. --- .../modules/slurm_files/README.md | 1 - .../slurm_files/etc/job_submit.lua.tpl | 102 ----------------- .../modules/slurm_files/main.tf | 1 - .../modules/slurm_files/scripts/conf.py | 19 ++-- .../slurm_files/scripts/job_submit.lua.tpl | 103 ++++++++++++++++++ .../modules/slurm_files/variables.tf | 6 - 6 files changed, 113 insertions(+), 119 deletions(-) delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/job_submit.lua.tpl create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/job_submit.lua.tpl diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md index 8cf7f3ade5..3033d59f43 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -83,7 +83,6 @@ No modules. | [extra\_logging\_flags](#input\_extra\_logging\_flags) | The only available flag is `trace_api` | `map(bool)` | `{}` | no | | [google\_app\_cred\_path](#input\_google\_app\_cred\_path) | Path to Google Application Credentials. | `string` | `null` | no | | [install\_dir](#input\_install\_dir) | Directory where the hybrid configuration directory will be installed on the
on-premise controller (e.g. /etc/slurm/hybrid). This updates the prefix path
for the resume and suspend scripts in the generated `cloud.conf` file.

This variable should be used when the TerraformHost and the SlurmctldHost
are different.

This will default to var.output\_dir if null. | `string` | `null` | no | -| [job\_submit\_lua\_tpl](#input\_job\_submit\_lua\_tpl) | Slurm job\_submit.lua template file path. | `string` | `null` | no | | [login\_network\_storage](#input\_login\_network\_storage) | Storage to mounted on login and controller instances
- server\_ip : Address of the storage server.
- remote\_mount : The location in the remote instance filesystem to mount from.
- local\_mount : The location on the instance filesystem to mount to.
- fs\_type : Filesystem type (e.g. "nfs").
- mount\_options : Options to mount with. |
list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
| `[]` | no | | [login\_startup\_scripts](#input\_login\_startup\_scripts) | List of scripts to be ran on login VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/job_submit.lua.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/job_submit.lua.tpl deleted file mode 100644 index f3c9b0750e..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/job_submit.lua.tpl +++ /dev/null @@ -1,102 +0,0 @@ -SCRIPTS_DIR = "{scripts_dir}" -NO_VAL = 4294967294 ---get_tpu_vmcount.py exit code -PART_INVALID = -1 --partition does not exists in config.yaml, thus do not exist in slurm -DIFF_VMCOUNTS_SAME_PART = -2 --in the same partition there are nodesets with different vmcounts -DIFF_PART_DIFFERENT_VMCOUNTS = -3 --partition is a list of partitions in which at least two of them have different vmcount -UNKWOWN_ERROR = -4 --get_tpu_vmcount.py did not return a valid response - -function get_part(job_desc,part_list) - if job_desc.partition then - return job_desc.partition - end - for name,val in pairs(part_list) do - if val.flag_default == 1 then - return name - end - end - return nil -end - -function os.capture(cmd, raw) - local handle = assert(io.popen(cmd, 'r')) - local output = assert(handle:read('*a')) - handle:close() - return output -end - -function get_vmcount(part) - local cmd = SCRIPTS_DIR .. "/get_tpu_vmcount.py -p " .. part - local out = os.capture(cmd,true) - for line in out:gmatch("(.-)\r?\n") do - local tag, val = line:match("([^:]+):([^:]+)") - if tag == "VMCOUNT" then - return tonumber(val) - end - end - return UNKWOWN_ERROR -end - - -function slurm_job_submit(job_desc, part_list, submit_uid) - local part = get_part(job_desc,part_list) - local vmcount = get_vmcount(part) - --Only do something if the job is in a TPU partition, if vmcount is 0, it implies that the partition(s) specified are not TPU ones - if vmcount == 0 then - return slurm.SUCCESS - end - --This is a TPU job, but as the vmcount is 1 it can he handled the same way - if vmcount == 1 then - return slurm.SUCCESS - end - --Check for errors - if vmcount == PART_INVALID then - slurm.log_user("Invalid partition specified " .. part) - return slurm.FAILURE - end - if vmcount == DIFF_VMCOUNTS_SAME_PART then - slurm.log_user("In partition(s) " .. part .. " there are more than one tpu nodeset vmcount, this should not happen.") - return slurm.ERROR - end - if vmcount == DIFF_PART_DIFFERENT_VMCOUNTS then - slurm.log_user("In partition list " .. part .. 
" there are more than one TPU types, cannot determine which is the correct vmcount to use, please retry with only one partition.") - return slurm.FAILURE - end - if vmcount == UNKWOWN_ERROR then - slurm.log_user("Something went wrong while executing get_tpu_vmcount.py.") - return slurm.ERROR - end - --This is surely a TPU node - if vmcount > 1 then - local min_nodes = job_desc.min_nodes - local max_nodes = job_desc.max_nodes - --if not specified assume it is one, this should be improved taking into account the cpus, mem, and other factors - if min_nodes == NO_VAL then - min_nodes = 1 - max_nodes = 1 - end - --as max_nodes can be higher than the nodes in the partition, we are not able to calculate with certainty the nodes that this job will have if this value is set to something - --different than min_nodes - if min_nodes ~= max_nodes then - slurm.log_user("Max nodes cannot be set different than min nodes for the TPU partitions.") - return slurm.ERROR - end - --Set the number of switches to the number of nodes originally requested by the job, as the job requests "TPU groups" - job_desc.req_switch = min_nodes - - --Apply the node increase into the job description. - job_desc.min_nodes = min_nodes * vmcount - job_desc.max_nodes = max_nodes * vmcount - --if job_desc.features then - --slurm.log_user("Features: %s",job_desc.features) - --end - end - - return slurm.SUCCESS -end - -function slurm_job_modify(job_desc, job_rec, part_list, modify_uid) - return slurm.SUCCESS -end - -return slurm.SUCCESS diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index 0cf9981f5a..959d928176 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -84,7 +84,6 @@ locals { slurmdbd_conf_tpl = file(coalesce(var.slurmdbd_conf_tpl, "${local.etc_dir}/slurmdbd.conf.tpl")) slurm_conf_tpl = file(coalesce(var.slurm_conf_tpl, "${local.etc_dir}/slurm.conf.tpl")) cgroup_conf_tpl = file(coalesce(var.cgroup_conf_tpl, "${local.etc_dir}/cgroup.conf.tpl")) - jobsubmit_lua_tpl = file(coalesce(var.job_submit_lua_tpl, "${local.etc_dir}/job_submit.lua.tpl")) # Providers endpoint_versions = var.endpoint_versions diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index c3b31f20a2..29b4076056 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -360,19 +360,20 @@ def install_cgroup_conf(lkp: util.Lookup) -> None: def install_jobsubmit_lua(lkp: util.Lookup) -> None: """install job_submit.lua if there are tpu nodes in the cluster""" - if any( + if not any( tpu_nodeset is not None for part in lkp.cfg.partitions.values() for tpu_nodeset in part.partition_nodeset_tpu ): - conf_options = { - "scripts_dir": lkp.cfg.slurm_scripts_dir or dirs.scripts, - } - conf = lkp.cfg.jobsubmit_lua_tpl.format(**conf_options) - - conf_file = lkp.etc_dir / "job_submit.lua" - conf_file.write_text(conf) - util.chown_slurm(conf_file, 0o600) + return # No TPU partitions, no need for job_submit.lua + + scripts_dir = lkp.cfg.slurm_scripts_dir or dirs.scripts + tpl = (scripts_dir 
/ "job_submit.lua.tpl").read_text() + conf = tpl.format(scripts_dir=scripts_dir) + + conf_file = lkp.etc_dir / "job_submit.lua" + conf_file.write_text(conf) + util.chown_slurm(conf_file, 0o600) def gen_cloud_gres_conf(lkp: util.Lookup) -> None: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/job_submit.lua.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/job_submit.lua.tpl new file mode 100644 index 0000000000..810a0742b0 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/job_submit.lua.tpl @@ -0,0 +1,103 @@ +SCRIPTS_DIR = "{scripts_dir}" +NO_VAL = 4294967294 +-- get_tpu_vmcount.py exit code +PART_INVALID = -1 -- partition does not exists in config.yaml, thus do not exist in slurm +DIFF_VMCOUNTS_SAME_PART = -2 -- in the same partition there are nodesets with different vmcounts +DIFF_PART_DIFFERENT_VMCOUNTS = -3 -- partition is a list of partitions in which at least two of them have different vmcount +UNKWOWN_ERROR = -4 -- get_tpu_vmcount.py did not return a valid response + +function get_part(job_desc, part_list) + if job_desc.partition then + return job_desc.partition + end + for name, val in pairs(part_list) do + if val.flag_default == 1 then + return name + end + end + return nil +end + +function os.capture(cmd, raw) + local handle = assert(io.popen(cmd, 'r')) + local output = assert(handle:read('*a')) + handle:close() + return output +end + +function get_vmcount(part) + local cmd = SCRIPTS_DIR .. "/get_tpu_vmcount.py -p " .. part + local out = os.capture(cmd, true) + for line in out:gmatch("(.-)\r?\n") do + local tag, val = line:match("([^:]+):([^:]+)") + if tag == "VMCOUNT" then + return tonumber(val) + end + end + return UNKWOWN_ERROR +end + +function slurm_job_submit(job_desc, part_list, submit_uid) + local part = get_part(job_desc, part_list) + local vmcount = get_vmcount(part) + -- Only do something if the job is in a TPU partition, if vmcount is 0, it implies that the partition(s) specified are not TPU ones + if vmcount == 0 then + return slurm.SUCCESS + end + -- This is a TPU job, but as the vmcount is 1 it can he handled the same way + if vmcount == 1 then + return slurm.SUCCESS + end + -- Check for errors + if vmcount == PART_INVALID then + slurm.log_user("Invalid partition specified " .. part) + return slurm.FAILURE + end + if vmcount == DIFF_VMCOUNTS_SAME_PART then + slurm.log_user("In partition(s) " .. part .. + " there are more than one tpu nodeset vmcount, this should not happen.") + return slurm.ERROR + end + if vmcount == DIFF_PART_DIFFERENT_VMCOUNTS then + slurm.log_user("In partition list " .. part .. 
+ " there are more than one TPU types, cannot determine which is the correct vmcount to use, please retry with only one partition.") + return slurm.FAILURE + end + if vmcount == UNKWOWN_ERROR then + slurm.log_user("Something went wrong while executing get_tpu_vmcount.py.") + return slurm.ERROR + end + -- This is surely a TPU node + if vmcount > 1 then + local min_nodes = job_desc.min_nodes + local max_nodes = job_desc.max_nodes + -- if not specified assume it is one, this should be improved taking into account the cpus, mem, and other factors + if min_nodes == NO_VAL then + min_nodes = 1 + max_nodes = 1 + end + -- as max_nodes can be higher than the nodes in the partition, we are not able to calculate with certainty the nodes that this job will have if this value is set to something + -- different than min_nodes + if min_nodes ~= max_nodes then + slurm.log_user("Max nodes cannot be set different than min nodes for the TPU partitions.") + return slurm.ERROR + end + -- Set the number of switches to the number of nodes originally requested by the job, as the job requests "TPU groups" + job_desc.req_switch = min_nodes + + -- Apply the node increase into the job description. + job_desc.min_nodes = min_nodes * vmcount + job_desc.max_nodes = max_nodes * vmcount + -- if job_desc.features then + -- slurm.log_user("Features: %s",job_desc.features) + -- end + end + + return slurm.SUCCESS +end + +function slurm_job_modify(job_desc, job_rec, part_list, modify_uid) + return slurm.SUCCESS +end + +return slurm.SUCCESS diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf index 2c01b6b579..91026fc267 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf @@ -94,12 +94,6 @@ variable "cgroup_conf_tpl" { default = null } -variable "job_submit_lua_tpl" { - type = string - description = "Slurm job_submit.lua template file path." - default = null -} - variable "cloudsql_secret" { description = "Secret URI to cloudsql secret." type = string From 9f39ad96161490ecc12c271fc33d795948101f44 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 7 Oct 2024 11:59:32 -0500 Subject: [PATCH 035/102] Do not trigger label validation on draft pull requests --- .github/workflows/pr-label-validation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-label-validation.yml b/.github/workflows/pr-label-validation.yml index df54a6e150..a4c23aa998 100644 --- a/.github/workflows/pr-label-validation.yml +++ b/.github/workflows/pr-label-validation.yml @@ -34,7 +34,7 @@ on: jobs: pr-label-validation: - if: github.repository == 'GoogleCloudPlatform/cluster-toolkit' + if: github.repository == 'GoogleCloudPlatform/cluster-toolkit' && github.event.pull_request.draft == false runs-on: ubuntu-latest permissions: pull-requests: read From 4fd6f8a3bb934fbd9bc23eb9f0c2c211e46aeebf Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 18:10:29 +0000 Subject: [PATCH 036/102] Clean up big query load. 
--- .../modules/slurm_files/scripts/load_bq.py | 39 +++++++++++++------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index 9967069212..7bb8dc440b 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -15,6 +15,7 @@ import argparse +import math import os import shelve import uuid @@ -23,11 +24,10 @@ from pathlib import Path from pprint import pprint +import util from google.api_core import exceptions, retry from google.cloud import bigquery as bq from google.cloud.bigquery import SchemaField - -import util from util import lookup, run SACCT = "sacct" @@ -35,7 +35,9 @@ DEFAULT_TIMESTAMP_FILE = script.parent / "bq_timestamp" timestamp_file = Path(os.environ.get("TIMESTAMP_FILE", DEFAULT_TIMESTAMP_FILE)) -BQ_MAX_ROW_LOAD_SIZE = 10000 +# The maximum request to insert_rows is 10MB, each sacct row is about 1200 KB or ~ 8000 rows. +# Set to 5000 for a little wiggle room. +BQ_ROW_BATCH_SIZE = 5000 # cluster_id_file = script.parent / 'cluster_uuid' # try: @@ -282,6 +284,26 @@ def bq_submit(jobs): print(f"successfully loaded {len(jobs)} jobs") +def batched_bq_submit( + client, table, jobs, submit_function=bq_submit, bq_row_batch_size=BQ_ROW_BATCH_SIZE +): + """Submit sacct data in batches of size bq_row_batch_size + + Args: + jobs: A list of dictionaries of sacct accounting data. + submit_function: The method to submit the jobs to BigQuery with. Defaults to bq_submit. + bq_row_batch_size: The accounting data will be submitted to BigQuery in + batches of this size. + """ + num_batches = int(math.ceil(len(jobs) / bq_row_batch_size)) + print( + f"loading {num_batches} batches of BigQuery data in batches of size : {bq_row_batch_size}" + ) + for indx in range(0, len(jobs), bq_row_batch_size): + print(f"loading BigQuery data batch {indx} of {num_batches}") + submit_function(client, jobs[indx : indx + bq_row_batch_size]) + + def get_time_window(): if not timestamp_file.is_file(): timestamp_file.touch() @@ -321,16 +343,9 @@ def main(): # on failure, an exception will cause the timestamp not to be rewritten. So # it will try again next time. If some writes succeed, we don't currently # have a way to not submit duplicates next time. 
- print(f"loading BigQuery data in batches of size : {BQ_MAX_ROW_LOAD_SIZE}") - num_batches = (len(jobs) // BQ_MAX_ROW_LOAD_SIZE) + 1 - print(f"Number of batches: {num_batches}") if jobs: - start_job_idx = 0 - end_job_idx = BQ_MAX_ROW_LOAD_SIZE - for _ in range(num_batches): - bq_submit(jobs[start_job_idx:end_job_idx]) - start_job_idx = end_job_idx - end_job_idx += BQ_MAX_ROW_LOAD_SIZE + batched_bq_submit(client, table, jobs) + write_timestamp(end) update_job_idx_cache(jobs, end) From b235b474744a56bc7b27a4495b0bc82a4294c392 Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 18:12:08 +0000 Subject: [PATCH 037/102] Clean up big query load --- .../modules/slurm_files/scripts/load_bq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index 7bb8dc440b..ba10905b40 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -285,7 +285,7 @@ def bq_submit(jobs): def batched_bq_submit( - client, table, jobs, submit_function=bq_submit, bq_row_batch_size=BQ_ROW_BATCH_SIZE + jobs, submit_function=bq_submit, bq_row_batch_size=BQ_ROW_BATCH_SIZE ): """Submit sacct data in batches of size bq_row_batch_size From c5c55048c307142bb11100af7d638217867cb146 Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 18:50:26 +0000 Subject: [PATCH 038/102] Don't add a new method. --- .../modules/slurm_files/scripts/load_bq.py | 29 +++++-------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index ba10905b40..4540d99a4e 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -284,26 +284,6 @@ def bq_submit(jobs): print(f"successfully loaded {len(jobs)} jobs") -def batched_bq_submit( - jobs, submit_function=bq_submit, bq_row_batch_size=BQ_ROW_BATCH_SIZE -): - """Submit sacct data in batches of size bq_row_batch_size - - Args: - jobs: A list of dictionaries of sacct accounting data. - submit_function: The method to submit the jobs to BigQuery with. Defaults to bq_submit. - bq_row_batch_size: The accounting data will be submitted to BigQuery in - batches of this size. - """ - num_batches = int(math.ceil(len(jobs) / bq_row_batch_size)) - print( - f"loading {num_batches} batches of BigQuery data in batches of size : {bq_row_batch_size}" - ) - for indx in range(0, len(jobs), bq_row_batch_size): - print(f"loading BigQuery data batch {indx} of {num_batches}") - submit_function(client, jobs[indx : indx + bq_row_batch_size]) - - def get_time_window(): if not timestamp_file.is_file(): timestamp_file.touch() @@ -344,8 +324,13 @@ def main(): # it will try again next time. If some writes succeed, we don't currently # have a way to not submit duplicates next time. 
if jobs: - batched_bq_submit(client, table, jobs) - + num_batches = math.ceil(len(jobs) / BQ_ROW_BATCH_SIZE) + print( + f"loading {num_batches} batches of BigQuery data in batches of size : {BQ_ROW_BATCH_SIZE}" + ) + for indx in range(0, len(jobs), BQ_ROW_BATCH_SIZE): + print(f"loading BigQuery data batch {indx} of {num_batches}") + bq_submit(jobs[indx : indx + BQ_ROW_BATCH_SIZE]) write_timestamp(end) update_job_idx_cache(jobs, end) From 6699d000ce2d33893daa35484f1d8c07fbef44d6 Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 18:55:56 +0000 Subject: [PATCH 039/102] Remove test --- .../slurm_files/scripts/tests/test_load_bq.py | 39 ------------------- 1 file changed, 39 deletions(-) delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_load_bq.py diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_load_bq.py deleted file mode 100644 index ebe45008a0..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_load_bq.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2024 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - - -# test the chunking logic if not the BigQuery loads themselves -@pytest.mark.parametrize("num_jobs_to_load", (0, 11, 10001, 51131, 104321)) -def test_chunked_bq_load(num_jobs_to_load: int): - BQ_MAX_ROW_LOAD_SIZE = 10000 - jobs = [i + 1 for i in range(num_jobs_to_load)] - num_batches = (len(jobs) // BQ_MAX_ROW_LOAD_SIZE) + 1 - print(num_batches) - load_cache = [] - if jobs: - start_job_idx = 0 - end_job_idx = BQ_MAX_ROW_LOAD_SIZE - for _ in range(num_batches): - load_cache.append(jobs[start_job_idx:end_job_idx]) - start_job_idx = end_job_idx - end_job_idx += BQ_MAX_ROW_LOAD_SIZE - if jobs: - assert ( - sum([sum(x) for x in load_cache]) - == num_jobs_to_load * (num_jobs_to_load + 1) // 2 - ) - else: - assert sum([sum(x) for x in load_cache]) == 0 From 546bcd8c19ec5825ca9afcfb256c067a1da42187 Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 18:57:00 +0000 Subject: [PATCH 040/102] Fix comment --- .../modules/slurm_files/scripts/load_bq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index 4540d99a4e..f3e86bdf2f 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -35,7 +35,7 @@ DEFAULT_TIMESTAMP_FILE = script.parent / "bq_timestamp" timestamp_file = Path(os.environ.get("TIMESTAMP_FILE", DEFAULT_TIMESTAMP_FILE)) -# The maximum request to insert_rows is 10MB, each sacct row is about 1200 KB or ~ 8000 rows. +# The maximum request to insert_rows is 10MB, each sacct row is about 1200 bytes or ~ 8000 rows. # Set to 5000 for a little wiggle room. BQ_ROW_BATCH_SIZE = 5000 From ad6abc120573a8ade23f76ba262a196251c90236 Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 20:03:15 +0000 Subject: [PATCH 041/102] Use integer arithmetic for num batches --- .../modules/slurm_files/scripts/load_bq.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index f3e86bdf2f..0f14e06794 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -15,7 +15,6 @@ import argparse -import math import os import shelve import uuid @@ -324,7 +323,7 @@ def main(): # it will try again next time. If some writes succeed, we don't currently # have a way to not submit duplicates next time. 
if jobs: - num_batches = math.ceil(len(jobs) / BQ_ROW_BATCH_SIZE) + num_batches = (len(jobs) - 1) // BQ_ROW_BATCH_SIZE + 1 print( f"loading {num_batches} batches of BigQuery data in batches of size : {BQ_ROW_BATCH_SIZE}" ) From e142952a6532df68e0719189eee95da59ac4ac23 Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 20:36:11 +0000 Subject: [PATCH 042/102] Accurate logging --- .../modules/slurm_files/scripts/load_bq.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index 0f14e06794..8a6c59eaf2 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -327,9 +327,11 @@ def main(): print( f"loading {num_batches} batches of BigQuery data in batches of size : {BQ_ROW_BATCH_SIZE}" ) - for indx in range(0, len(jobs), BQ_ROW_BATCH_SIZE): - print(f"loading BigQuery data batch {indx} of {num_batches}") - bq_submit(jobs[indx : indx + BQ_ROW_BATCH_SIZE]) + for batch_indx, job_indx in enumerate(range(0, len(jobs), BQ_ROW_BATCH_SIZE)): + print( + f"loading BigQuery data batch {batch_indx} of {num_batches}. Loading rows {job_indx} to {job_indx + BQ_ROW_BATCH_SIZE}" + ) + bq_submit(jobs[job_indx : job_indx + BQ_ROW_BATCH_SIZE]) write_timestamp(end) update_job_idx_cache(jobs, end) From 69db3d6f8ae6a1b470b4dc11902d00899b36cb37 Mon Sep 17 00:00:00 2001 From: Fionn Malone Date: Mon, 7 Oct 2024 20:37:55 +0000 Subject: [PATCH 043/102] Shorten log message --- .../modules/slurm_files/scripts/load_bq.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index 8a6c59eaf2..77df35748f 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -328,9 +328,7 @@ def main(): f"loading {num_batches} batches of BigQuery data in batches of size : {BQ_ROW_BATCH_SIZE}" ) for batch_indx, job_indx in enumerate(range(0, len(jobs), BQ_ROW_BATCH_SIZE)): - print( - f"loading BigQuery data batch {batch_indx} of {num_batches}. 
Loading rows {job_indx} to {job_indx + BQ_ROW_BATCH_SIZE}" - ) + print(f"loading BigQuery data batch {batch_indx} of {num_batches}") bq_submit(jobs[job_indx : job_indx + BQ_ROW_BATCH_SIZE]) write_timestamp(end) update_job_idx_cache(jobs, end) From 28daa5ddc8752d1abef9eda7daf520066c0e090f Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Tue, 8 Oct 2024 00:32:55 +0000 Subject: [PATCH 044/102] add GKE support for parallelstore through gke-storage module --- examples/gke-storage-parallelstore.yaml | 105 ++++++++++++++ modules/file-system/gke-storage/README.md | 129 +++++++++++++++++ modules/file-system/gke-storage/main.tf | 78 ++++++++++ modules/file-system/gke-storage/metadata.yaml | 18 +++ modules/file-system/gke-storage/outputs.tf | 27 ++++ .../parallelstore-pvc.yaml.tftpl | 15 ++ .../storage-class/parallelstore-sc.yaml.tftpl | 21 +++ modules/file-system/gke-storage/variables.tf | 134 ++++++++++++++++++ modules/file-system/gke-storage/versions.tf | 21 +++ modules/scheduler/gke-cluster/README.md | 20 +-- modules/scheduler/gke-cluster/main.tf | 11 ++ modules/scheduler/gke-cluster/variables.tf | 6 + modules/scheduler/gke-cluster/versions.tf | 4 + .../test-gke-storage-parallelstore.yml | 41 ++++++ .../builds/gke-storage-parallelstore.yaml | 60 ++++++++ .../tests/gke-storage-parallelstore.yml | 28 ++++ 16 files changed, 710 insertions(+), 8 deletions(-) create mode 100644 examples/gke-storage-parallelstore.yaml create mode 100644 modules/file-system/gke-storage/README.md create mode 100644 modules/file-system/gke-storage/main.tf create mode 100644 modules/file-system/gke-storage/metadata.yaml create mode 100644 modules/file-system/gke-storage/outputs.tf create mode 100644 modules/file-system/gke-storage/persistent-volume-claim/parallelstore-pvc.yaml.tftpl create mode 100644 modules/file-system/gke-storage/storage-class/parallelstore-sc.yaml.tftpl create mode 100644 modules/file-system/gke-storage/variables.tf create mode 100644 modules/file-system/gke-storage/versions.tf create mode 100644 tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml create mode 100644 tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml create mode 100644 tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml diff --git a/examples/gke-storage-parallelstore.yaml b/examples/gke-storage-parallelstore.yaml new file mode 100644 index 0000000000..daecc6657e --- /dev/null +++ b/examples/gke-storage-parallelstore.yaml @@ -0,0 +1,105 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +blueprint_name: gke-storage-parallelstore +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: gke-storage-parallelstore + region: us-central1 + zone: us-central1-c + + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. 
+ authorized_cidr: /32 + +deployment_groups: +- group: primary + modules: + - id: network + source: modules/network/vpc + settings: + subnetwork_name: gke-subnet-parallelstore + secondary_ranges: + gke-subnet-parallelstore: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: private_service_access # required for parallelstore + source: community/modules/network/private-service-access + use: [network] + settings: + prefix_length: 24 + + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network] + settings: + enable_parallelstore_csi: true # enable Parallelstore for the cluster + configure_workload_identity_sa: true + enable_private_endpoint: false # Allows for access from authorized public IPs + master_authorized_networks: + - display_name: deployment-machine + cidr_block: $(vars.authorized_cidr) + outputs: [instructions] + + ### Set up storage class and persistent volume claim for Parallelstore ### + - id: parallelstore-setup + source: modules/file-system/gke-storage + use: [gke_cluster, private_service_access] + settings: + storage_type: Parallelstore + access_mode: ReadWriteMany + sc_volume_binding_mode: Immediate + sc_reclaim_policy: Delete # Use Retain if you want to volume and parallelstore resource will remain after + sc_topology_zones: [$(vars.zone)] + pvc_count: 2 + capacity_gb: 12000 # from 12,000 GiB to 100,000 GiB, in multiples of 4,000 GiB + + - id: sample-pool + source: modules/compute/gke-node-pool + use: [gke_cluster] + settings: + name: sample-pool + zones: [$(vars.zone)] + machine_type: n2-standard-4 + + ### Parallelstore enabled Job ### + + - id: parallelstore-job + source: modules/compute/gke-job-template + use: + - gke_cluster + - parallelstore-setup + settings: + image: busybox + command: + - bin/sh + - -c + - | + echo "Set up job folders" + shopt -s extglob; JOB=${HOSTNAME%%-+([[:digit:]])} + mkdir /data/parallelstore-pvc-0/${JOB}/ -p; + mkdir /data/parallelstore-pvc-1/${JOB}/ -p; + + echo "Writing seed data to Parallelstore volumes" + dd if=/dev/urandom of=/data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.dat bs=1K count=1000 + dd if=/dev/urandom of=/data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.dat bs=1K count=1000 + + # echo "Hash file and write between the 2 hyerpdisk balanced volumes" + # md5sum /data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.dat > /data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.md5 + # md5sum /data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.dat > /data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.md5 + node_count: 5 + outputs: [instructions] diff --git a/modules/file-system/gke-storage/README.md b/modules/file-system/gke-storage/README.md new file mode 100644 index 0000000000..7fbef919a4 --- /dev/null +++ b/modules/file-system/gke-storage/README.md @@ -0,0 +1,129 @@ +## Description + +This module creates Kubernetes Storage Class (SC) that can be used by a Persistent Volume Claim (PVC) +to dynamically provision GCP storage resources like Parallelstore. + +### Example + +The following example uses the `gke-storage` module to creates a Parallelstore Storage Class and Peresistent Volume Claim, +then use them in a `gke-job-template` to dynamically provision the resource. 
+ +```yaml + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network] + settings: + enable_parallelstore_csi: true + + - id: private_service_access + source: community/modules/network/private-service-access + use: [network] + settings: + prefix_length: 24 + + - id: gke_storage + source: modules/file-system/gke-storage + use: [ gke_cluster, private_service_access ] + settings: + storage_type: Parallelstore + access_mode: ReadWriteMany + sc_volume_binding_mode: Immediate + sc_reclaim_policy: Delete + sc_topology_zones: [$(vars.zone)] + pvc_count: 2 + capacity_gb: 12000 + + - id: job_template + source: modules/compute/gke-job-template + use: [gke_storage, compute_pool] +``` + +See example +[gke-storage-parallelstore.yaml](../../../examples/README.md#gke-storage-parallelstoreyaml--) blueprint +for a complete example. + +### Authorized Network + +Since the `gke-storage` module is making calls to the Kubernetes API +to create Kubernetes entities, the machine performing the deployment must be +authorized to connect to the Kubernetes API. You can add the +`master_authorized_networks` settings block, as shown in the example above, with +the IP address of the machine performing the deployment. This will ensure that +the deploying machine can connect to the cluster. + +### Connecting Via Use + +The diagram below shows the valid `use` relationships for the GKE Cluster Toolkit +modules. For example the `gke-storage` module can `use` a +`gke-cluster` module and a `private_service_access` module, as shown in the example above. + +```mermaid +graph TD; + vpc-->|OneToMany|gke-cluster; + gke-cluster-->|OneToMany|gke-node-pool; + gke-node-pool-->|ManyToMany|gke-job-template; + gke-cluster-->|OneToMany|gke-storage; + gke-storage-->|ManyToMany|gke-job-template; +``` + +## License + + +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.0 | + +## Providers + +No providers. + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [kubectl\_apply](#module\_kubectl\_apply) | ../../management/kubectl-apply | n/a | + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [access\_mode](#input\_access\_mode) | The access mode that the volume can be mounted to the host/pod. More details in [Access Modes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes)
Valid access modes:
- ReadWriteOnce
- ReadOnlyMany
- ReadWriteMany
- ReadWriteOncePod | `string` | n/a | yes | +| [capacity\_gb](#input\_capacity\_gb) | The storage capacity with which to create the persistent volume. | `number` | n/a | yes | +| [cluster\_id](#input\_cluster\_id) | An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}` | `string` | n/a | yes | +| [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | +| [mount\_options](#input\_mount\_options) | Controls the mountOptions for dynamically provisioned PersistentVolumes of this storage class. | `string` | `null` | no | +| [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection.
If using a new VPC, please use community/modules/network/private-service-access to create private-service-access.
If using an existing VPC with private-service-access enabled, set this manually following the [user guide](https://cloud.google.com/parallelstore/docs/vpc). | `string` | `null` | no |
+| [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes |
+| [pv\_mount\_path](#input\_pv\_mount\_path) | Path within the container at which the volume should be mounted. Must not contain ':'. | `string` | `"/data"` | no |
+| [pvc\_count](#input\_pvc\_count) | How many PersistentVolumeClaims will be created | `number` | `1` | no |
+| [sc\_reclaim\_policy](#input\_sc\_reclaim\_policy) | Indicates whether to keep the dynamically provisioned PersistentVolumes of this storage class after the bound PersistentVolumeClaim is deleted.
[More details about reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming)
Supported values:
- Retain
- Delete | `string` | n/a | yes |
+| [sc\_topology\_zones](#input\_sc\_topology\_zones) | Zone locations that allow the volumes to be dynamically provisioned. | `list(string)` | `null` | no |
+| [sc\_volume\_binding\_mode](#input\_sc\_volume\_binding\_mode) | Indicates when volume binding and dynamic provisioning should occur and how PersistentVolumeClaims should be provisioned and bound.
Supported values:
- Immediate
- WaitForFirstConsumer | `string` | `"WaitForFirstConsumer"` | no | +| [storage\_type](#input\_storage\_type) | The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview)
to be used. This module currently supports dynamic provisioning for the below storage options
- Parallelstore
- Hyperdisk-balanced
- Hyperdisk-throughput
- Hyperdisk-extreme | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [persistent\_volume\_claims](#output\_persistent\_volume\_claims) | An object that describes a k8s PVC created by this module. | + diff --git a/modules/file-system/gke-storage/main.tf b/modules/file-system/gke-storage/main.tf new file mode 100644 index 0000000000..18f85fa779 --- /dev/null +++ b/modules/file-system/gke-storage/main.tf @@ -0,0 +1,78 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + # This label allows for billing report tracking based on module. + labels = merge(var.labels, { ghpc_module = "gke-storage", ghpc_role = "file-system" }) +} + +locals { + storage_type = lower(var.storage_type) + storage_class_name = "${local.storage_type}-sc" + pvc_name_prefix = "${local.storage_type}-pvc" +} + +check "private_vpc_connection_peering" { + assert { + condition = lower(var.storage_type) != "parallelstore" ? true : var.private_vpc_connection_peering != null + error_message = <<-EOT + Parallelstore must be run within the same VPC as the GKE cluster and have private services access enabled. + If using new VPC, please use community/modules/network/private-service-access to create private-service-access. + If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). + EOT + } +} + +module "kubectl_apply" { + source = "../../management/kubectl-apply" + + cluster_id = var.cluster_id + project_id = var.project_id + + # count = var.pvc_count + apply_manifests = flatten( + [ + # create StorageClass in the cluster + { + content = templatefile( + "${path.module}/storage-class/${local.storage_class_name}.yaml.tftpl", + { + name = local.storage_class_name + labels = local.labels + volume_binding_mode = var.sc_volume_binding_mode + reclaim_policy = var.sc_reclaim_policy + topology_zones = var.sc_topology_zones + }) + }, + # create PersistentVolumeClaim in the cluster + flatten([ + for idx in range(var.pvc_count) : [ + { + content = templatefile( + "${path.module}/persistent-volume-claim/${(local.pvc_name_prefix)}.yaml.tftpl", + { + pvc_name = "${local.pvc_name_prefix}-${idx}" + labels = local.labels + capacity = "${var.capacity_gb}Gi" + access_mode = var.access_mode + storage_class_name = local.storage_class_name + } + ) + } + ] + ]) + ]) +} diff --git a/modules/file-system/gke-storage/metadata.yaml b/modules/file-system/gke-storage/metadata.yaml new file mode 100644 index 0000000000..8722823274 --- /dev/null +++ b/modules/file-system/gke-storage/metadata.yaml @@ -0,0 +1,18 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +spec: + requirements: + services: [] diff --git a/modules/file-system/gke-storage/outputs.tf b/modules/file-system/gke-storage/outputs.tf new file mode 100644 index 0000000000..b789674814 --- /dev/null +++ b/modules/file-system/gke-storage/outputs.tf @@ -0,0 +1,27 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +output "persistent_volume_claims" { + description = "An object that describes a k8s PVC created by this module." + value = flatten([ + for idx in range(var.pvc_count) : [{ + name = "${local.pvc_name_prefix}-${idx}" + mount_path = "${var.pv_mount_path}/${local.pvc_name_prefix}-${idx}" + mount_options = var.mount_options + is_gcs = false + }] + ]) +} diff --git a/modules/file-system/gke-storage/persistent-volume-claim/parallelstore-pvc.yaml.tftpl b/modules/file-system/gke-storage/persistent-volume-claim/parallelstore-pvc.yaml.tftpl new file mode 100644 index 0000000000..32781be2fb --- /dev/null +++ b/modules/file-system/gke-storage/persistent-volume-claim/parallelstore-pvc.yaml.tftpl @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ${pvc_name} + labels: + %{~ for key, val in labels ~} + ${key}: ${val} + %{~ endfor ~} +spec: + accessModes: + - ${access_mode} + resources: + requests: + storage: ${capacity} + storageClassName: ${storage_class_name} diff --git a/modules/file-system/gke-storage/storage-class/parallelstore-sc.yaml.tftpl b/modules/file-system/gke-storage/storage-class/parallelstore-sc.yaml.tftpl new file mode 100644 index 0000000000..e6b8ea8d3e --- /dev/null +++ b/modules/file-system/gke-storage/storage-class/parallelstore-sc.yaml.tftpl @@ -0,0 +1,21 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: ${name} + labels: + %{~ for key, val in labels ~} + ${key}: ${val} + %{~ endfor ~} +provisioner: parallelstore.csi.storage.gke.io +parameters: +volumeBindingMode: ${volume_binding_mode} +reclaimPolicy: ${reclaim_policy} + %{~ if topology_zones != null ~} +allowedTopologies: +- matchLabelExpressions: + - key: topology.gke.io/zone + values: + %{~ for z in topology_zones ~} + - ${z} + %{~ endfor ~} + %{~ endif ~} diff --git a/modules/file-system/gke-storage/variables.tf b/modules/file-system/gke-storage/variables.tf new file mode 100644 index 0000000000..97ff1af21b --- /dev/null +++ b/modules/file-system/gke-storage/variables.tf @@ -0,0 +1,134 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +variable "project_id" { + description = "The project ID to host the cluster in." + type = string +} + +variable "cluster_id" { + description = "An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}`" + type = string +} + +variable "labels" { + description = "GCE resource labels to be applied to resources. Key-value pairs." + type = map(string) +} + +variable "storage_type" { + description = <<-EOT + The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview) + to used. This module currently support dynamic provisioning for the below storage options + - Parallelstore + - Hyperdisk-balanced + - Hyperdisk-throughput + - Hyperdisk-extreme + EOT + type = string + nullable = false + validation { + condition = var.storage_type == null ? false : contains(["parallelstore", "hyperdisk-balanced", "hyperdisk-throughput", "hyperdisk-extreme"], lower(var.storage_type)) + error_message = "Allowed string values for var.storage_type are \"Parallelstore\", \"Hyperdisk-balanced\", \"Hyperdisk-throughput\", \"Hyperdisk-extreme\"." + } +} + +variable "access_mode" { + description = <<-EOT + The access mode that the volume can be mounted to the host/pod. More details in [Access Modes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes) + Valid access modes: + - ReadWriteOnce + - ReadOnlyMany + - ReadWriteMany + - ReadWriteOncePod + EOT + type = string + nullable = false + validation { + condition = var.access_mode == null ? false : contains(["readwriteonce", "readonlymany", "readwritemany", "readwriteoncepod"], lower(var.access_mode)) + error_message = "Allowed string values for var.access_mode are \"ReadWriteOnce\", \"ReadOnlyMany\", \"ReadWriteMany\", \"ReadWriteOncePod\"." + } +} + +variable "sc_volume_binding_mode" { + description = <<-EOT + Indicates when volume binding and dynamic provisioning should occur and how PersistentVolumeClaims should be provisioned and bound. + Supported value: + - Immediate + - WaitForFirstConsumer + EOT + type = string + default = "WaitForFirstConsumer" + validation { + condition = var.sc_volume_binding_mode == null ? true : contains(["immediate", "waitforfirstconsumer"], lower(var.sc_volume_binding_mode)) + error_message = "Allowed string values for var.sc_volume_binding_mode are \"Immediate\", \"WaitForFirstConsumer\"." + } +} + +variable "sc_reclaim_policy" { + description = <<-EOT + Indicate whether to keep the dynamically provisioned PersistentVolumes of this storage class after the bound PersistentVolumeClaim is deleted. + [More details about reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming) + Supported value: + - Retain + - Delete + EOT + type = string + nullable = false + validation { + condition = var.sc_reclaim_policy == null ? true : contains(["retain", "delete"], lower(var.sc_reclaim_policy)) + error_message = "Allowed string values for var.sc_reclaim_policy are \"Retain\", \"Delete\"." 
+ } +} + +variable "sc_topology_zones" { + description = "Zone location that allow the volumes to be dynamically provisioned." + type = list(string) + default = null +} + +variable "pvc_count" { + description = "How many PersistentVolumeClaims that will be created" + type = number + default = 1 +} + +variable "pv_mount_path" { + description = "Path within the container at which the volume should be mounted. Must not contain ':'." + type = string + default = "/data" +} + +variable "mount_options" { + description = "Controls the mountOptions for dynamically provisioned PersistentVolumes of this storage class." + type = string + default = null +} + +variable "capacity_gb" { + description = "The storage capacity with which to create the persistent volume." + type = number +} + +variable "private_vpc_connection_peering" { + description = <<-EOT + The name of the VPC Network peering connection. + If using new VPC, please use community/modules/network/private-service-access to create private-service-access and + If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). + EOT + type = string + default = null +} diff --git a/modules/file-system/gke-storage/versions.tf b/modules/file-system/gke-storage/versions.tf new file mode 100644 index 0000000000..0a1082c515 --- /dev/null +++ b/modules/file-system/gke-storage/versions.tf @@ -0,0 +1,21 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_version = ">= 1.0" + + provider_meta "google" { + module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.39.0" + } +} diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 583af203da..554517f8d4 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -110,6 +110,7 @@ limitations under the License. | [google](#requirement\_google) | > 5.0 | | [google-beta](#requirement\_google-beta) | > 5.0 | | [kubernetes](#requirement\_kubernetes) | ~> 2.23 | +| [null](#requirement\_null) | ~> 3.0 | ## Providers @@ -117,6 +118,7 @@ limitations under the License. |------|---------| | [google](#provider\_google) | > 5.0 | | [google-beta](#provider\_google-beta) | > 5.0 | +| [null](#provider\_null) | ~> 3.0 | ## Modules @@ -137,6 +139,7 @@ limitations under the License. 
| [google_project_iam_member.node_service_account_metric_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | | [google_project_iam_member.node_service_account_monitoring_viewer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | | [google_project_iam_member.node_service_account_resource_metadata_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | +| [null_resource.enable_parallelstore_csi](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | | [google_compute_default_service_account.default_sa](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | @@ -144,7 +147,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks enables multi networking and creates relevat network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks enables multi networking and creates relevat network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [authenticator\_security\_group](#input\_authenticator\_security\_group) | The name of the RBAC security group for use with Google security groups in Kubernetes RBAC. Group name must be in format gke-security-groups@yourdomain.com | `string` | `null` | no | | [autoscaling\_profile](#input\_autoscaling\_profile) | (Beta) Optimize for utilization or availability when deciding to remove nodes. Can be BALANCED or OPTIMIZE\_UTILIZATION. | `string` | `"OPTIMIZE_UTILIZATION"` | no | | [configure\_workload\_identity\_sa](#input\_configure\_workload\_identity\_sa) | When true, a kubernetes service account will be created and bound using workload identity to the service account used to create the cluster. | `bool` | `false` | no | @@ -154,15 +157,16 @@ limitations under the License. | [enable\_gcsfuse\_csi](#input\_enable\_gcsfuse\_csi) | The status of the GCSFuse Filestore Container Storage Interface (CSI) driver addon, which allows the usage of a gcs bucket as volumes. | `bool` | `false` | no | | [enable\_master\_global\_access](#input\_enable\_master\_global\_access) | Whether the cluster master is accessible globally (from any region) or only within the same region as the private endpoint. | `bool` | `false` | no | | [enable\_multi\_networking](#input\_enable\_multi\_networking) | Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) (Requires GKE Enterprise). This setting is immutable on clusters and enables [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en). If null, will determine state based on if additional\_networks are passed in. | `bool` | `null` | no | +| [enable\_parallelstore\_csi](#input\_enable\_parallelstore\_csi) | The status of the Google Compute Engine Parallelstore Container Storage Interface (CSI) driver addon, which allows the usage of a parallelstore as volumes. | `bool` | `false` | no | | [enable\_persistent\_disk\_csi](#input\_enable\_persistent\_disk\_csi) | The status of the Google Compute Engine Persistent Disk Container Storage Interface (CSI) driver addon, which allows the usage of a PD as volumes. | `bool` | `true` | no | | [enable\_private\_endpoint](#input\_enable\_private\_endpoint) | (Beta) Whether the master's internal IP address is used as the cluster endpoint. | `bool` | `true` | no | | [enable\_private\_ipv6\_google\_access](#input\_enable\_private\_ipv6\_google\_access) | The private IPv6 google access type for the VMs in this subnet. | `bool` | `true` | no | | [enable\_private\_nodes](#input\_enable\_private\_nodes) | (Beta) Whether nodes have internal IP addresses only. | `bool` | `true` | no | | [gcp\_public\_cidrs\_access\_enabled](#input\_gcp\_public\_cidrs\_access\_enabled) | Whether the cluster master is accessible via all the Google Compute Engine Public IPs. To view this list of IP addresses look here https://cloud.google.com/compute/docs/faq#find_ip_range | `bool` | `false` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [maintenance\_exclusions](#input\_maintenance\_exclusions) | List of maintenance exclusions. A cluster can have up to three. |
list(object({
name = string
start_time = string
end_time = string
exclusion_scope = string
}))
| `[]` | no | +| [maintenance\_exclusions](#input\_maintenance\_exclusions) | List of maintenance exclusions. A cluster can have up to three. |
list(object({
name = string
start_time = string
end_time = string
exclusion_scope = string
}))
| `[]` | no | | [maintenance\_start\_time](#input\_maintenance\_start\_time) | Start time for daily maintenance operations. Specified in GMT with `HH:MM` format. | `string` | `"09:00"` | no | -| [master\_authorized\_networks](#input\_master\_authorized\_networks) | External network that can access Kubernetes master through HTTPS. Must be specified in CIDR notation. |
list(object({
cidr_block = string
display_name = string
}))
| `[]` | no | +| [master\_authorized\_networks](#input\_master\_authorized\_networks) | External network that can access Kubernetes master through HTTPS. Must be specified in CIDR notation. |
list(object({
cidr_block = string
display_name = string
}))
| `[]` | no | | [master\_ipv4\_cidr\_block](#input\_master\_ipv4\_cidr\_block) | (Beta) The IP range in CIDR notation to use for the hosted master network. | `string` | `"172.16.0.32/28"` | no | | [min\_master\_version](#input\_min\_master\_version) | The minimum version of the master. If unset, the cluster's version will be set by GKE to the version of the most recent official release. | `string` | `null` | no | | [name\_suffix](#input\_name\_suffix) | Custom cluster name postpended to the `deployment_name`. See `prefix_with_deployment_name`. | `string` | `""` | no | @@ -172,19 +176,19 @@ limitations under the License. | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [region](#input\_region) | The region to host the cluster in. | `string` | n/a | yes | | [release\_channel](#input\_release\_channel) | The release channel of this cluster. Accepted values are `UNSPECIFIED`, `RAPID`, `REGULAR` and `STABLE`. | `string` | `"UNSPECIFIED"` | no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the system node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the system node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the system node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [services\_ip\_range\_name](#input\_services\_ip\_range\_name) | The name of the secondary subnet range to use for services. | `string` | `"services"` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork to host the cluster in. | `string` | n/a | yes | | [system\_node\_pool\_enable\_secure\_boot](#input\_system\_node\_pool\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [system\_node\_pool\_enabled](#input\_system\_node\_pool\_enabled) | Create a system node pool. | `bool` | `true` | no | | [system\_node\_pool\_image\_type](#input\_system\_node\_pool\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | -| [system\_node\_pool\_kubernetes\_labels](#input\_system\_node\_pool\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | +| [system\_node\_pool\_kubernetes\_labels](#input\_system\_node\_pool\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [system\_node\_pool\_machine\_type](#input\_system\_node\_pool\_machine\_type) | Machine type for the system node pool. | `string` | `"e2-standard-4"` | no | | [system\_node\_pool\_name](#input\_system\_node\_pool\_name) | Name of the system node pool. | `string` | `"system"` | no | -| [system\_node\_pool\_node\_count](#input\_system\_node\_pool\_node\_count) | The total min and max nodes to be maintained in the system node pool. |
object({
total_min_nodes = number
total_max_nodes = number
})
|
{
"total_max_nodes": 10,
"total_min_nodes": 2
}
| no | -| [system\_node\_pool\_taints](#input\_system\_node\_pool\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "components.gke.io/gke-managed-components",
"value": true
}
]
| no | +| [system\_node\_pool\_node\_count](#input\_system\_node\_pool\_node\_count) | The total min and max nodes to be maintained in the system node pool. |
object({
total_min_nodes = number
total_max_nodes = number
})
|
{
"total_max_nodes": 10,
"total_min_nodes": 2
}
| no | +| [system\_node\_pool\_taints](#input\_system\_node\_pool\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "components.gke.io/gke-managed-components",
"value": true
}
]
| no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 480d5b7d58..698beea442 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -267,6 +267,17 @@ resource "google_container_node_pool" "system_node_pools" { } } +### TODO: remove this after Terraform support for GKE Parallelstore CSI is added. ### +### Instead use addons_config above to enable the CSI ### +resource "null_resource" "enable_parallelstore_csi" { + count = var.enable_parallelstore_csi == true ? 1 : 0 + + provisioner "local-exec" { + command = "gcloud container clusters update ${local.name} --location=${var.region} --project=${var.project_id} --update-addons=ParallelstoreCsiDriver=ENABLED" + } + depends_on = [google_container_node_pool.system_node_pools] # avoid cluster operation conflict +} + # For container logs to show up under Cloud Logging and GKE metrics to show up # on Cloud Monitoring console, some project level roles are needed for the # node_service_account diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index e91be6b297..a291d58a1a 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -127,6 +127,12 @@ variable "enable_persistent_disk_csi" { default = true } +variable "enable_parallelstore_csi" { + description = "The status of the Google Compute Engine Parallelstore Container Storage Interface (CSI) driver addon, which allows the usage of a parallelstore as volumes." + type = bool + default = false +} + variable "system_node_pool_enabled" { description = "Create a system node pool." type = bool diff --git a/modules/scheduler/gke-cluster/versions.tf b/modules/scheduler/gke-cluster/versions.tf index d2fe8dd057..ad17fe1c43 100644 --- a/modules/scheduler/gke-cluster/versions.tf +++ b/modules/scheduler/gke-cluster/versions.tf @@ -28,6 +28,10 @@ terraform { source = "hashicorp/kubernetes" version = "~> 2.23" } + null = { + source = "hashicorp/null" + version = "~> 3.0" + } } provider_meta "google" { module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.40.0" diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml new file mode 100644 index 0000000000..424908f436 --- /dev/null +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml @@ -0,0 +1,41 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +- name: Get cluster credentials for kubectl + delegate_to: localhost + ansible.builtin.command: gcloud container clusters get-credentials {{ deployment_name }} --region {{ cli_deployment_vars.region }} --project {{ custom_vars.project }} + +- name: Execute the job + delegate_to: localhost + ansible.builtin.shell: | + jobs=({{ workspace }}/{{ deployment_name }}/primary/my-job*) + for job in "${jobs[@]}"; do + kubectl create -f "$job" + done + args: + executable: /bin/bash + changed_when: False + +- name: Wait for job to complete + delegate_to: localhost + ansible.builtin.command: | + kubectl get job --field-selector status.successful=5 + register: job_completion + until: job_completion.stdout_lines | length > 1 + retries: 40 + delay: 15 + +- name: Print job_completion debug output + ansible.builtin.debug: + var: job_completion.stdout_lines diff --git a/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml b/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml new file mode 100644 index 0000000000..1a6a5873cf --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml @@ -0,0 +1,60 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +tags: +- m.gke-cluster +- m.gke-job-template +- m.gke-node-pool +- m.gke-storage +- m.private-service-access +- m.vpc +- gke + +timeout: 14400s # 4hr + +steps: +## Test GKE +- id: gke-storage-parallelstore + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + SG_EXAMPLE=examples/gke-storage-parallelstore.yaml + + # adding vm to act as remote node + echo ' - id: remote-node' >> $${SG_EXAMPLE} + echo ' source: modules/compute/vm-instance' >> $${SG_EXAMPLE} + echo ' use: [network]' >> $${SG_EXAMPLE} + echo ' settings:' >> $${SG_EXAMPLE} + echo ' machine_type: e2-standard-2' >> $${SG_EXAMPLE} + echo ' zone: us-central1-a' >> $${SG_EXAMPLE} + + # avoids conflict with other tests + sed -i "s/gke-subnet/gke-subnet-$${BUILD_ID_SHORT}/" $${SG_EXAMPLE} + + IP=$(curl ifconfig.me) + sed -i "s//$${IP}/" $${SG_EXAMPLE} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml" diff --git a/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml b/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml new file mode 100644 index 0000000000..6a43c01ab3 --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml @@ -0,0 +1,28 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +test_name: gke-storage-parallelstore +deployment_name: gke-storage-parallelstore-{{ build }} +zone: us-central1-a # for remote node +region: us-central1 +workspace: /workspace +blueprint_yaml: "{{ workspace }}/examples/gke-storage-parallelstore.yaml" +network: "{{ deployment_name }}-net" +remote_node: "{{ deployment_name }}-0" +post_deploy_tests: +- test-validation/test-gke-storage-parallelstore.yml +custom_vars: + project: "{{ project }}" +cli_deployment_vars: + region: "{{ region }}" From 63a68c05aabdbdaca4c12a4021e8abb4cc7540d1 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Tue, 8 Oct 2024 00:54:33 +0000 Subject: [PATCH 045/102] undo nccl test instruction to clean up the branch --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/outputs.tf | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 597f28cfe0..9f86002f8c 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -295,7 +295,7 @@ limitations under the License. | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/outputs.tf b/modules/compute/gke-node-pool/outputs.tf index 58216e957f..8be6a2772a 100644 --- a/modules/compute/gke-node-pool/outputs.tf +++ b/modules/compute/gke-node-pool/outputs.tf @@ -80,14 +80,6 @@ locals { You can use the following commands to submit the sample job: kubectl create -f ${abspath(local.gpu_direct_setting.updated_workload_path)} - After submitting the sample job, you can validate the GPU performance by initiating NCCL test included in the sample workload: - NCCL test can be initiated from any one of the sample job Pods and coordinate with the peer Pods: - export POD_NAME=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}' | head -n 1) - export PEER_POD_IPS=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.status.podIP}}{{" "}}{{end}}') - kubectl exec --stdin --tty --container=nccl-test $POD_NAME -- /scripts/allgather.sh $PEER_POD_IPS - Depends on the Msg size used for transmission in the test, the busbw would different a bit. 
- For a3-highgpu machines, the expected busbw for MsgSize of 8G data should be around 80 GB/s - For a3-megagpu machines, the expected busbw for MsgSize of 8G data should be around 160 GB/s If you would like to enable GPUDirect for your own workload, please follow the below steps: export WORKLOAD_PATH=<> From 061ce66d7339f8694b01f08703d4c94cef6f334c Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Mon, 7 Oct 2024 12:43:27 -0700 Subject: [PATCH 046/102] Retry `wait-for-startup` script on internal error --- .../scripts/wait-for-startup-status.sh | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/community/modules/scripts/wait-for-startup/scripts/wait-for-startup-status.sh b/community/modules/scripts/wait-for-startup/scripts/wait-for-startup-status.sh index 7b957bf66b..4a231f7def 100755 --- a/community/modules/scripts/wait-for-startup/scripts/wait-for-startup-status.sh +++ b/community/modules/scripts/wait-for-startup/scripts/wait-for-startup-status.sh @@ -60,14 +60,29 @@ FINISH_LINE="startup-script exit status" # Match string for failures on the new guest agent FINISH_LINE_ERR="Script.*failed with error:" +NON_FATAL_ERRORS=( + "Internal error" +) + until [[ now -gt deadline ]]; do ser_log=$( set -o pipefail ${fetch_cmd} 2>"${error_file}" | c1grep "${FINISH_LINE}\|${FINISH_LINE_ERR}" ) || { - cat "${error_file}" - exit 1 + err=$(cat "${error_file}") + echo "$err" + fatal_error="true" + for e in "${NON_FATAL_ERRORS[@]}"; do + if [[ $err = *"$e"* ]]; then + fatal_error="false" + break + fi + done + + if [[ $fatal_error = "true" ]]; then + exit 1 + fi } if [[ -n "${ser_log}" ]]; then break; fi echo "Could not detect end of startup script. Sleeping." From 1e177828603138773f2ac0625ee0fe5fec576552 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Tue, 8 Oct 2024 02:32:49 +0000 Subject: [PATCH 047/102] upgrade local terraform-doc version --- .../gke-persistent-volume/variables.tf | 2 +- modules/file-system/gke-storage/README.md | 10 +++++----- modules/file-system/gke-storage/variables.tf | 2 +- modules/scheduler/gke-cluster/README.md | 16 ++++++++-------- modules/scheduler/gke-cluster/variables.tf | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/modules/file-system/gke-persistent-volume/variables.tf b/modules/file-system/gke-persistent-volume/variables.tf index a72fa3857f..80e21d0b8f 100644 --- a/modules/file-system/gke-persistent-volume/variables.tf +++ b/modules/file-system/gke-persistent-volume/variables.tf @@ -57,6 +57,6 @@ variable "capacity_gb" { } variable "labels" { - description = "GCE resource labels to be applied to resources. Key-value pairs." + description = "GCE resource labels to be applied to resources. Key-value pairs. " type = map(string) } diff --git a/modules/file-system/gke-storage/README.md b/modules/file-system/gke-storage/README.md index 7fbef919a4..1a63731e4c 100644 --- a/modules/file-system/gke-storage/README.md +++ b/modules/file-system/gke-storage/README.md @@ -107,19 +107,19 @@ No resources. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [access\_mode](#input\_access\_mode) | The access mode that the volume can be mounted to the host/pod. More details in [Access Modes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes)
Valid access modes:
- ReadWriteOnce
- ReadOnlyMany
- ReadWriteMany
- ReadWriteOncePod | `string` | n/a | yes | +| [access\_mode](#input\_access\_mode) | The access mode that the volume can be mounted to the host/pod. More details in [Access Modes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes)
Valid access modes:
- ReadWriteOnce
- ReadOnlyMany
- ReadWriteMany
- ReadWriteOncePod | `string` | n/a | yes | | [capacity\_gb](#input\_capacity\_gb) | The storage capacity with which to create the persistent volume. | `number` | n/a | yes | | [cluster\_id](#input\_cluster\_id) | An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}` | `string` | n/a | yes | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [mount\_options](#input\_mount\_options) | Controls the mountOptions for dynamically provisioned PersistentVolumes of this storage class. | `string` | `null` | no | -| [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection.
If using new VPC, please use community/modules/network/private-service-access to create private-service-access and
If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). | `string` | `null` | no | +| [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection .
If using new VPC, please use community/modules/network/private-service-access to create private-service-access and
If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). | `string` | `null` | no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [pv\_mount\_path](#input\_pv\_mount\_path) | Path within the container at which the volume should be mounted. Must not contain ':'. | `string` | `"/data"` | no | | [pvc\_count](#input\_pvc\_count) | How many PersistentVolumeClaims that will be created | `number` | `1` | no | -| [sc\_reclaim\_policy](#input\_sc\_reclaim\_policy) | Indicate whether to keep the dynamically provisioned PersistentVolumes of this storage class after the bound PersistentVolumeClaim is deleted.
[More details about reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming)
Supported value:
- Retain
- Delete | `string` | n/a | yes | +| [sc\_reclaim\_policy](#input\_sc\_reclaim\_policy) | Indicate whether to keep the dynamically provisioned PersistentVolumes of this storage class after the bound PersistentVolumeClaim is deleted.
[More details about reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming)
Supported value:
- Retain
- Delete | `string` | n/a | yes | | [sc\_topology\_zones](#input\_sc\_topology\_zones) | Zone location that allow the volumes to be dynamically provisioned. | `list(string)` | `null` | no | -| [sc\_volume\_binding\_mode](#input\_sc\_volume\_binding\_mode) | Indicates when volume binding and dynamic provisioning should occur and how PersistentVolumeClaims should be provisioned and bound.
Supported value:
- Immediate
- WaitForFirstConsumer | `string` | `"WaitForFirstConsumer"` | no | -| [storage\_type](#input\_storage\_type) | The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview)
to used. This module currently support dynamic provisioning for the below storage options
- Parallelstore
- Hyperdisk-balanced
- Hyperdisk-throughput
- Hyperdisk-extreme | `string` | n/a | yes | +| [sc\_volume\_binding\_mode](#input\_sc\_volume\_binding\_mode) | Indicates when volume binding and dynamic provisioning should occur and how PersistentVolumeClaims should be provisioned and bound.
Supported value:
- Immediate
- WaitForFirstConsumer | `string` | `"WaitForFirstConsumer"` | no | +| [storage\_type](#input\_storage\_type) | The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview)
to used. This module currently support dynamic provisioning for the below storage options
- Parallelstore
- Hyperdisk-balanced
- Hyperdisk-throughput
- Hyperdisk-extreme | `string` | n/a | yes | ## Outputs diff --git a/modules/file-system/gke-storage/variables.tf b/modules/file-system/gke-storage/variables.tf index 97ff1af21b..3fd672699f 100644 --- a/modules/file-system/gke-storage/variables.tf +++ b/modules/file-system/gke-storage/variables.tf @@ -125,7 +125,7 @@ variable "capacity_gb" { variable "private_vpc_connection_peering" { description = <<-EOT - The name of the VPC Network peering connection. + The name of the VPC Network peering connection . If using new VPC, please use community/modules/network/private-service-access to create private-service-access and If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). EOT diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 554517f8d4..78974d091d 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -147,7 +147,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks enables multi networking and creates relevat network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks enables multi networking and creates relevat network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [authenticator\_security\_group](#input\_authenticator\_security\_group) | The name of the RBAC security group for use with Google security groups in Kubernetes RBAC. Group name must be in format gke-security-groups@yourdomain.com | `string` | `null` | no | | [autoscaling\_profile](#input\_autoscaling\_profile) | (Beta) Optimize for utilization or availability when deciding to remove nodes. Can be BALANCED or OPTIMIZE\_UTILIZATION. | `string` | `"OPTIMIZE_UTILIZATION"` | no | | [configure\_workload\_identity\_sa](#input\_configure\_workload\_identity\_sa) | When true, a kubernetes service account will be created and bound using workload identity to the service account used to create the cluster. | `bool` | `false` | no | @@ -164,9 +164,9 @@ limitations under the License. | [enable\_private\_nodes](#input\_enable\_private\_nodes) | (Beta) Whether nodes have internal IP addresses only. | `bool` | `true` | no | | [gcp\_public\_cidrs\_access\_enabled](#input\_gcp\_public\_cidrs\_access\_enabled) | Whether the cluster master is accessible via all the Google Compute Engine Public IPs. To view this list of IP addresses look here https://cloud.google.com/compute/docs/faq#find_ip_range | `bool` | `false` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [maintenance\_exclusions](#input\_maintenance\_exclusions) | List of maintenance exclusions. A cluster can have up to three. |
list(object({
name = string
start_time = string
end_time = string
exclusion_scope = string
}))
| `[]` | no | +| [maintenance\_exclusions](#input\_maintenance\_exclusions) | List of maintenance exclusions. A cluster can have up to three. |
list(object({
name = string
start_time = string
end_time = string
exclusion_scope = string
}))
| `[]` | no | | [maintenance\_start\_time](#input\_maintenance\_start\_time) | Start time for daily maintenance operations. Specified in GMT with `HH:MM` format. | `string` | `"09:00"` | no | -| [master\_authorized\_networks](#input\_master\_authorized\_networks) | External network that can access Kubernetes master through HTTPS. Must be specified in CIDR notation. |
list(object({
cidr_block = string
display_name = string
}))
| `[]` | no | +| [master\_authorized\_networks](#input\_master\_authorized\_networks) | External network that can access Kubernetes master through HTTPS. Must be specified in CIDR notation. |
list(object({
cidr_block = string
display_name = string
}))
| `[]` | no | | [master\_ipv4\_cidr\_block](#input\_master\_ipv4\_cidr\_block) | (Beta) The IP range in CIDR notation to use for the hosted master network. | `string` | `"172.16.0.32/28"` | no | | [min\_master\_version](#input\_min\_master\_version) | The minimum version of the master. If unset, the cluster's version will be set by GKE to the version of the most recent official release. | `string` | `null` | no | | [name\_suffix](#input\_name\_suffix) | Custom cluster name postpended to the `deployment_name`. See `prefix_with_deployment_name`. | `string` | `""` | no | @@ -176,19 +176,19 @@ limitations under the License. | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [region](#input\_region) | The region to host the cluster in. | `string` | n/a | yes | | [release\_channel](#input\_release\_channel) | The release channel of this cluster. Accepted values are `UNSPECIFIED`, `RAPID`, `REGULAR` and `STABLE`. | `string` | `"UNSPECIFIED"` | no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the system node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the system node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the system node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [services\_ip\_range\_name](#input\_services\_ip\_range\_name) | The name of the secondary subnet range to use for services. | `string` | `"services"` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork to host the cluster in. | `string` | n/a | yes | | [system\_node\_pool\_enable\_secure\_boot](#input\_system\_node\_pool\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [system\_node\_pool\_enabled](#input\_system\_node\_pool\_enabled) | Create a system node pool. | `bool` | `true` | no | | [system\_node\_pool\_image\_type](#input\_system\_node\_pool\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | -| [system\_node\_pool\_kubernetes\_labels](#input\_system\_node\_pool\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | +| [system\_node\_pool\_kubernetes\_labels](#input\_system\_node\_pool\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [system\_node\_pool\_machine\_type](#input\_system\_node\_pool\_machine\_type) | Machine type for the system node pool. | `string` | `"e2-standard-4"` | no | | [system\_node\_pool\_name](#input\_system\_node\_pool\_name) | Name of the system node pool. | `string` | `"system"` | no | -| [system\_node\_pool\_node\_count](#input\_system\_node\_pool\_node\_count) | The total min and max nodes to be maintained in the system node pool. |
object({
total_min_nodes = number
total_max_nodes = number
})
|
{
"total_max_nodes": 10,
"total_min_nodes": 2
}
| no | -| [system\_node\_pool\_taints](#input\_system\_node\_pool\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "components.gke.io/gke-managed-components",
"value": true
}
]
| no | +| [system\_node\_pool\_node\_count](#input\_system\_node\_pool\_node\_count) | The total min and max nodes to be maintained in the system node pool. |
object({
total_min_nodes = number
total_max_nodes = number
})
|
{
"total_max_nodes": 10,
"total_min_nodes": 2
}
| no | +| [system\_node\_pool\_taints](#input\_system\_node\_pool\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "components.gke.io/gke-managed-components",
"value": true
}
]
| no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index a291d58a1a..4c2e049d46 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -128,7 +128,7 @@ variable "enable_persistent_disk_csi" { } variable "enable_parallelstore_csi" { - description = "The status of the Google Compute Engine Parallelstore Container Storage Interface (CSI) driver addon, which allows the usage of a parallelstore as volumes." + description = "The status of the Google Compute Engine Parallelstore Container Storage Interface (CSI) driver addon, which allows the usage of a parallelstore as volumes. " type = bool default = false } From 35ea25423bfa7bda27583d9c6df5a37b48b24ec1 Mon Sep 17 00:00:00 2001 From: abbas1902 Date: Tue, 8 Oct 2024 17:02:55 +0000 Subject: [PATCH 048/102] Added warning for v5 Slurm deployments --- cmd/create.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/cmd/create.go b/cmd/create.go index c20fc5f121..17ec0eb442 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -125,9 +125,27 @@ func expandOrDie(path string) (config.Blueprint, *config.YamlCtx) { // Expand the blueprint checkErr(bp.Expand(), ctx) validateMaybeDie(bp, *ctx) + v5DeprecationWarning(bp) + return bp, ctx } +// TODO: Remove this warning when v5 deprecation is complete +func v5DeprecationWarning(bp config.Blueprint) { + alreadyContainsV5 := false + bp.WalkModulesSafe(func(mp config.ModulePath, m *config.Module) { + if strings.Contains(m.Source, "schedmd-slurm-gcp-v5-controller") && !alreadyContainsV5 { + logging.Info(boldYellow( + "We have been supporting slurm-gcp v5 since July 2022 and are now deprecating it, as we've launched slurm-gcp v6 in June 2024. \n" + + "Toolkit blueprints using Slurm-gcp v5 will be marked “deprecated” starting October 2024 and slurm-gcp v6 will be the default deployment. \n" + + "However we won't begin removing slurm-gcp v5 blueprints until January 6, 2025. Beginning on January 6, 2025, the Cluster Toolkit team will cease their support for Slurm-gcp v5. 
\n" + + "While this will not directly or immediately impact running clusters, we recommend replacing any v5 clusters with Slurm-gcp v6.", + )) + alreadyContainsV5 = true // This is to avoid the logging message showing repeatedly for multiple v5 controllers + } + }) +} + // TODO: move to expand.go func validateMaybeDie(bp config.Blueprint, ctx config.YamlCtx) { err := validators.Execute(bp) From 6d56d9a110c2f3a206f3547514bb78a401d53111 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 8 Oct 2024 20:22:48 +0000 Subject: [PATCH 049/102] SlurmGCP `6.8.0 -> 6.8.1` --- .../schedmd-slurm-gcp-v6-nodeset-dynamic/README.md | 2 +- .../schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 12 ++++++------ .../schedmd-slurm-gcp-v6-controller/controller.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/partition.tf | 4 ++-- .../a3-highgpu-8g/ml-slurm-a3-1-image.yaml | 2 +- .../a3-megagpu-8g/slurm-a3mega-image.yaml | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index 72f4fccb9f..4d790fe703 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -74,7 +74,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Source | Version | |------|--------|---------| -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.0 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index 7e547c3d5f..3f0ee54af8 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -56,7 +56,7 @@ locals { } module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 30f002d68f..a9d801d8c7 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -238,13 +238,13 @@ limitations under the License. 
| [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | | [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.0 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.0 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.1 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.0 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.0 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.0 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.8.0 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.1 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.8.1 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 0148323597..9b105d7f39 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -43,7 +43,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" project_id = var.project_id region = var.region @@ -99,7 +99,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.0" + source = 
"github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.1" access_config = var.enable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index d9cb38ff07..de97810316 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -56,7 +56,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.1" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 9be62f82f7..849844808a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local template module module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" for_each = local.nodeset_map project_id = var.project_id @@ -101,7 +101,7 @@ locals { # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.8.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.8.1" for_each = local.nodeset_tpu_map project_id = var.project_id diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml index c50454739e..b817972331 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -94,7 +94,7 @@ deployment_groups: set -e -o pipefail ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.0 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.1 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index 6ba58f0308..67f33cde7d 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ 
-108,7 +108,7 @@ deployment_groups: apt-get install -y git ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.0 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.1 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml From 0259f4b9fa2213a16d4299ef87df115d785fefd1 Mon Sep 17 00:00:00 2001 From: abbas1902 Date: Tue, 1 Oct 2024 20:52:49 +0000 Subject: [PATCH 050/102] add validation for multi-host tpu --- .../schedmd-slurm-gcp-v6-nodeset-tpu/README.md | 6 +++--- .../schedmd-slurm-gcp-v6-nodeset-tpu/main.tf | 7 +++++++ .../outputs.tf | 18 +++++++++++++++++- .../variables.tf | 15 ++++++++++++--- 4 files changed, 39 insertions(+), 7 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md index fac8a63d44..8db3950334 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md @@ -63,9 +63,9 @@ No resources. | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | | [name](#input\_name) | Name of the nodeset. Automatically populated by the module id if not set.
If setting manually, ensure a unique value across all nodesets. | `string` | n/a | yes | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | -| [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of auto-scaling nodes allowed in this partition. | `number` | `5` | no | -| [node\_count\_static](#input\_node\_count\_static) | Number of nodes to be statically created. | `number` | `0` | no | -| [node\_type](#input\_node\_type) | Specify a node type to base the vm configuration upon it. | `string` | n/a | yes | +| [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of auto-scaling worker nodes allowed in this partition.
For larger TPU machines, there are multiple worker nodes required per machine (1 for every 8 cores).
See https://cloud.google.com/tpu/docs/v4#large-topologies, for more information about these machine types. | `number` | `0` | no | +| [node\_count\_static](#input\_node\_count\_static) | Number of worker nodes to be statically created.
For larger TPU machines, there are multiple worker nodes required per machine (1 for every 8 cores).
See https://cloud.google.com/tpu/docs/v4#large-topologies, for more information about these machine types. | `number` | `0` | no | +| [node\_type](#input\_node\_type) | Specify a node type to base the vm configuration upon it. | `string` | `""` | no | | [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no | | [preserve\_tpu](#input\_preserve\_tpu) | Specify whether TPU-vms will get preserve on suspend, if set to true, on suspend vm is stopped, on false it gets deleted | `bool` | `false` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf index c4e7a08043..ac9b119702 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf @@ -49,4 +49,11 @@ locals { reserved = var.reserved network_storage = var.network_storage } + + node_type_core_count = var.node_type == "" ? 0 : tonumber(regex("-(.*)", var.node_type)[0]) + + accelerator_core_list = var.accelerator_config.topology == "" ? [0, 0] : regexall("\\d+", var.accelerator_config.topology) + accelerator_core_count = length(local.accelerator_core_list) > 2 ? (local.accelerator_core_list[0] * local.accelerator_core_list[1] * local.accelerator_core_list[2]) * 2 : (local.accelerator_core_list[0] * local.accelerator_core_list[1]) * 2 + + tpu_core_count = local.accelerator_core_count == 0 ? local.node_type_core_count : local.accelerator_core_count } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/outputs.tf index 280264f467..8cb7b8663e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/outputs.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/outputs.tf @@ -17,7 +17,23 @@ output "nodeset_tpu" { value = local.nodeset_tpu precondition { - condition = (var.node_type == null) != (var.accelerator_config == { topology : "", version : "" }) + condition = (var.node_type == "") != (var.accelerator_config == { topology : "", version : "" }) error_message = "Either a node_type or an accelerator_config must be provided." } + + precondition { + condition = ((local.tpu_core_count / 8) <= var.node_count_dynamic_max) || ((local.tpu_core_count / 8) <= var.node_count_static) + error_message = <<-EOD + When using TPUs there should be at least one node per every 8 cores. + Currently there are ${local.tpu_core_count} cores but only ${var.node_count_static} static nodes and ${var.node_count_dynamic_max} dynamic nodes. + EOD + } + + precondition { + condition = (var.node_count_dynamic_max % (local.tpu_core_count / 8) == 0) && (var.node_count_static % (local.tpu_core_count / 8) == 0) + error_message = <<-EOD + The number of worker nodes should be a multiple of ${local.tpu_core_count / 8}. + This is to ensure each node has a TPU machine for job scheduling. + EOD + } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf index 30e8d5c177..3302e0ea4c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf @@ -13,15 +13,23 @@ # limitations under the License. 
variable "node_count_static" { - description = "Number of nodes to be statically created." + description = <<-EOD + Number of worker nodes to be statically created. + For larger TPU machines, there are multiple worker nodes required per machine (1 for every 8 cores). + See https://cloud.google.com/tpu/docs/v4#large-topologies, for more information about these machine types. + EOD type = number default = 0 } variable "node_count_dynamic_max" { - description = "Maximum number of auto-scaling nodes allowed in this partition." + description = <<-EOD + Maximum number of auto-scaling worker nodes allowed in this partition. + For larger TPU machines, there are multiple worker nodes required per machine (1 for every 8 cores). + See https://cloud.google.com/tpu/docs/v4#large-topologies, for more information about these machine types. + EOD type = number - default = 5 + default = 0 } variable "name" { @@ -51,6 +59,7 @@ variable "disable_public_ips" { # tflint-ignore: terraform_unused_declarations variable "node_type" { description = "Specify a node type to base the vm configuration upon it." type = string + default = "" } variable "accelerator_config" { From a1ddebd42171e97099f05e45faf6bd1326f4ae5d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Oct 2024 21:29:33 +0000 Subject: [PATCH 051/102] Bump django from 4.2.15 to 4.2.16 in /community/front-end/ofe Bumps [django](https://github.com/django/django) from 4.2.15 to 4.2.16. - [Commits](https://github.com/django/django/compare/4.2.15...4.2.16) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 26756d670c..a9a4a047b4 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -19,7 +19,7 @@ dill==0.3.6 distlib==0.3.6 # django-revproxy==0.11.0 released but not yet in pypi git+https://github.com/jazzband/django-revproxy.git@d2234005135dc0771b7c4e0bb0465664ccfa5787 -Django==4.2.15 +Django==4.2.16 django-allauth==0.54.0 django-extensions==3.2.3 djangorestframework==3.15.2 From 4bb7a2cbb22abf1d3d657e2af72ae005eddd29a3 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Wed, 9 Oct 2024 01:08:28 +0000 Subject: [PATCH 052/102] Support for template files as config added to kueue installation --- modules/management/kubectl-apply/README.md | 2 +- modules/management/kubectl-apply/main.tf | 7 ++++--- modules/management/kubectl-apply/variables.tf | 9 +++++---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/modules/management/kubectl-apply/README.md b/modules/management/kubectl-apply/README.md index bd91e424dc..e2fbe50b65 100644 --- a/modules/management/kubectl-apply/README.md +++ b/modules/management/kubectl-apply/README.md @@ -119,7 +119,7 @@ limitations under the License. | [apply\_manifests](#input\_apply\_manifests) | A list of manifests to apply to GKE cluster using kubectl. For more details see [kubectl module's inputs](kubectl/README.md). |
list(object({
content = optional(string, null)
source = optional(string, null)
template_vars = optional(map(any), null)
server_side_apply = optional(bool, false)
wait_for_rollout = optional(bool, true)
}))
| `[]` | no | | [cluster\_id](#input\_cluster\_id) | An identifier for the gke cluster resource with format projects//locations//clusters/. | `string` | n/a | yes | | [jobset](#input\_jobset) | Install [Jobset](https://github.com/kubernetes-sigs/jobset) which manages a group of K8s [jobs](https://kubernetes.io/docs/concepts/workloads/controllers/job/) as a unit. |
object({
install = optional(bool, false)
version = optional(string, "v0.5.2")
})
| `{}` | no | -| [kueue](#input\_kueue) | Install and configure [Kueue](https://kueue.sigs.k8s.io/docs/overview/) workload scheduler. |
object({
install = optional(bool, false)
version = optional(string, "v0.8.1")
config_path = optional(string, null)
})
| `{}` | no | +| [kueue](#input\_kueue) | Install and configure [Kueue](https://kueue.sigs.k8s.io/docs/overview/) workload scheduler. A configuration yaml/template file can be provided with config\_path to be applied right after kueue installation. If a template file provided, its variables can be set to config\_template\_vars. |
object({
install = optional(bool, false)
version = optional(string, "v0.8.1")
config_path = optional(string, null)
config_template_vars = optional(map(any), null)
})
| `{}` | no | | [project\_id](#input\_project\_id) | The project ID that hosts the gke cluster. | `string` | n/a | yes | ## Outputs diff --git a/modules/management/kubectl-apply/main.tf b/modules/management/kubectl-apply/main.tf index dd68be57f6..5663e01580 100644 --- a/modules/management/kubectl-apply/main.tf +++ b/modules/management/kubectl-apply/main.tf @@ -77,9 +77,10 @@ module "install_jobset" { } module "configure_kueue" { - source = "./kubectl" - source_path = local.install_kueue ? try(var.kueue.config_path, "") : null - depends_on = [module.install_kueue] + source = "./kubectl" + source_path = local.install_kueue ? try(var.kueue.config_path, "") : null + template_vars = local.install_kueue ? try(var.kueue.config_template_vars, null) : null + depends_on = [module.install_kueue] server_side_apply = true wait_for_rollout = true diff --git a/modules/management/kubectl-apply/variables.tf b/modules/management/kubectl-apply/variables.tf index e0dd6430f5..2e0a36603d 100644 --- a/modules/management/kubectl-apply/variables.tf +++ b/modules/management/kubectl-apply/variables.tf @@ -38,11 +38,12 @@ variable "apply_manifests" { } variable "kueue" { - description = "Install and configure [Kueue](https://kueue.sigs.k8s.io/docs/overview/) workload scheduler." + description = "Install and configure [Kueue](https://kueue.sigs.k8s.io/docs/overview/) workload scheduler. A configuration yaml/template file can be provided with config_path to be applied right after kueue installation. If a template file provided, its variables can be set to config_template_vars." type = object({ - install = optional(bool, false) - version = optional(string, "v0.8.1") - config_path = optional(string, null) + install = optional(bool, false) + version = optional(string, "v0.8.1") + config_path = optional(string, null) + config_template_vars = optional(map(any), null) }) default = {} From 102d0c0216f7807da6dfdabfaddf098dd7cad3ce Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 9 Oct 2024 01:20:31 +0000 Subject: [PATCH 053/102] Fix static-check. 
--- pkg/config/config.go | 5 +---- pkg/config/errors.go | 1 + pkg/config/expression.go | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index ef8e8e2290..df2192291f 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -36,10 +36,7 @@ import ( ) const ( - expectedVarFormat string = "$(vars.var_name) or $(module_id.output_name)" - expectedModFormat string = "$(module_id) or $(group_id.module_id)" - unexpectedConnectionKind string = "connectionKind must be useConnection or deploymentConnection" - maxHintDist int = 3 // Maximum Levenshtein distance where we suggest a hint + maxHintDist int = 3 // Maximum Levenshtein distance where we suggest a hint ) // map[moved module path]replacing module path diff --git a/pkg/config/errors.go b/pkg/config/errors.go index d415602f08..1dd976260d 100644 --- a/pkg/config/errors.go +++ b/pkg/config/errors.go @@ -157,6 +157,7 @@ var UnknownModuleSetting = errors.New("a setting was added that is not found in var ModuleSettingWithPeriod = errors.New("a setting name contains a period, which is not supported; variable subfields cannot be set independently in a blueprint.") var ModuleSettingInvalidChar = errors.New("a setting name must begin with a non-numeric character and all characters must be either letters, numbers, dashes ('-') or underscores ('_').") var EmptyGroupName = errors.New("group name must be set for each deployment group") +var UnexpectedRefFormat = errors.New("Expected reference formats: $(vars.var_name) or $(module_id.output_name)") // Error messages const ( diff --git a/pkg/config/expression.go b/pkg/config/expression.go index 68512ec344..3cfeb096d1 100644 --- a/pkg/config/expression.go +++ b/pkg/config/expression.go @@ -69,11 +69,11 @@ func (r Reference) String() string { // and transforms it to "terraform namespace" (e.g. `var.zone` or `module.homefs.mount`). func bpTraversalToTerraform(t hcl.Traversal) (hcl.Traversal, error) { if len(t) < 2 { - return nil, fmt.Errorf(expectedVarFormat) + return nil, UnexpectedRefFormat } _, ok := t[1].(hcl.TraverseAttr) if !ok { - return nil, fmt.Errorf(expectedVarFormat) + return nil, UnexpectedRefFormat } if t.RootName() == "vars" { From fc4d0ed61f1dd992676ac1c898f827e48d2a9e17 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Wed, 9 Oct 2024 15:05:36 +0000 Subject: [PATCH 054/102] document updated --- modules/management/kubectl-apply/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/modules/management/kubectl-apply/README.md b/modules/management/kubectl-apply/README.md index e2fbe50b65..3e3bebecc0 100644 --- a/modules/management/kubectl-apply/README.md +++ b/modules/management/kubectl-apply/README.md @@ -58,6 +58,21 @@ This module simplifies the following functionality: install: true ``` +The `config_path` field in `kueue` installation accepts a template file, too. You will need to provide variables for the template using `config_template_vars` field. + +```yaml + - id: workload_component_install + source: modules/management/kubectl-apply + use: [gke_cluster] + settings: + kueue: + install: true + config_path: $(ghpc_stage("manifests/user-provided-kueue-config.yaml.tftpl")) + config_template_vars: {name: "dev-config", public: "false"} + jobset: + install: true +``` + > **_NOTE:_** > > The `project_id` and `region` settings would be inferred from the deployment variables of the same name, but they are included here for clarity. 
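For illustration, a minimal sketch of a Kueue configuration template that could be passed through `config_path` and rendered with `config_template_vars` is shown below. The file name, the `LocalQueue` object, and the `name`/`public` placeholders are assumptions chosen to line up with the hypothetical values `{name: "dev-config", public: "false"}` used in the README example above; they are not part of this change.

```yaml
# Hypothetical manifests/user-provided-kueue-config.yaml.tftpl (illustrative only).
# ${name} and ${public} are assumed to be substituted from config_template_vars
# before the rendered manifest is applied to the cluster.
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  namespace: default
  name: ${name}                  # e.g. "dev-config"
  labels:
    public-queue: "${public}"    # e.g. "false"
spec:
  clusterQueue: cluster-queue    # assumes a ClusterQueue with this name exists
```

Any valid Kubernetes manifest can be templated this way; presumably every `${...}` placeholder in the template needs a matching key in `config_template_vars` for rendering to succeed.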
From 533e0808915807c90a42fcbc0c5eec56bc99fc2e Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Tue, 8 Oct 2024 19:30:22 +0000 Subject: [PATCH 055/102] upgrading tpg from 5.x to 6.x --- pkg/config/expand.go | 4 ++-- pkg/config/expand_test.go | 4 ++-- .../igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../golden_copies/expectations/igc_pkr/zero/versions.tf | 4 ++-- .../igc_tf/.ghpc/artifacts/expanded_blueprint.yaml | 8 ++++---- .../golden_copies/expectations/igc_tf/one/versions.tf | 4 ++-- .../golden_copies/expectations/igc_tf/zero/versions.tf | 4 ++-- .../merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/merge_flatten/zero/versions.tf | 4 ++-- .../.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/versioned_blueprint/primary/versions.tf | 4 ++-- 11 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 3a8898306d..a58ce74a41 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -199,11 +199,11 @@ func getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { return map[string]TerraformProvider{ "google": { Source: "hashicorp/google", - Version: ">= 4.84.0, < 5.45.0", + Version: ">= 4.84.0, < 6.7.0", Configuration: gglConf}, "google-beta": { Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 5.45.0", + Version: ">= 4.84.0, < 6.7.0", Configuration: gglConf}} } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 40fc192175..59495832d4 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -93,10 +93,10 @@ func (s *zeroSuite) TestExpandProviders(c *C) { c.Check(g.TerraformProviders, DeepEquals, map[string]PR{ "google": TerraformProvider{ Source: "hashicorp/google", - Version: ">= 4.84.0, < 5.45.0"}, + Version: ">= 4.84.0, < 6.7.0"}, "google-beta": TerraformProvider{ Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 5.45.0"}}) + Version: ">= 4.84.0, < 6.7.0"}}) } { // no def PR, group PR diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml index 32d7d818a8..ba265ba2ee 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml @@ -38,14 +38,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf index 6630b9b8c6..3534fd124e 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } } } diff --git 
a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml index 8a160967a2..5736fbba16 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml @@ -44,14 +44,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) @@ -79,14 +79,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index 6630b9b8c6..3534fd124e 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index 6630b9b8c6..3534fd124e 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index 9c97a650eb..c21a1bb32f 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -39,14 +39,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf 
b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf index 6630b9b8c6..3534fd124e 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml index 4e74f8d305..ad79aee614 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml @@ -47,14 +47,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.45.0' + version: '>= 4.84.0, < 6.7.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf index 6630b9b8c6..3534fd124e 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.45.0" + version = ">= 4.84.0, < 6.7.0" } } } From 8a050d72e181f49ec70f6300662a01cf6401578e Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Thu, 12 Sep 2024 15:55:57 +0000 Subject: [PATCH 056/102] Initial commit for gke-a3megagpu integration test --- modules/compute/gke-node-pool/README.md | 22 ++-- .../compute/gke-node-pool/disk_definitions.tf | 4 +- .../gke-integration-test.yml | 118 ++++++++++++++++++ .../daily-tests/builds/gke-a3-megagpu.yaml | 66 ++++++++++ .../daily-tests/tests/gke-a3-megagpu.yml | 43 +++++++ 5 files changed, 240 insertions(+), 13 deletions(-) create mode 100644 tools/cloud-build/daily-tests/ansible_playbooks/gke-integration-test.yml create mode 100644 tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml create mode 100644 tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 03652cf29e..9f86002f8c 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -284,7 +284,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | @@ -295,26 +295,26 @@ limitations under the License. | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | -| [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | +| [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | +| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | -| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | -| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | +| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | +| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | | [total\_max\_nodes](#input\_total\_max\_nodes) | DEPRECATED: Use autoscaling\_total\_max\_nodes. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/disk_definitions.tf b/modules/compute/gke-node-pool/disk_definitions.tf index f7dbebea0a..b5933bf316 100644 --- a/modules/compute/gke-node-pool/disk_definitions.tf +++ b/modules/compute/gke-node-pool/disk_definitions.tf @@ -22,8 +22,8 @@ locals { local_ssd_machines = { - "a3-highgpu-8g" = { local_ssd_count_ephemeral_storage = 16, local_ssd_count_nvme_block = null }, - "a3-megagpu-8g" = { local_ssd_count_ephemeral_storage = 16, local_ssd_count_nvme_block = null }, + "a3-highgpu-8g" = { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = 16 }, + "a3-megagpu-8g" = { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = 16 }, } generated_local_ssd_config = lookup(local.local_ssd_machines, var.machine_type, { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = null }) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/gke-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/gke-integration-test.yml new file mode 100644 index 0000000000..ac3e5a3deb --- /dev/null +++ b/tools/cloud-build/daily-tests/ansible_playbooks/gke-integration-test.yml @@ -0,0 +1,118 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- + +- name: "Setup Integration tests for Cluster Toolkit" + hosts: localhost + tasks: + ## Create SSH Keys + - name: "Create .ssh folder" + ansible.builtin.file: + path: "/builder/home/.ssh" + state: directory + mode: 0700 + + - name: Create SSH Key + community.crypto.openssh_keypair: + path: "/builder/home/.ssh/id_rsa" + + ## Get builder IP address + - name: Get Builder IP + register: build_ip + changed_when: false + args: + executable: /bin/bash + ansible.builtin.shell: | + set -e -o pipefail + dig TXT +short o-o.myaddr.l.google.com @ns1.google.com | \ + awk -F'"' '{print $2}' + + ## Create cluster + - name: Create Deployment Directory + ansible.builtin.include_tasks: + file: tasks/create_deployment_directory.yml + + - name: Create Infrastructure and test + block: + - name: Create Cluster with gcluster + register: deployment + changed_when: deployment.changed + ansible.builtin.command: ./gcluster deploy {{ deployment_name }} --auto-approve + args: + chdir: "{{ workspace }}" + environment: + TF_IN_AUTOMATION: "TRUE" + + ## Cleanup and fail gracefully + rescue: + - name: Capture gcluster stderr + failed_when: false + ansible.builtin.set_fact: + gcluster_stderr: "{{ deployment.stderr | replace('\n',' ') }}" + + - name: Gather logs + ansible.builtin.include_tasks: + file: tasks/gather_startup_script_logs.yml + apply: + delegate_to: localhost + + - name: Include rescue from gcluster failure + ansible.builtin.include_tasks: + file: tasks/rescue_gcluster_failure.yml + apply: + delegate_to: localhost + vars: + deployment_name: "{{ deployment_name }}" + workspace: "{{ workspace }}" + + - name: Trigger failure (rescue blocks otherwise revert failures) + ansible.builtin.fail: + msg: "Failed while setting up test infrastructure" + +- name: Run Integration Tests + hosts: remote_host + vars: + startup_timeout_seconds: 600 # 10 minutes + gather_facts: false + tasks: + - name: Remote Test Block + vars: + ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" + + block: + - name: Include wait for startup script + ansible.builtin.include_tasks: "tasks/wait-for-startup-script.yml" + vars: + timeout_seconds: "{{ startup_timeout_seconds }}" + + - name: Run Integration tests for Cluster Toolkit + ansible.builtin.include_tasks: "{{ test }}" + vars: + remote_node: "{{ remote_node }}" + deployment_name: "{{ deployment_name }}" + custom_vars: "{{ custom_vars }}" + loop: "{{ post_deploy_tests }}" + loop_control: + loop_var: test + + always: + - name: Cleanup firewall and infrastructure + ansible.builtin.include_tasks: + file: tasks/rescue_gcluster_failure.yml + apply: + delegate_to: localhost + vars: + deployment_name: "{{ deployment_name }}" + workspace: "{{ workspace }}" diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml new file mode 100644 index 0000000000..fc16863abd --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml @@ -0,0 +1,66 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +--- +tags: +- m.gke-cluster +- m.gke-node-pool +- m.vpc +- m.multivpc +- m.kubectl-apply +- gke + +timeout: 14400s # 4hr +steps: +- id: gke-a3-megagpu + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + EXAMPLE_BP=examples/gke-a3-megagpu.yaml + + # Replacing the static subnet name to prevent collisions + sed -i "s/gke-subnet-a3-mega/gke-subnet-a3-mega-$${BUILD_ID_SHORT}/" $${EXAMPLE_BP} + + # adding vm to act as remote node + echo ' - id: remote-node' >> $${EXAMPLE_BP} + echo ' source: modules/compute/vm-instance' >> $${EXAMPLE_BP} + echo ' use: [network1]' >> $${EXAMPLE_BP} + echo ' settings:' >> $${EXAMPLE_BP} + echo ' machine_type: e2-standard-2' >> $${EXAMPLE_BP} + echo ' name_prefix: remote-node' >> $${EXAMPLE_BP} + echo ' add_deployment_name_before_prefix: true' >> $${EXAMPLE_BP} + echo '' + echo ' - id: job_template_hostname' >> $${EXAMPLE_BP} + echo ' source: modules/compute/gke-job-template' >> $${EXAMPLE_BP} + echo ' use: [a3_megagpu_pool]' >> $${EXAMPLE_BP} + echo ' settings:' >> $${EXAMPLE_BP} + echo ' name: job-a3mega-test' >> $${EXAMPLE_BP} + echo ' image: nvidia/cuda:11.0.3-runtime-ubuntu20.04' >> $${EXAMPLE_BP} + echo ' command:' >> $${EXAMPLE_BP} + echo ' - nvidia-smi' >> $${EXAMPLE_BP} + echo ' node_count: 1' >> $${EXAMPLE_BP} + echo ' outputs: [instructions]' >> $${EXAMPLE_BP} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml" diff --git a/tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml b/tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml new file mode 100644 index 0000000000..6b305c3410 --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml @@ -0,0 +1,43 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +# region, zone must be defined +# in build file with --extra-vars flag! 
+test_name: gke-a3mega +deployment_name: gke-a3mega-{{ build }} +workspace: /workspace +blueprint_yaml: "{{ workspace }}/examples/gke-a3-megagpu.yaml" +network: "gke-a3mega-net-{{ build }}" +region: us-west4 +zone: us-west4-a +remote_node: "{{ deployment_name }}-remote-node-0" +reservation_affinity: + consume_reservation_type: SPECIFIC_RESERVATION + specific_reservations: + - name: a3mega-reservation-0 + project: "{{ project }}" +cli_deployment_vars: + region: "{{ region }}" + zone: "{{ zone }}" + reservation_affinity: "{{ reservation_affinity }}" + autoscaling_total_max_nodes: 2 + authorized_cidr: "{{ build_ip.stdout }}/32" + network_name: "{{ network }}" + local_ssd_count_nvme_block: 16 +custom_vars: + project: "{{ project }}" +post_deploy_tests: +- test-validation/test-gke-job.yml From 93d7af0cddfc86ebe1c0b887f73b64e364259377 Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Wed, 9 Oct 2024 20:05:51 +0000 Subject: [PATCH 057/102] Integration test now working --- modules/compute/gke-node-pool/README.md | 22 ++-- .../compute/gke-node-pool/disk_definitions.tf | 4 +- .../gke-integration-test.yml | 118 ------------------ .../test-validation/test-gke-job.yml | 4 +- .../daily-tests/builds/gke-a3-megagpu.yaml | 1 - 5 files changed, 15 insertions(+), 134 deletions(-) delete mode 100644 tools/cloud-build/daily-tests/ansible_playbooks/gke-integration-test.yml diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 9f86002f8c..03652cf29e 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -284,7 +284,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | @@ -295,26 +295,26 @@ limitations under the License. | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | -| [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | +| [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | +| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | -| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | -| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | +| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | +| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | | [total\_max\_nodes](#input\_total\_max\_nodes) | DEPRECATED: Use autoscaling\_total\_max\_nodes. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/disk_definitions.tf b/modules/compute/gke-node-pool/disk_definitions.tf index b5933bf316..f7dbebea0a 100644 --- a/modules/compute/gke-node-pool/disk_definitions.tf +++ b/modules/compute/gke-node-pool/disk_definitions.tf @@ -22,8 +22,8 @@ locals { local_ssd_machines = { - "a3-highgpu-8g" = { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = 16 }, - "a3-megagpu-8g" = { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = 16 }, + "a3-highgpu-8g" = { local_ssd_count_ephemeral_storage = 16, local_ssd_count_nvme_block = null }, + "a3-megagpu-8g" = { local_ssd_count_ephemeral_storage = 16, local_ssd_count_nvme_block = null }, } generated_local_ssd_config = lookup(local.local_ssd_machines, var.machine_type, { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = null }) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/gke-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/gke-integration-test.yml deleted file mode 100644 index ac3e5a3deb..0000000000 --- a/tools/cloud-build/daily-tests/ansible_playbooks/gke-integration-test.yml +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- - -- name: "Setup Integration tests for Cluster Toolkit" - hosts: localhost - tasks: - ## Create SSH Keys - - name: "Create .ssh folder" - ansible.builtin.file: - path: "/builder/home/.ssh" - state: directory - mode: 0700 - - - name: Create SSH Key - community.crypto.openssh_keypair: - path: "/builder/home/.ssh/id_rsa" - - ## Get builder IP address - - name: Get Builder IP - register: build_ip - changed_when: false - args: - executable: /bin/bash - ansible.builtin.shell: | - set -e -o pipefail - dig TXT +short o-o.myaddr.l.google.com @ns1.google.com | \ - awk -F'"' '{print $2}' - - ## Create cluster - - name: Create Deployment Directory - ansible.builtin.include_tasks: - file: tasks/create_deployment_directory.yml - - - name: Create Infrastructure and test - block: - - name: Create Cluster with gcluster - register: deployment - changed_when: deployment.changed - ansible.builtin.command: ./gcluster deploy {{ deployment_name }} --auto-approve - args: - chdir: "{{ workspace }}" - environment: - TF_IN_AUTOMATION: "TRUE" - - ## Cleanup and fail gracefully - rescue: - - name: Capture gcluster stderr - failed_when: false - ansible.builtin.set_fact: - gcluster_stderr: "{{ deployment.stderr | replace('\n',' ') }}" - - - name: Gather logs - ansible.builtin.include_tasks: - file: tasks/gather_startup_script_logs.yml - apply: - delegate_to: localhost - - - name: Include rescue from gcluster failure - ansible.builtin.include_tasks: - file: tasks/rescue_gcluster_failure.yml - apply: - delegate_to: localhost - vars: - deployment_name: "{{ deployment_name }}" - workspace: "{{ workspace }}" - - - name: Trigger failure (rescue blocks otherwise revert failures) - ansible.builtin.fail: - msg: "Failed while setting up test infrastructure" - -- name: Run Integration Tests - hosts: remote_host - vars: - startup_timeout_seconds: 600 # 10 minutes - gather_facts: false - tasks: - - name: Remote Test Block - vars: - ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" - - block: - - name: Include wait for startup script - ansible.builtin.include_tasks: "tasks/wait-for-startup-script.yml" - vars: - timeout_seconds: "{{ startup_timeout_seconds }}" - - - name: Run Integration tests for Cluster Toolkit - ansible.builtin.include_tasks: "{{ test }}" - vars: - remote_node: "{{ remote_node }}" - deployment_name: "{{ deployment_name }}" - custom_vars: "{{ custom_vars }}" - loop: "{{ post_deploy_tests }}" - loop_control: - loop_var: test - - always: - - name: Cleanup firewall and infrastructure - ansible.builtin.include_tasks: - file: tasks/rescue_gcluster_failure.yml - apply: - delegate_to: localhost - vars: - deployment_name: "{{ deployment_name }}" - workspace: "{{ workspace }}" diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-job.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-job.yml index f1be62e220..44be3ac853 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-job.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-job.yml @@ -15,12 +15,12 @@ - name: Assert variables are defined ansible.builtin.assert: that: - - cli_deployment_vars.region is defined + - region is defined - custom_vars.project is defined - name: Get cluster credentials for kubectl delegate_to: localhost - ansible.builtin.command: gcloud container clusters get-credentials {{ deployment_name }} --region {{ cli_deployment_vars.region }} --project {{ custom_vars.project }} + ansible.builtin.command: gcloud 
container clusters get-credentials {{ deployment_name }} --region {{ region }} --project {{ custom_vars.project }} - name: Execute the job delegate_to: localhost diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml index fc16863abd..118704e7ea 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml @@ -54,7 +54,6 @@ steps: echo ' source: modules/compute/gke-job-template' >> $${EXAMPLE_BP} echo ' use: [a3_megagpu_pool]' >> $${EXAMPLE_BP} echo ' settings:' >> $${EXAMPLE_BP} - echo ' name: job-a3mega-test' >> $${EXAMPLE_BP} echo ' image: nvidia/cuda:11.0.3-runtime-ubuntu20.04' >> $${EXAMPLE_BP} echo ' command:' >> $${EXAMPLE_BP} echo ' - nvidia-smi' >> $${EXAMPLE_BP} From 5cb64acebcfb136ddbeba2b6919e2677f1aab806 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 9 Oct 2024 20:47:42 +0000 Subject: [PATCH 058/102] Free slurm-gcp v5 hybrid blueprints with the latest cluster toolkit version support --- community/examples/tutorial-starccm-slurm.yaml | 2 ++ docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/community/examples/tutorial-starccm-slurm.yaml b/community/examples/tutorial-starccm-slurm.yaml index d9ad22d1a7..ce8dd0817f 100644 --- a/community/examples/tutorial-starccm-slurm.yaml +++ b/community/examples/tutorial-starccm-slurm.yaml @@ -15,6 +15,8 @@ --- blueprint_name: starccm-on-slurm +toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit +toolkit_modules_version: v1.40.0 vars: project_id: ## Set GCP Project ID Here ## diff --git a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml index 45312348ed..0220352d35 100644 --- a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml +++ b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml @@ -15,6 +15,8 @@ --- blueprint_name: hpc-cluster-hybrid-v5 +toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit +toolkit_modules_version: v1.40.0 vars: project_id: ## <> From 6e684c1eb9377a6b1d21bd01d8eaae9df1530dd5 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Wed, 9 Oct 2024 21:28:37 +0000 Subject: [PATCH 059/102] updating docs for v_blueprint feature --- examples/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/README.md b/examples/README.md index ec386515e6..0275fd930c 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1772,7 +1772,7 @@ scratch. --- blueprint_name: # boilerplate-blueprint toolkit_modules_url: # github.com/GoogleCloudPlatform/cluster-toolkit -toolkit_modules_version: # v1.15.0 +toolkit_modules_version: # v1.38.0 vars: project_id: # my-project-id @@ -1796,7 +1796,7 @@ deployment_groups: 63 characters long, and can only contain lowercase letters, numeric characters, underscores and dashes. -* **toolkit_modules_url** and **toolkit_modules_version** (optional): The blueprint schema provides the optional fields `toolkit_modules_url` and `toolkit_modules_version` to version a blueprint. When these fields are provided, any module in the blueprint with a reference to an embedded module in its source field will be updated to reference the specified GitHub source and toolkit version in the expanded blueprint. 
`toolkit_modules_url` specifies the base URL of the GitHub repository containing the modules and `toolkit_modules_version` specifies the version of the modules to use. `toolkit_modules_url` and `toolkit_modules_version` should be provided together when in use. +* **toolkit_modules_url** and **toolkit_modules_version** (optional): The blueprint schema provides the optional fields `toolkit_modules_url` and `toolkit_modules_version` to version a blueprint. When these fields are provided, any module in the blueprint with a reference to an embedded module in its source field will be updated to reference the specified GitHub source and toolkit version in the deployment folder. `toolkit_modules_url` specifies the base URL of the GitHub repository containing the modules and `toolkit_modules_version` specifies the version of the modules to use. `toolkit_modules_url` and `toolkit_modules_version` should be provided together when in use. ### Deployment Variables From 0fed0411ee26426d977c9ef569c08fc70b48b5c1 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Thu, 10 Oct 2024 00:13:26 +0000 Subject: [PATCH 060/102] update version info --- modules/file-system/gke-persistent-volume/variables.tf | 2 +- modules/file-system/gke-storage/README.md | 2 +- modules/file-system/gke-storage/variables.tf | 2 +- modules/file-system/gke-storage/versions.tf | 2 +- modules/scheduler/gke-cluster/variables.tf | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/file-system/gke-persistent-volume/variables.tf b/modules/file-system/gke-persistent-volume/variables.tf index 80e21d0b8f..a72fa3857f 100644 --- a/modules/file-system/gke-persistent-volume/variables.tf +++ b/modules/file-system/gke-persistent-volume/variables.tf @@ -57,6 +57,6 @@ variable "capacity_gb" { } variable "labels" { - description = "GCE resource labels to be applied to resources. Key-value pairs. " + description = "GCE resource labels to be applied to resources. Key-value pairs." type = map(string) } diff --git a/modules/file-system/gke-storage/README.md b/modules/file-system/gke-storage/README.md index 1a63731e4c..c578a4a0d8 100644 --- a/modules/file-system/gke-storage/README.md +++ b/modules/file-system/gke-storage/README.md @@ -112,7 +112,7 @@ No resources. | [cluster\_id](#input\_cluster\_id) | An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}` | `string` | n/a | yes | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [mount\_options](#input\_mount\_options) | Controls the mountOptions for dynamically provisioned PersistentVolumes of this storage class. | `string` | `null` | no | -| [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection .
If using new VPC, please use community/modules/network/private-service-access to create private-service-access and
If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). | `string` | `null` | no | +| [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection.
If using new VPC, please use community/modules/network/private-service-access to create private-service-access and
If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). | `string` | `null` | no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [pv\_mount\_path](#input\_pv\_mount\_path) | Path within the container at which the volume should be mounted. Must not contain ':'. | `string` | `"/data"` | no | | [pvc\_count](#input\_pvc\_count) | How many PersistentVolumeClaims that will be created | `number` | `1` | no | diff --git a/modules/file-system/gke-storage/variables.tf b/modules/file-system/gke-storage/variables.tf index 3fd672699f..97ff1af21b 100644 --- a/modules/file-system/gke-storage/variables.tf +++ b/modules/file-system/gke-storage/variables.tf @@ -125,7 +125,7 @@ variable "capacity_gb" { variable "private_vpc_connection_peering" { description = <<-EOT - The name of the VPC Network peering connection . + The name of the VPC Network peering connection. If using new VPC, please use community/modules/network/private-service-access to create private-service-access and If using existing VPC with private-service-access enabled, set this manually follow [user guide](https://cloud.google.com/parallelstore/docs/vpc). EOT diff --git a/modules/file-system/gke-storage/versions.tf b/modules/file-system/gke-storage/versions.tf index 0a1082c515..78d62b235d 100644 --- a/modules/file-system/gke-storage/versions.tf +++ b/modules/file-system/gke-storage/versions.tf @@ -16,6 +16,6 @@ terraform { required_version = ">= 1.0" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.39.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.40.0" } } diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index 4c2e049d46..a291d58a1a 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -128,7 +128,7 @@ variable "enable_persistent_disk_csi" { } variable "enable_parallelstore_csi" { - description = "The status of the Google Compute Engine Parallelstore Container Storage Interface (CSI) driver addon, which allows the usage of a parallelstore as volumes. " + description = "The status of the Google Compute Engine Parallelstore Container Storage Interface (CSI) driver addon, which allows the usage of a parallelstore as volumes." 
type = bool default = false } From 8e8656f64bb5f5cabb2fc5fbbb310178aa998475 Mon Sep 17 00:00:00 2001 From: ChengcongDu Date: Thu, 10 Oct 2024 21:57:23 +0000 Subject: [PATCH 061/102] upgrade local tf-doc version and redo doc gen --- modules/compute/gke-node-pool/outputs.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/outputs.tf b/modules/compute/gke-node-pool/outputs.tf index 7bcd0c6361..77088a7641 100644 --- a/modules/compute/gke-node-pool/outputs.tf +++ b/modules/compute/gke-node-pool/outputs.tf @@ -84,7 +84,7 @@ locals { NCCL test can be initiated from any one of the sample job Pods and coordinate with the peer Pods: export POD_NAME=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}' | head -n 1) export PEER_POD_IPS=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.status.podIP}}{{" "}}{{end}}') - kubectl exec --stdin --tty --container=nccl-test $POD_NAME -- /scripts/allgather.sh $PEER_POD_IPS + kubectl exec --stdin --tty --container=nccl-test $POD_NAME -- /scripts/allgather.sh $PEER_POD_IPS If you would like to enable GPUDirect for your own workload, please follow the below steps: export WORKLOAD_PATH=<> From 756908116ce8536bc5e6547b1dff95ac2f044ddb Mon Sep 17 00:00:00 2001 From: ChengcongDu Date: Thu, 10 Oct 2024 21:58:04 +0000 Subject: [PATCH 062/102] upgrade local tf-doc version and redo doc gen --- modules/compute/gke-node-pool/outputs.tf | 2 +- modules/compute/gke-node-pool/variables.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/compute/gke-node-pool/outputs.tf b/modules/compute/gke-node-pool/outputs.tf index 77088a7641..7bcd0c6361 100644 --- a/modules/compute/gke-node-pool/outputs.tf +++ b/modules/compute/gke-node-pool/outputs.tf @@ -84,7 +84,7 @@ locals { NCCL test can be initiated from any one of the sample job Pods and coordinate with the peer Pods: export POD_NAME=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}' | head -n 1) export PEER_POD_IPS=$(kubectl get pods -l job-name=my-sample-job -o go-template='{{range .items}}{{.status.podIP}}{{" "}}{{end}}') - kubectl exec --stdin --tty --container=nccl-test $POD_NAME -- /scripts/allgather.sh $PEER_POD_IPS + kubectl exec --stdin --tty --container=nccl-test $POD_NAME -- /scripts/allgather.sh $PEER_POD_IPS If you would like to enable GPUDirect for your own workload, please follow the below steps: export WORKLOAD_PATH=<> diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index ef1277744f..825c1c72d0 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -15,7 +15,7 @@ */ variable "project_id" { - description = "The project ID to host the cluster in." + description = "The project ID to host the cluster in. 
" type = string } From 088a9a749508ea7d06c08b4c68fb6b32c444d8ac Mon Sep 17 00:00:00 2001 From: ChengcongDu Date: Thu, 10 Oct 2024 22:43:39 +0000 Subject: [PATCH 063/102] upgrade local tf-doc version and redo doc gen --- modules/compute/gke-node-pool/README.md | 22 +++++++++++----------- modules/compute/gke-node-pool/variables.tf | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 78c4dd1dd7..fcf7414af6 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -284,7 +284,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | @@ -294,26 +294,26 @@ limitations under the License. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | -| [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | +| [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | -| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | +| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | -| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | -| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | +| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | +| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | | [total\_max\_nodes](#input\_total\_max\_nodes) | DEPRECATED: Use autoscaling\_total\_max\_nodes. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 825c1c72d0..ef1277744f 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -15,7 +15,7 @@ */ variable "project_id" { - description = "The project ID to host the cluster in. " + description = "The project ID to host the cluster in." type = string } From c09fe01595314cec275e8beec8bcfc11a542dcfc Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 8 Oct 2024 19:32:59 +0000 Subject: [PATCH 064/102] Implement xpk-gke-a3-megagpu blueprint --- .../config-map.yaml.tftpl | 6 + .../kueue-credentials.yaml.tftpl | 73 +++++++++++ community/examples/xpk-gke-a3-megagpu.yaml | 118 ++++++++++++++++++ .../manifests/schedule-daemon.yaml | 4 + .../{resource-policy => }/README.md | 0 .../{resource-policy => }/main.tf | 0 .../{resource-policy => }/metadata.yaml | 0 .../{resource-policy => }/outputs.tf | 0 .../{resource-policy => }/variables.tf | 0 .../{resource-policy => }/versions.tf | 0 10 files changed, 201 insertions(+) create mode 100644 community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl create mode 100644 community/examples/xpk-gke-a3-megagpu-files/kueue-credentials.yaml.tftpl create mode 100644 community/examples/xpk-gke-a3-megagpu.yaml rename modules/compute/resource-policy/{resource-policy => }/README.md (100%) rename modules/compute/resource-policy/{resource-policy => }/main.tf (100%) rename modules/compute/resource-policy/{resource-policy => }/metadata.yaml (100%) rename modules/compute/resource-policy/{resource-policy => }/outputs.tf (100%) rename modules/compute/resource-policy/{resource-policy => }/variables.tf (100%) rename modules/compute/resource-policy/{resource-policy => }/versions.tf (100%) diff --git a/community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl b/community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl new file mode 100644 index 0000000000..100058b7be --- /dev/null +++ b/community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl @@ -0,0 +1,6 @@ +kind: ConfigMap +apiVersion: v1 +metadata: + name: ${name} +data: + h100-mega-80gb-8: ${num_nodes} diff --git a/community/examples/xpk-gke-a3-megagpu-files/kueue-credentials.yaml.tftpl b/community/examples/xpk-gke-a3-megagpu-files/kueue-credentials.yaml.tftpl new file mode 100644 index 0000000000..326cea0e54 --- /dev/null +++ b/community/examples/xpk-gke-a3-megagpu-files/kueue-credentials.yaml.tftpl @@ -0,0 +1,73 @@ +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ResourceFlavor +metadata: + name: 1xh100-mega-80gb-8 +spec: + nodeLabels: + cloud.google.com/gke-accelerator: nvidia-h100-mega-80gb +--- + +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ClusterQueue +metadata: + name: cluster-queue +spec: + preemption: + reclaimWithinCohort: Never # Don't preempt other queues in the cohort. + withinClusterQueue: LowerPriority + namespaceSelector: {} # match all. 
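  # Editorial note (sketch, not part of the original template): ${num_chips} below is a
  # template variable rendered by the kubectl-apply module; the xpk blueprint supplies its
  # value (16 in this patch, later raised to 32 in this series to match 4 a3-megagpu-8g
  # nodes with 8 GPUs each).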
+ resourceGroups: + - coveredResources: ["nvidia.com/gpu"] + flavors: + - name: 1xh100-mega-80gb-8 + resources: + - name: "nvidia.com/gpu" + nominalQuota: ${num_chips} +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + namespace: default + name: multislice-queue +spec: + clusterQueue: cluster-queue +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: very-low +value: 100 +globalDefault: false +description: "Very Low" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: low +value: 250 +globalDefault: false +description: "Low" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: medium +value: 500 +globalDefault: false +description: "Medium" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: high +value: 750 +globalDefault: false +description: "High" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: very-high +value: 1000 +globalDefault: false +description: "Very High" diff --git a/community/examples/xpk-gke-a3-megagpu.yaml b/community/examples/xpk-gke-a3-megagpu.yaml new file mode 100644 index 0000000000..999a52ea7b --- /dev/null +++ b/community/examples/xpk-gke-a3-megagpu.yaml @@ -0,0 +1,118 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: xpk-gke-a3-megagpu + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: xpk-gke-a3-megagpu + region: us-central1 + zone: us-central1-c + + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. + authorized_cidr: /32 + +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/vpc + settings: + subnetwork_name: xpk-gke-a3-megagpu-subnet + secondary_ranges: + xpk-gke-a3-megagpu-subnet: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: gpunets + source: modules/network/multivpc + settings: + network_name_prefix: $(vars.deployment_name)-gpunet + global_ip_address_range: 192.169.0.0/16 + network_count: 4 + subnetwork_cidr_suffix: 24 + + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network1, gpunets] + settings: + enable_private_endpoint: false # Allows for access from authorized public IPs + master_authorized_networks: + - cidr_block: $(vars.authorized_cidr) # Allows your machine run kubectl command. It's required for the multi-network setup. + display_name: "kubectl-access-network" + #min_master_version: "1.29.6-gke.1326000" # (TODO: Ask MaxText the reason to set this) Couldn't find this version in the valid master versions in australia-southeast1-c. Can be left unset to be set by GKE to the version of the most recent official release. 
+ system_node_pool_machine_type: "e2-standard-32" + outputs: [instructions] + + - id: group_placement_0 + source: modules/compute/resource-policy + settings: + name: xpk-gke-a3-megagpu-gp-np-0 + group_placement_max_distance: 2 + + - id: group_placement_1 + source: modules/compute/resource-policy + settings: + name: xpk-gke-a3-megagpu-gp-np-0 + group_placement_max_distance: 2 + + - id: a3_megagpu_pool_0 + source: modules/compute/gke-node-pool + use: [gke_cluster, gpunets, group_placement_0] + settings: + machine_type: a3-megagpu-8g + autoscaling_total_min_nodes: 1 + initial_node_count: 1 + zones: [$(vars.zone)] + host_maintenance_interval: PERIODIC + outputs: [instructions] + + - id: a3_megagpu_pool_1 + source: modules/compute/gke-node-pool + use: [gke_cluster, gpunets, group_placement_1] + settings: + machine_type: a3-megagpu-8g + autoscaling_total_min_nodes: 1 + initial_node_count: 1 + zones: [$(vars.zone)] + host_maintenance_interval: PERIODIC + outputs: [instructions] + + - id: workload_manager_install + source: modules/management/kubectl-apply + use: [gke_cluster] + settings: + kueue: + install: true + jobset: + install: true + + - id: topology_aware_scheduler_install + source: community/modules/compute/gke-topology-scheduler + use: [gke_cluster] + + - id: workload_manager_config + source: modules/management/kubectl-apply + use: [gke_cluster] + settings: + apply_manifests: + - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/config-map.yaml.tftpl + template_vars: {name: "xpk-gke-a3-megagpu-configmap", num_nodes: "2"} + - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/kueue-credentials.yaml.tftpl + template_vars: {num_chips: "16"} diff --git a/community/modules/compute/gke-topology-scheduler/manifests/schedule-daemon.yaml b/community/modules/compute/gke-topology-scheduler/manifests/schedule-daemon.yaml index b412f936e9..9c9a4ab929 100644 --- a/community/modules/compute/gke-topology-scheduler/manifests/schedule-daemon.yaml +++ b/community/modules/compute/gke-topology-scheduler/manifests/schedule-daemon.yaml @@ -33,6 +33,10 @@ spec: - key: "node-role.kubernetes.io/control-plane" operator: "Exists" effect: "NoSchedule" + - key: components.gke.io/gke-managed-components + value: "true" + operator: Equal + effect: NoSchedule containers: - name: topology-scheduler-container image: python:3.9 diff --git a/modules/compute/resource-policy/resource-policy/README.md b/modules/compute/resource-policy/README.md similarity index 100% rename from modules/compute/resource-policy/resource-policy/README.md rename to modules/compute/resource-policy/README.md diff --git a/modules/compute/resource-policy/resource-policy/main.tf b/modules/compute/resource-policy/main.tf similarity index 100% rename from modules/compute/resource-policy/resource-policy/main.tf rename to modules/compute/resource-policy/main.tf diff --git a/modules/compute/resource-policy/resource-policy/metadata.yaml b/modules/compute/resource-policy/metadata.yaml similarity index 100% rename from modules/compute/resource-policy/resource-policy/metadata.yaml rename to modules/compute/resource-policy/metadata.yaml diff --git a/modules/compute/resource-policy/resource-policy/outputs.tf b/modules/compute/resource-policy/outputs.tf similarity index 100% rename from modules/compute/resource-policy/resource-policy/outputs.tf rename to modules/compute/resource-policy/outputs.tf diff --git a/modules/compute/resource-policy/resource-policy/variables.tf b/modules/compute/resource-policy/variables.tf similarity index 100% rename from 
modules/compute/resource-policy/resource-policy/variables.tf rename to modules/compute/resource-policy/variables.tf diff --git a/modules/compute/resource-policy/resource-policy/versions.tf b/modules/compute/resource-policy/versions.tf similarity index 100% rename from modules/compute/resource-policy/resource-policy/versions.tf rename to modules/compute/resource-policy/versions.tf From 1911eb464d9d3b765070c7097c8e1f8471c7a5f9 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Wed, 9 Oct 2024 00:45:17 +0000 Subject: [PATCH 065/102] kueue config file renamed --- ...ntials.yaml.tftpl => kueue-xpk-configuration.yaml.tftpl} | 0 community/examples/xpk-gke-a3-megagpu.yaml | 2 +- modules/compute/gke-node-pool/outputs.tf | 6 +++--- 3 files changed, 4 insertions(+), 4 deletions(-) rename community/examples/xpk-gke-a3-megagpu-files/{kueue-credentials.yaml.tftpl => kueue-xpk-configuration.yaml.tftpl} (100%) diff --git a/community/examples/xpk-gke-a3-megagpu-files/kueue-credentials.yaml.tftpl b/community/examples/xpk-gke-a3-megagpu-files/kueue-xpk-configuration.yaml.tftpl similarity index 100% rename from community/examples/xpk-gke-a3-megagpu-files/kueue-credentials.yaml.tftpl rename to community/examples/xpk-gke-a3-megagpu-files/kueue-xpk-configuration.yaml.tftpl diff --git a/community/examples/xpk-gke-a3-megagpu.yaml b/community/examples/xpk-gke-a3-megagpu.yaml index 999a52ea7b..a5075d8339 100644 --- a/community/examples/xpk-gke-a3-megagpu.yaml +++ b/community/examples/xpk-gke-a3-megagpu.yaml @@ -114,5 +114,5 @@ deployment_groups: apply_manifests: - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/config-map.yaml.tftpl template_vars: {name: "xpk-gke-a3-megagpu-configmap", num_nodes: "2"} - - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/kueue-credentials.yaml.tftpl + - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/kueue-xpk-configuration.yaml.tftpl template_vars: {num_chips: "16"} diff --git a/modules/compute/gke-node-pool/outputs.tf b/modules/compute/gke-node-pool/outputs.tf index 8be6a2772a..75b63572b1 100644 --- a/modules/compute/gke-node-pool/outputs.tf +++ b/modules/compute/gke-node-pool/outputs.tf @@ -73,9 +73,9 @@ locals { } gpu_direct_instruction = <<-EOT Since you are using ${var.machine_type} machine type that has GPUDirect support, your nodepool had been configured with the required plugins. - To fully utilize GPUDirect you will need to add the some components into your workload manifest. Details below: + To fully utilize GPUDirect you will need to add some components into your workload manifest. 
Details below: - A sample GKE job that had GPUDirect enabled and NCCL test included has been generated locally at: + A sample GKE job that has GPUDirect enabled and NCCL test included has been generated locally at: ${abspath(local.gpu_direct_setting.updated_workload_path)} You can use the following commands to submit the sample job: @@ -85,7 +85,7 @@ locals { export WORKLOAD_PATH=<> python3 ${abspath("${path.module}/gpu-direct-workload/scripts/${lookup(local.script_path, var.machine_type, "")}")} --file $WORKLOAD_PATH --rxdm ${local.gpu_direct_setting.rxdm_version} **WARNING** - The "--rxdm" version is tide to the nccl-tcpx/o-installer that had been deployed to your cluster, changing it to other value might have impact on performance + The "--rxdm" version is tied to the nccl-tcpx/o-installer that had been deployed to your cluster, changing it to other value might have impact on performance **WARNING** Or you can also follow our GPUDirect user guide to update your workload From d9e131e6ab26e41146f6879ef9105da840a4c1fe Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Thu, 10 Oct 2024 12:01:24 +0000 Subject: [PATCH 066/102] kueue config set using a template file --- community/examples/xpk-gke-a3-megagpu.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/community/examples/xpk-gke-a3-megagpu.yaml b/community/examples/xpk-gke-a3-megagpu.yaml index a5075d8339..35ac0df71c 100644 --- a/community/examples/xpk-gke-a3-megagpu.yaml +++ b/community/examples/xpk-gke-a3-megagpu.yaml @@ -63,13 +63,13 @@ deployment_groups: - id: group_placement_0 source: modules/compute/resource-policy settings: - name: xpk-gke-a3-megagpu-gp-np-0 + name: $(vars.deployment_name)-gp-np-0 group_placement_max_distance: 2 - id: group_placement_1 source: modules/compute/resource-policy settings: - name: xpk-gke-a3-megagpu-gp-np-0 + name: $(vars.deployment_name)-gp-np-0 group_placement_max_distance: 2 - id: a3_megagpu_pool_0 @@ -94,12 +94,14 @@ deployment_groups: host_maintenance_interval: PERIODIC outputs: [instructions] - - id: workload_manager_install + - id: workload_component_install source: modules/management/kubectl-apply use: [gke_cluster] settings: kueue: install: true + config_path: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/kueue-xpk-configuration.yaml.tftpl + config_template_vars: {num_chips: "16"} jobset: install: true @@ -107,12 +109,10 @@ deployment_groups: source: community/modules/compute/gke-topology-scheduler use: [gke_cluster] - - id: workload_manager_config + - id: workload_configmap source: modules/management/kubectl-apply use: [gke_cluster] settings: apply_manifests: - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/config-map.yaml.tftpl template_vars: {name: "xpk-gke-a3-megagpu-configmap", num_nodes: "2"} - - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/kueue-xpk-configuration.yaml.tftpl - template_vars: {num_chips: "16"} From 870058336eba1096583b87051b4c41b8256e80d4 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 11 Oct 2024 15:49:15 +0000 Subject: [PATCH 067/102] configmap variable fixed --- .../config-map.yaml.tftpl | 2 +- community/examples/xpk-gke-a3-megagpu.yaml | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl b/community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl index 100058b7be..900d30729c 100644 --- a/community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl +++ 
b/community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl @@ -3,4 +3,4 @@ apiVersion: v1 metadata: name: ${name} data: - h100-mega-80gb-8: ${num_nodes} + h100-mega-80gb-8: "${num_nodes}" diff --git a/community/examples/xpk-gke-a3-megagpu.yaml b/community/examples/xpk-gke-a3-megagpu.yaml index 35ac0df71c..377bf63b83 100644 --- a/community/examples/xpk-gke-a3-megagpu.yaml +++ b/community/examples/xpk-gke-a3-megagpu.yaml @@ -45,7 +45,7 @@ deployment_groups: settings: network_name_prefix: $(vars.deployment_name)-gpunet global_ip_address_range: 192.169.0.0/16 - network_count: 4 + network_count: 8 subnetwork_cidr_suffix: 24 - id: gke_cluster @@ -76,9 +76,10 @@ deployment_groups: source: modules/compute/gke-node-pool use: [gke_cluster, gpunets, group_placement_0] settings: + name: a3-megagpu-pool-0 machine_type: a3-megagpu-8g - autoscaling_total_min_nodes: 1 - initial_node_count: 1 + autoscaling_total_min_nodes: 2 + initial_node_count: 2 zones: [$(vars.zone)] host_maintenance_interval: PERIODIC outputs: [instructions] @@ -87,9 +88,10 @@ deployment_groups: source: modules/compute/gke-node-pool use: [gke_cluster, gpunets, group_placement_1] settings: + name: a3-megagpu-pool-1 machine_type: a3-megagpu-8g - autoscaling_total_min_nodes: 1 - initial_node_count: 1 + autoscaling_total_min_nodes: 2 + initial_node_count: 2 zones: [$(vars.zone)] host_maintenance_interval: PERIODIC outputs: [instructions] @@ -101,7 +103,7 @@ deployment_groups: kueue: install: true config_path: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/kueue-xpk-configuration.yaml.tftpl - config_template_vars: {num_chips: "16"} + config_template_vars: {num_chips: "32"} jobset: install: true @@ -115,4 +117,4 @@ deployment_groups: settings: apply_manifests: - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/config-map.yaml.tftpl - template_vars: {name: "xpk-gke-a3-megagpu-configmap", num_nodes: "2"} + template_vars: {name: "xpk-gke-a3-megagpu-resources-configmap", num_nodes: "4"} From 1fbd810e9383901aeba10a107cd1e2ceb09e2685 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 11 Oct 2024 20:16:36 +0000 Subject: [PATCH 068/102] deployment group name fixed --- community/examples/xpk-gke-a3-megagpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/examples/xpk-gke-a3-megagpu.yaml b/community/examples/xpk-gke-a3-megagpu.yaml index 377bf63b83..21dda328df 100644 --- a/community/examples/xpk-gke-a3-megagpu.yaml +++ b/community/examples/xpk-gke-a3-megagpu.yaml @@ -69,7 +69,7 @@ deployment_groups: - id: group_placement_1 source: modules/compute/resource-policy settings: - name: $(vars.deployment_name)-gp-np-0 + name: $(vars.deployment_name)-gp-np-1 group_placement_max_distance: 2 - id: a3_megagpu_pool_0 From 79f32bc5b8871b51d9e2051e2a361c5e2919ef0a Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 11 Oct 2024 20:52:59 +0000 Subject: [PATCH 069/102] public cluster disabled --- community/examples/xpk-gke-a3-megagpu.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/community/examples/xpk-gke-a3-megagpu.yaml b/community/examples/xpk-gke-a3-megagpu.yaml index 21dda328df..7830b4446a 100644 --- a/community/examples/xpk-gke-a3-megagpu.yaml +++ b/community/examples/xpk-gke-a3-megagpu.yaml @@ -52,11 +52,9 @@ deployment_groups: source: modules/scheduler/gke-cluster use: [network1, gpunets] settings: - enable_private_endpoint: false # Allows for access from authorized public IPs master_authorized_networks: - cidr_block: $(vars.authorized_cidr) # Allows your machine run kubectl command. 
It's required for the multi-network setup. display_name: "kubectl-access-network" - #min_master_version: "1.29.6-gke.1326000" # (TODO: Ask MaxText the reason to set this) Couldn't find this version in the valid master versions in australia-southeast1-c. Can be left unset to be set by GKE to the version of the most recent official release. system_node_pool_machine_type: "e2-standard-32" outputs: [instructions] From 0ea478daf01b7ef49214a1cdcbd95418ebf3314e Mon Sep 17 00:00:00 2001 From: Oriol Vilarrubi Date: Wed, 28 Aug 2024 17:53:11 +0200 Subject: [PATCH 070/102] Use sackd for the login nodes Substitute slurmd for the sackd daemon, this way an x-login partition is not needed. --- .../modules/slurm_files/scripts/conf.py | 22 +------------------ .../modules/slurm_files/scripts/setup.py | 14 +++++------- .../modules/slurm_files/scripts/slurmsync.py | 2 +- 3 files changed, 8 insertions(+), 30 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index 29b4076056..120ae7f1e8 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -27,7 +27,6 @@ # This file is managed by a script. Manual modifications will be overwritten. """ -login_nodeset = "x-login" def dict_to_conf(conf, delim=" ") -> str: @@ -130,24 +129,6 @@ def get(key, default): return dict_to_conf(conf_options, delim="\n") -def loginlines() -> str: - nodeset = { - "NodeSet": login_nodeset, - "Feature": login_nodeset, - } - partition = { - "PartitionName": login_nodeset, - "Nodes": login_nodeset, - "State": "UP", - "DefMemPerCPU": 1, - "Hidden": "YES", - "RootOnly": "YES", - } - lines = [ - dict_to_conf(nodeset), - dict_to_conf(partition), - ] - return "\n".join(lines) def nodeset_lines(nodeset, lkp: util.Lookup) -> str: @@ -254,7 +235,7 @@ def suspend_exc_lines(lkp: util.Lookup) -> Iterable[str]: for p in lkp.cfg.partitions.values() if len(p.partition_nodeset_dyn) > 0 ] - suspend_exc_parts = {"SuspendExcParts": [login_nodeset, *dyn_parts]} + suspend_exc_parts = {"SuspendExcParts": [*dyn_parts]} return filter( None, @@ -270,7 +251,6 @@ def make_cloud_conf(lkp: util.Lookup) -> str: lines = [ FILE_PREAMBLE, conflines(lkp), - loginlines(), *(nodeset_lines(n, lkp) for n in lkp.cfg.nodeset.values()), *(nodeset_dyn_lines(n) for n in lkp.cfg.nodeset_dyn.values()), *(nodeset_tpu_lines(n, lkp) for n in lkp.cfg.nodeset_tpu.values()), diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index 589cfeadef..5e3d8b9542 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -366,27 +366,25 @@ def setup_login(): slurmctld_host = f"{lookup().control_host}" if lookup().control_addr: slurmctld_host = f"{lookup().control_host}({lookup().control_addr})" - slurmd_options = [ + sackd_options = [ f'--conf-server="{slurmctld_host}:{lookup().control_host_port}"', - f'--conf="Feature={conf.login_nodeset}"', - "-Z", ] - sysconf = f"""SLURMD_OPTIONS='{" ".join(slurmd_options)}'""" - update_system_config("slurmd", sysconf) + sysconf = 
f"""SACKD_OPTIONS='{" ".join(sackd_options)}'""" + update_system_config("sackd", sysconf) install_custom_scripts() setup_network_storage() setup_sudoers() run("systemctl restart munge") - run("systemctl enable slurmd", timeout=30) - run("systemctl restart slurmd", timeout=30) + run("systemctl enable sackd", timeout=30) + run("systemctl restart sackd", timeout=30) run("systemctl enable --now slurmcmd.timer", timeout=30) run_custom_scripts() log.info("Check status of cluster services") run("systemctl status munge", timeout=30) - run("systemctl status slurmd", timeout=30) + run("systemctl status sackd", timeout=30) log.info("Done setting up login") diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 0d5f0e6798..5975d68e20 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -428,7 +428,7 @@ def reconfigure_slurm(): log.exception("failed to reconfigure slurmctld") util.run(f"wall '{update_msg}'", timeout=30) log.debug("Done.") - elif lookup().instance_role_safe in ["compute", "login"]: + elif lookup().instance_role_safe == "compute": log.info("Restarting slurmd to make changes take effect.") run("systemctl restart slurmd") util.run(f"wall '{update_msg}'", timeout=30) From b59f80c204950ef89e3b5c9b545441b042107bb8 Mon Sep 17 00:00:00 2001 From: Oriol Vilarrubi Date: Thu, 29 Aug 2024 22:44:49 +0200 Subject: [PATCH 071/102] Add sackd automatic restart in reconfigure --- .../modules/slurm_files/scripts/setup.py | 1 + .../modules/slurm_files/scripts/slurmsync.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index 5e3d8b9542..37532f6285 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -216,6 +216,7 @@ def setup_sudoers(): content = """ # Allow SlurmUser to manage the slurm daemons slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmd.service +slurm ALL= NOPASSWD: /usr/bin/systemctl restart sackd.service slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmctld.service """ sudoers_file = Path("/etc/sudoers.d/slurm") diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 5975d68e20..112e2d5748 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -433,6 +433,11 @@ def reconfigure_slurm(): run("systemctl restart slurmd") util.run(f"wall '{update_msg}'", timeout=30) log.debug("Done.") + elif lookup().instance_role_safe == "login": + log.info("Restarting sackd to make changes take effect.") + run("systemctl restart sackd") + util.run(f"wall '{update_msg}'", timeout=30) + log.debug("Done.") def update_topology(lkp: util.Lookup) -> None: From 
12c68cb28c1ebd79675c2481dede77164ba00d1c Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 11 Oct 2024 21:37:56 +0000 Subject: [PATCH 072/102] gke-node-pool default name conflict fixed --- modules/compute/gke-node-pool/README.md | 5 ++++- modules/compute/gke-node-pool/main.tf | 6 +++++- modules/compute/gke-node-pool/variables.tf | 2 +- modules/compute/gke-node-pool/versions.tf | 4 ++++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 03652cf29e..ec0365556e 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -248,6 +248,7 @@ limitations under the License. | [google](#requirement\_google) | ~> 5.0 | | [google-beta](#requirement\_google-beta) | ~> 5.0 | | [null](#requirement\_null) | ~> 3.0 | +| [random](#requirement\_random) | 3.6.3 | ## Providers @@ -256,6 +257,7 @@ limitations under the License. | [google](#provider\_google) | ~> 5.0 | | [google-beta](#provider\_google-beta) | ~> 5.0 | | [null](#provider\_null) | ~> 3.0 | +| [random](#provider\_random) | 3.6.3 | ## Modules @@ -277,6 +279,7 @@ limitations under the License. | [null_resource.enable_tcpx_in_workload](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [null_resource.enable_tcpxo_in_workload](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [null_resource.install_dependencies](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | +| [random_id.nodepool_name_suffix](https://registry.terraform.io/providers/hashicorp/random/3.6.3/docs/resources/id) | resource | | [google_compute_default_service_account.default_sa](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | | [google_compute_reservation.specific_reservations](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_reservation) | data source | @@ -304,7 +307,7 @@ limitations under the License. | [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | -| [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | +| [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type and a suffix with a random string. | `string` | `null` | no | | [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index f391532976..a9b8784a38 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -39,10 +39,14 @@ data "google_compute_default_service_account" "default_sa" { project = var.project_id } +resource "random_id" "nodepool_name_suffix" { + byte_length = 8 +} + resource "google_container_node_pool" "node_pool" { provider = google-beta - name = var.name == null ? var.machine_type : var.name + name = var.name == null ? "${var.machine_type}-${random_id.nodepool_name_suffix.hex}" : var.name cluster = var.cluster_id node_locations = var.zones diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index b24aef91df..069b82393f 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -31,7 +31,7 @@ variable "zones" { } variable "name" { - description = "The name of the node pool. If left blank, will default to the machine type." + description = "The name of the node pool. If left blank, will default to the machine type and a suffix with a random string." type = string default = null } diff --git a/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf index 2a27bfc342..d3a6076ed9 100644 --- a/modules/compute/gke-node-pool/versions.tf +++ b/modules/compute/gke-node-pool/versions.tf @@ -28,6 +28,10 @@ terraform { source = "hashicorp/null" version = "~> 3.0" } + random = { + source = "hashicorp/random" + version = "3.6.3" + } } provider_meta "google" { module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.40.0" From 303d676cf33e9eb55fe331f675d140c5860ca26b Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 11 Oct 2024 20:58:03 +0000 Subject: [PATCH 073/102] Remove mention of `./[community/]modules` from docs and examples * Remove mention of `./[community/]modules` from docs and examples; * Added line about not using it with toolkit modules; * Clean up outdated mentions of "role"; * Update recommendations for `startup-script.source` to use `ghpc_stage`. 
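A minimal sketch of the two conventions this change standardizes on (illustrative only;
the module IDs and the staged file name here are hypothetical):

```yaml
  # Toolkit modules are referenced by their embedded source, never by a local path:
  - id: network
    source: modules/network/vpc

  # Files shipped alongside a blueprint are staged explicitly with ghpc_stage:
  - id: startup
    source: modules/scripts/startup-script
    settings:
      runners:
      - type: shell
        destination: setup.sh                      # hypothetical file name
        source: $(ghpc_stage("scripts/setup.sh"))  # copied into the deployment group directory
```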
--- .../examples/hpc-slurm-ramble-gromacs.yaml | 1 - .../hpc-slurm-ubuntu2004-v5-legacy.yaml | 1 - community/examples/hpc-slurm-ubuntu2004.yaml | 1 - community/examples/htc-slurm-v5-legacy.yaml | 1 - community/examples/htc-slurm.yaml | 1 - .../examples/tutorial-starccm-slurm.yaml | 1 - .../README.md | 4 +- .../schedmd-slurm-gcp-v5-hybrid/README.md | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../blueprints/create-networks.yaml | 1 - docs/network_storage.md | 2 +- docs/tutorials/hpc-slurm-qwiklabs.yaml | 1 - examples/README.md | 51 ++++--------------- examples/hpc-enterprise-slurm-v5-legacy.yaml | 1 - examples/hpc-enterprise-slurm.yaml | 1 - examples/hpc-slurm-v5-legacy.yaml | 1 - examples/hpc-slurm.yaml | 1 - modules/README.md | 2 +- .../scheduler/batch-job-template/README.md | 2 +- modules/scripts/startup-script/README.md | 17 +++---- pkg/config/config_test.go | 3 -- .../daily-tests/validate_tests_metadata.py | 2 +- .../configs/versioned_blueprint.yaml | 1 - .../test_configs/2-network-interfaces.yaml | 13 +++-- .../test_configs/2filestore-4instances.yaml | 6 +-- .../test_configs/apt-collision.yaml | 3 +- .../test_configs/centos8-ss.yaml | 8 +-- .../cloud-batch-cft-instance-template.yaml | 2 +- .../test_configs/debian-ss.yaml | 8 +-- .../test_configs/exascaler-existing-vpc.yaml | 2 +- .../test_configs/exascaler-new-vpc.yaml | 2 +- .../test_configs/gpu-v5-legacy.yaml | 5 +- tools/validate_configs/test_configs/gpu.yaml | 5 +- .../test_configs/hpc-centos-ss.yaml | 8 +-- .../test_configs/instance-with-startup.yaml | 2 +- .../test_configs/new_project.yaml | 2 +- .../test_configs/nfs-servers.yaml | 4 +- .../test_configs/rocky-ss.yaml | 8 +-- .../test_configs/simple-startup.yaml | 8 +-- .../test_configs/spack-buildcache.yaml | 2 +- .../test_configs/spack-environments.yaml | 2 +- .../test_configs/startup-options.yaml | 12 ++--- .../test_configs/test_outputs.yaml | 4 +- .../test_configs/threads_per_core.yaml | 21 ++++---- .../test_configs/timeout_test.yaml | 6 +-- .../test_configs/ubuntu-ss.yaml | 8 +-- .../test_configs/vm-instance-local-ssd.yaml | 6 +-- tools/validate_configs/test_configs/vm.yaml | 4 +- 48 files changed, 100 insertions(+), 151 deletions(-) diff --git a/community/examples/hpc-slurm-ramble-gromacs.yaml b/community/examples/hpc-slurm-ramble-gromacs.yaml index 523b543c53..7a552c3477 100644 --- a/community/examples/hpc-slurm-ramble-gromacs.yaml +++ b/community/examples/hpc-slurm-ramble-gromacs.yaml @@ -31,7 +31,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network source: modules/network/vpc diff --git a/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml b/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml index 6b1875353a..916fcde74b 100644 --- a/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml +++ b/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml @@ -34,7 +34,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. 
To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc diff --git a/community/examples/hpc-slurm-ubuntu2004.yaml b/community/examples/hpc-slurm-ubuntu2004.yaml index 7e89520c05..34037a1052 100644 --- a/community/examples/hpc-slurm-ubuntu2004.yaml +++ b/community/examples/hpc-slurm-ubuntu2004.yaml @@ -33,7 +33,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc diff --git a/community/examples/htc-slurm-v5-legacy.yaml b/community/examples/htc-slurm-v5-legacy.yaml index d7ff1eccd3..1089cf9904 100644 --- a/community/examples/htc-slurm-v5-legacy.yaml +++ b/community/examples/htc-slurm-v5-legacy.yaml @@ -42,7 +42,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/pre-existing-vpc - id: network1 source: modules/network/vpc diff --git a/community/examples/htc-slurm.yaml b/community/examples/htc-slurm.yaml index 7165923bbb..9ba26025d7 100644 --- a/community/examples/htc-slurm.yaml +++ b/community/examples/htc-slurm.yaml @@ -42,7 +42,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/pre-existing-vpc - id: network source: modules/network/vpc diff --git a/community/examples/tutorial-starccm-slurm.yaml b/community/examples/tutorial-starccm-slurm.yaml index ce8dd0817f..b74eb44d33 100644 --- a/community/examples/tutorial-starccm-slurm.yaml +++ b/community/examples/tutorial-starccm-slurm.yaml @@ -32,7 +32,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md index 8db3950334..f0fb08ee1d 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md @@ -17,7 +17,7 @@ be accessed as `tpu` partition. ```yaml - id: tpu_nodeset - source: ./community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu use: [network] settings: node_type: v2-8 @@ -27,7 +27,7 @@ be accessed as `tpu` partition. 
preserve_tpu: false - id: tpu_partition - source: ./community/modules/compute/schedmd-slurm-gcp-v6-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: [tpu_nodeset] settings: partition_name: tpu diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index 9822d36eab..1d62cd393d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -73,7 +73,7 @@ The hybrid module can be added to a blueprint as follows: ```yaml - id: slurm-controller - source: ./community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid + source: community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid use: - debug-partition - compute-partition diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index a9d801d8c7..45d662f7d6 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -134,7 +134,7 @@ example: ```yaml - id: controller - source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: [ network, partition ] settings: enable_slurm_gcp_plugins: diff --git a/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml b/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml index 8c880d1c4c..19f1601f35 100644 --- a/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml +++ b/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml @@ -28,7 +28,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network0 source: modules/network/vpc settings: diff --git a/docs/network_storage.md b/docs/network_storage.md index e5f8903eb7..28a39594d6 100644 --- a/docs/network_storage.md +++ b/docs/network_storage.md @@ -41,7 +41,7 @@ as shown below: settings: {local_mount: /home} - id: workstation - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, homefs] # Note this line ``` diff --git a/docs/tutorials/hpc-slurm-qwiklabs.yaml b/docs/tutorials/hpc-slurm-qwiklabs.yaml index 3a51b77f7a..f4bfc81941 100644 --- a/docs/tutorials/hpc-slurm-qwiklabs.yaml +++ b/docs/tutorials/hpc-slurm-qwiklabs.yaml @@ -30,7 +30,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network source: modules/network/vpc diff --git a/examples/README.md b/examples/README.md index 0275fd930c..cc3e7a4412 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1730,10 +1730,9 @@ vars: deployment_groups: - group: groupName modules: - - # Local source, prefixed with ./ (/ and ../ also accepted) + # Embedded module (part of the toolkit), prefixed with `modules/` or `community/modules` - id: # Required: Name of this module used to uniquely identify it. - source: ./modules/role/module-name # Required: Points to the module directory. + source: modules/role/module-name # Required kind: < terraform | packer > # Optional: Type of module, currently choose from terraform or packer. 
If not specified, `kind` will default to `terraform` # Optional: All configured settings for the module. For terraform, each # variable listed in variables.tf can be set here, and are mandatory if no @@ -1747,14 +1746,18 @@ deployment_groups: key3a: value3a key3b: value3b - # Embedded module (part of the toolkit), prefixed with modules/ - - source: modules/role/module-name - # GitHub module over SSH, prefixed with git@github.com - - source: git@github.com:org/repo.git//modules/role/module-name + - source: git@github.com:org/repo.git//path/to/module # GitHub module over HTTPS, prefixed with github.com - - source: github.com/org/repo//modules/role/module-name + - source: github.com/org/repo//path/to/module + + # Local absolute source, prefixed with / + - source: /path/to/module + + # Local relative (to current working directory) source, prefixed with ./ or ../ + - source: ../path/to/module + # NOTE: Do not reference toolkit modules by local source, use embedded source instead. ``` ## Writing an HPC Blueprint @@ -1847,38 +1850,6 @@ When possible, custom modules should use these roles so that they match other modules defined by the toolkit. If a custom module does not fit into these roles, a new role can be defined. -A module's parent folder will define the module’s role if possible. Therefore, -regardless of where the module is located, the module directory should be -explicitly referenced at least 2 layers deep, where the top layer refers to the -“role” of that module. - -If a module is not defined at least 2 layers deep and the `ghpc_role` label has -not been explicitly set in settings, ghpc_role will default to `undefined`. - -Below we show some of the core modules and their roles (as parent folders). - -```text -modules/ -└── < - └── <> - -modules/ -├── compute -│ └── vm-instance -├── file-system -│ ├── pre-existing-network-storage -│ └── filestore -├── monitoring -│ └── dashboard -├── network -│ ├── pre-existing-vpc -│ └── vpc -├── packer -│ └── custom-image -└── scripts - └── startup-script -``` - ### Deployment Groups Deployment groups allow distinct sets of modules to be defined and deployed as a diff --git a/examples/hpc-enterprise-slurm-v5-legacy.yaml b/examples/hpc-enterprise-slurm-v5-legacy.yaml index 7c79b818ec..e482a10d15 100644 --- a/examples/hpc-enterprise-slurm-v5-legacy.yaml +++ b/examples/hpc-enterprise-slurm-v5-legacy.yaml @@ -53,7 +53,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network1 source: modules/network/pre-existing-vpc diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index d7520d3b85..69aeab57dc 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -46,7 +46,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network source: modules/network/vpc diff --git a/examples/hpc-slurm-v5-legacy.yaml b/examples/hpc-slurm-v5-legacy.yaml index 4a5277ee3b..234277208d 100644 --- a/examples/hpc-slurm-v5-legacy.yaml +++ b/examples/hpc-slurm-v5-legacy.yaml @@ -30,7 +30,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. 
To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc diff --git a/examples/hpc-slurm.yaml b/examples/hpc-slurm.yaml index 0a90bdcc89..0736772569 100644 --- a/examples/hpc-slurm.yaml +++ b/examples/hpc-slurm.yaml @@ -30,7 +30,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network source: modules/network/vpc diff --git a/modules/README.md b/modules/README.md index defba11446..e4974e3264 100644 --- a/modules/README.md +++ b/modules/README.md @@ -350,7 +350,7 @@ following module definition refers the local pre-existing-vpc modules. ```yaml - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc ``` > **_NOTE:_** Relative paths (beginning with `.` or `..` must be relative to the diff --git a/modules/scheduler/batch-job-template/README.md b/modules/scheduler/batch-job-template/README.md index 2bbb68fb07..a3a0b176b6 100644 --- a/modules/scheduler/batch-job-template/README.md +++ b/modules/scheduler/batch-job-template/README.md @@ -94,7 +94,7 @@ deployment_groups: source_image_project: cloud-hpc-image-public - id: batch-job - source: ./modules/scheduler/batch-job-template + source: modules/scheduler/batch-job-template settings: instance_template: $(batch-compute-template.self_link) outputs: [instructions] diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index e7f696d08f..c7dc178ae3 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -29,12 +29,11 @@ Each runner receives the following attributes: not. - `source`: (Optional) A path to the file or data you want to upload. Must be defined if `content` is not. The source path is relative to the deployment - group directory. Scripts distributed as part of modules should start with - `modules/` followed by the name of the module used (not to be confused with - the module ID) and the path to the script. The format is shown below: + group directory. To ensure correctness of path use `ghpc_stage` function, that + would copy referenced file to the deployment group directory. 
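A fuller sketch of a runner that stages a local file this way (the module ID, script name, and path below are illustrative, not part of this change):

```yaml
- id: startup
  source: modules/scripts/startup-script
  settings:
    runners:
    - type: shell
      destination: install_deps.sh                       # file name created on the target VM
      source: $(ghpc_stage("scripts/install_deps.sh"))   # staged into the deployment group directory
```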
For example: - ```text - source: ./modules/<>/<> + ```yaml + source: $(ghpc_stage("path/to/file")) ``` For more examples with context, see the @@ -188,7 +187,7 @@ For official documentation see troubleshooting docs: ```yaml - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: runners: # Some modules such as filestore have runners as outputs for convenience: @@ -212,7 +211,7 @@ For official documentation see troubleshooting docs: args: "bar.tgz 'Expanding file'" - id: compute-cluster - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [homefs, startup] ``` @@ -222,13 +221,13 @@ they are able to do so by using the `gcs_bucket_path` as shown in the below exam ```yaml - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: gcs_bucket_path: gs://user-test-bucket/folder1/folder2 install_stackdriver_agent: true - id: compute-cluster - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [startup] ``` diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 8c239a7971..2caabf2697 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -537,9 +537,6 @@ func (s *zeroSuite) TestCheckMovedModules(c *C) { // embedded moved c.Check(checkMovedModule("community/modules/scheduler/cloud-batch-job"), NotNil) - - // local moved - c.Assert(checkMovedModule("./community/modules/scheduler/cloud-batch-job"), NotNil) } func (s *zeroSuite) TestCheckStringLiteral(c *C) { diff --git a/tools/cloud-build/daily-tests/validate_tests_metadata.py b/tools/cloud-build/daily-tests/validate_tests_metadata.py index 5f0e60bb66..ee9f4ed6d9 100644 --- a/tools/cloud-build/daily-tests/validate_tests_metadata.py +++ b/tools/cloud-build/daily-tests/validate_tests_metadata.py @@ -42,7 +42,7 @@ def module_tag(src: str) -> Optional[str]: Remote sources are not supported (None). Ex: "modules/network/vpc" -> "m.vpc" """ - if not src.startswith(("modules/", "community/modules/", "./modules/", "./community/modules/")): + if not src.startswith(("modules/", "community/modules/")): return None return f"m.{os.path.basename(src)}" diff --git a/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml b/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml index 6344dd8d76..5240404a3c 100644 --- a/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml +++ b/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml @@ -48,7 +48,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network source: modules/network/vpc diff --git a/tools/validate_configs/test_configs/2-network-interfaces.yaml b/tools/validate_configs/test_configs/2-network-interfaces.yaml index f721e06893..dce54ba04c 100644 --- a/tools/validate_configs/test_configs/2-network-interfaces.yaml +++ b/tools/validate_configs/test_configs/2-network-interfaces.yaml @@ -27,7 +27,6 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. 
To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: default-network source: modules/network/pre-existing-vpc @@ -52,7 +51,7 @@ deployment_groups: # Test adding a pre-existing network via "use" - id: one-used-existing-ni - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: - default-network settings: @@ -61,7 +60,7 @@ deployment_groups: # Test adding a newly created network via "use" - id: one-used-new-ni - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: - new-network-1 settings: @@ -70,7 +69,7 @@ deployment_groups: # Test adding one pre-existing network via "network_interfaces" - id: one-explicit-existing-ni - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance settings: name_prefix: one-explicit-existing-ni machine_type: n2-standard-2 @@ -88,7 +87,7 @@ deployment_groups: # Test adding one newly created network via "network_interfaces" - id: one-explicit-new-ni - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance settings: name_prefix: one-explicit-new-ni machine_type: n2-standard-2 @@ -106,7 +105,7 @@ deployment_groups: # Test adding both a pre-existing network and a newly created network via "network_interfaces" - id: two-explicit-mixed-ni - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance settings: name_prefix: two-explicit-mixed-ni network_interfaces: @@ -136,7 +135,7 @@ deployment_groups: # Test adding two newly created networks via "network_interfaces" - id: two-explicit-new-ni - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance settings: name_prefix: two-explicit-new-ni network_interfaces: diff --git a/tools/validate_configs/test_configs/2filestore-4instances.yaml b/tools/validate_configs/test_configs/2filestore-4instances.yaml index 39fc4ad6cb..4239580c43 100644 --- a/tools/validate_configs/test_configs/2filestore-4instances.yaml +++ b/tools/validate_configs/test_configs/2filestore-4instances.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: infrastructure modules: - id: network - source: ./modules/network/vpc + source: modules/network/vpc - id: homefs source: modules/file-system/filestore @@ -38,7 +38,7 @@ deployment_groups: ghpc_role: storage-home - id: apps - source: ./modules/file-system/filestore + source: modules/file-system/filestore use: [network] settings: name: apps @@ -47,7 +47,7 @@ deployment_groups: ghpc_role: storage-apps - id: license-server-1 - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network, homefs] settings: name_prefix: ls1 diff --git a/tools/validate_configs/test_configs/apt-collision.yaml b/tools/validate_configs/test_configs/apt-collision.yaml index 9ab7a7e8a3..987fa2c159 100644 --- a/tools/validate_configs/test_configs/apt-collision.yaml +++ b/tools/validate_configs/test_configs/apt-collision.yaml @@ -32,13 +32,12 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. 
To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc ## Network - source: modules/network/vpc kind: terraform id: network1 - - source: ./modules/scripts/startup-script + - source: modules/scripts/startup-script kind: terraform id: startup settings: diff --git a/tools/validate_configs/test_configs/centos8-ss.yaml b/tools/validate_configs/test_configs/centos8-ss.yaml index 30a25c1728..ede36e2f10 100644 --- a/tools/validate_configs/test_configs/centos8-ss.yaml +++ b/tools/validate_configs/test_configs/centos8-ss.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: appsfs source: modules/file-system/filestore @@ -42,7 +42,7 @@ deployment_groups: auto_delete_disk: true - id: spack-setup - source: ./community/modules/scripts/spack-setup + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack @@ -58,7 +58,7 @@ deployment_groups: spack install cmake%gcc@10.3.0 target=x86_64 - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: runners: - type: data @@ -74,7 +74,7 @@ deployment_groups: - $(spack-execute.spack_runner) - id: instance - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, startup, nfs, appsfs] settings: machine_type: e2-standard-4 diff --git a/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml b/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml index 55446e69b6..2df9ca1276 100644 --- a/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml +++ b/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml @@ -59,7 +59,7 @@ deployment_groups: source_image_project: cloud-hpc-image-public - id: batch-job - source: ./modules/scheduler/batch-job-template + source: modules/scheduler/batch-job-template use: [network1, appfs, batch-startup-script] settings: runnable: "cat /sw/hello.txt" diff --git a/tools/validate_configs/test_configs/debian-ss.yaml b/tools/validate_configs/test_configs/debian-ss.yaml index b2a4a3e515..4cf9b7fad8 100644 --- a/tools/validate_configs/test_configs/debian-ss.yaml +++ b/tools/validate_configs/test_configs/debian-ss.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: appsfs source: modules/file-system/filestore @@ -42,7 +42,7 @@ deployment_groups: auto_delete_disk: true - id: spack-setup - source: ./community/modules/scripts/spack-setup + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack @@ -58,7 +58,7 @@ deployment_groups: spack install cmake%gcc@10.3.0 target=x86_64 - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: runners: - type: data @@ -74,7 +74,7 @@ deployment_groups: - $(spack-execute.spack_runner) - id: instance - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, startup, nfs, appsfs] settings: machine_type: e2-standard-4 diff --git a/tools/validate_configs/test_configs/exascaler-existing-vpc.yaml b/tools/validate_configs/test_configs/exascaler-existing-vpc.yaml index 3215ab4e1c..3476e7e10b 100644 --- a/tools/validate_configs/test_configs/exascaler-existing-vpc.yaml +++ b/tools/validate_configs/test_configs/exascaler-existing-vpc.yaml 
@@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: scratchfs source: community/modules/file-system/DDN-EXAScaler diff --git a/tools/validate_configs/test_configs/exascaler-new-vpc.yaml b/tools/validate_configs/test_configs/exascaler-new-vpc.yaml index 936ab51aa1..cca09d56c5 100644 --- a/tools/validate_configs/test_configs/exascaler-new-vpc.yaml +++ b/tools/validate_configs/test_configs/exascaler-new-vpc.yaml @@ -29,7 +29,7 @@ deployment_groups: source: modules/network/vpc - id: scratchfs - source: ./community/modules/file-system/DDN-EXAScaler + source: community/modules/file-system/DDN-EXAScaler use: [network1] settings: local_mount: /scratch diff --git a/tools/validate_configs/test_configs/gpu-v5-legacy.yaml b/tools/validate_configs/test_configs/gpu-v5-legacy.yaml index 611ec58e43..16f4a9fde8 100644 --- a/tools/validate_configs/test_configs/gpu-v5-legacy.yaml +++ b/tools/validate_configs/test_configs/gpu-v5-legacy.yaml @@ -62,12 +62,11 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network1 source: modules/network/pre-existing-vpc - id: nogpu-n1 - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: - network1 settings: @@ -76,7 +75,7 @@ deployment_groups: instance_image: $(vars.instance_image_vm) - id: manual-n1 - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: - network1 settings: diff --git a/tools/validate_configs/test_configs/gpu.yaml b/tools/validate_configs/test_configs/gpu.yaml index e8064e1534..f12bd323f1 100644 --- a/tools/validate_configs/test_configs/gpu.yaml +++ b/tools/validate_configs/test_configs/gpu.yaml @@ -62,12 +62,11 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. 
To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network source: modules/network/pre-existing-vpc - id: nogpu-n1 - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: - network settings: @@ -76,7 +75,7 @@ deployment_groups: instance_image: $(vars.instance_image_vm) - id: manual-n1 - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: - network settings: diff --git a/tools/validate_configs/test_configs/hpc-centos-ss.yaml b/tools/validate_configs/test_configs/hpc-centos-ss.yaml index 076afa9052..6937edc13d 100644 --- a/tools/validate_configs/test_configs/hpc-centos-ss.yaml +++ b/tools/validate_configs/test_configs/hpc-centos-ss.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: appsfs source: modules/file-system/filestore @@ -42,7 +42,7 @@ deployment_groups: auto_delete_disk: true - id: spack-setup - source: ./community/modules/scripts/spack-setup + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack @@ -58,7 +58,7 @@ deployment_groups: spack install cmake%gcc@10.3.0 target=x86_64 - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: runners: - type: data @@ -74,7 +74,7 @@ deployment_groups: - $(spack-execute.spack_runner) - id: instance - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, startup, nfs, appsfs] settings: machine_type: e2-standard-4 diff --git a/tools/validate_configs/test_configs/instance-with-startup.yaml b/tools/validate_configs/test_configs/instance-with-startup.yaml index fd6fb76c2e..caeb85ae63 100644 --- a/tools/validate_configs/test_configs/instance-with-startup.yaml +++ b/tools/validate_configs/test_configs/instance-with-startup.yaml @@ -43,6 +43,6 @@ deployment_groups: machine_type: e2-standard-8 - id: wait - source: ./community/modules/scripts/wait-for-startup + source: community/modules/scripts/wait-for-startup settings: instance_name: $(workstation.name[0]) diff --git a/tools/validate_configs/test_configs/new_project.yaml b/tools/validate_configs/test_configs/new_project.yaml index 069ac9587d..6a352b169d 100644 --- a/tools/validate_configs/test_configs/new_project.yaml +++ b/tools/validate_configs/test_configs/new_project.yaml @@ -24,7 +24,7 @@ deployment_groups: - group: primary modules: - id: project - source: ./community/modules/project/new-project + source: community/modules/project/new-project settings: folder_id: 334688113020 # random number billing_account: 111110-M2N704-854685 # random billing number diff --git a/tools/validate_configs/test_configs/nfs-servers.yaml b/tools/validate_configs/test_configs/nfs-servers.yaml index 676d8feb59..126ac220e5 100644 --- a/tools/validate_configs/test_configs/nfs-servers.yaml +++ b/tools/validate_configs/test_configs/nfs-servers.yaml @@ -37,7 +37,7 @@ deployment_groups: auto_delete_disk: true - id: appsfs - source: ./community/modules/file-system/nfs-server + source: community/modules/file-system/nfs-server use: [network1] outputs: [network_storage] settings: @@ -45,7 +45,7 @@ deployment_groups: auto_delete_disk: true - id: multiple-local-mounts - source: ./community/modules/file-system/nfs-server + source: community/modules/file-system/nfs-server use: [network1] outputs: [network_storage] settings: diff --git 
a/tools/validate_configs/test_configs/rocky-ss.yaml b/tools/validate_configs/test_configs/rocky-ss.yaml index 53a5c17cc7..c7bc912dbf 100644 --- a/tools/validate_configs/test_configs/rocky-ss.yaml +++ b/tools/validate_configs/test_configs/rocky-ss.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: appsfs source: modules/file-system/filestore @@ -45,7 +45,7 @@ deployment_groups: auto_delete_disk: true - id: spack-setup - source: ./community/modules/scripts/spack-setup + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack @@ -61,7 +61,7 @@ deployment_groups: spack install cmake%gcc@10.3.0 target=x86_64 - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: runners: - type: data @@ -77,7 +77,7 @@ deployment_groups: - $(spack-execute.spack_runner) - id: instance - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, startup, nfs, appsfs] settings: machine_type: e2-standard-4 diff --git a/tools/validate_configs/test_configs/simple-startup.yaml b/tools/validate_configs/test_configs/simple-startup.yaml index 284a1e3855..8c4016eb1d 100644 --- a/tools/validate_configs/test_configs/simple-startup.yaml +++ b/tools/validate_configs/test_configs/simple-startup.yaml @@ -26,10 +26,10 @@ deployment_groups: - group: primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: runners: - type: data @@ -44,12 +44,12 @@ deployment_groups: args: "foo.tgz 'Expanding the file'" - id: instance - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, startup] settings: machine_type: e2-standard-4 - id: waiter - source: ./community/modules/scripts/wait-for-startup + source: community/modules/scripts/wait-for-startup settings: instance_name: $(instance.name[0]) diff --git a/tools/validate_configs/test_configs/spack-buildcache.yaml b/tools/validate_configs/test_configs/spack-buildcache.yaml index aaba729742..13f2930f05 100644 --- a/tools/validate_configs/test_configs/spack-buildcache.yaml +++ b/tools/validate_configs/test_configs/spack-buildcache.yaml @@ -29,7 +29,7 @@ deployment_groups: source: modules/network/pre-existing-vpc - id: spack-setup - source: ./community/modules/scripts/spack-setup + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack diff --git a/tools/validate_configs/test_configs/spack-environments.yaml b/tools/validate_configs/test_configs/spack-environments.yaml index 94a86065f4..eb260a915f 100644 --- a/tools/validate_configs/test_configs/spack-environments.yaml +++ b/tools/validate_configs/test_configs/spack-environments.yaml @@ -29,7 +29,7 @@ deployment_groups: source: modules/network/pre-existing-vpc - id: spack-setup - source: ./community/modules/scripts/spack-setup + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack diff --git a/tools/validate_configs/test_configs/startup-options.yaml b/tools/validate_configs/test_configs/startup-options.yaml index 2d4fd1b303..4ca1555cd7 100644 --- a/tools/validate_configs/test_configs/startup-options.yaml +++ b/tools/validate_configs/test_configs/startup-options.yaml @@ -26,10 +26,10 @@ deployment_groups: - group: 
primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: ansible_virtualenv_path: /usr/local/ghpc runners: @@ -48,7 +48,7 @@ deployment_groups: destination: empty_tasks.yaml - id: instance-explicit-startup - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1] settings: name_prefix: explicit @@ -56,21 +56,21 @@ deployment_groups: startup_script: $(startup.startup_script) - id: instance-no-startup - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1] settings: name_prefix: no-startup machine_type: e2-standard-4 - id: instance-use-startup - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, startup] settings: name_prefix: use-startup machine_type: e2-standard-4 - id: instance-metadata-startup - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1] settings: name_prefix: metadata-startup diff --git a/tools/validate_configs/test_configs/test_outputs.yaml b/tools/validate_configs/test_configs/test_outputs.yaml index 5de0b7bc21..6bb0bb48d2 100644 --- a/tools/validate_configs/test_configs/test_outputs.yaml +++ b/tools/validate_configs/test_configs/test_outputs.yaml @@ -47,7 +47,7 @@ deployment_groups: - install_nfs_client - id: nfs - source: ./community/modules/file-system/nfs-server + source: community/modules/file-system/nfs-server outputs: - network_storage - install_nfs_client @@ -131,7 +131,7 @@ deployment_groups: - startup_script - id: lustre - source: ./community/modules/file-system/DDN-EXAScaler + source: community/modules/file-system/DDN-EXAScaler outputs: - private_addresses - ssh_console diff --git a/tools/validate_configs/test_configs/threads_per_core.yaml b/tools/validate_configs/test_configs/threads_per_core.yaml index de06cab879..ed513aedfa 100644 --- a/tools/validate_configs/test_configs/threads_per_core.yaml +++ b/tools/validate_configs/test_configs/threads_per_core.yaml @@ -27,13 +27,12 @@ deployment_groups: modules: # Source is an embedded module, denoted by "modules/*" without ./, ../, / # as a prefix. 
To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - id: network1 source: modules/network/pre-existing-vpc kind: terraform - id: n1-2-threads - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -43,7 +42,7 @@ deployment_groups: threads_per_core: 2 - id: n1-1-thread - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -53,7 +52,7 @@ deployment_groups: threads_per_core: 1 - id: n1-0-threads - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -63,7 +62,7 @@ deployment_groups: threads_per_core: 0 - id: n1-null-threads - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -73,7 +72,7 @@ deployment_groups: threads_per_core: null - id: n2-2-threads - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -83,7 +82,7 @@ deployment_groups: threads_per_core: 2 - id: n2-1-thread - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -93,7 +92,7 @@ deployment_groups: threads_per_core: 1 - id: c2-2-threads - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -103,7 +102,7 @@ deployment_groups: threads_per_core: 2 - id: c2-1-thread - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -113,7 +112,7 @@ deployment_groups: threads_per_core: 1 - id: e2-medium-0-thread - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 @@ -123,7 +122,7 @@ deployment_groups: threads_per_core: 0 - id: e2-medium-null-thread - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance kind: terraform use: - network1 diff --git a/tools/validate_configs/test_configs/timeout_test.yaml b/tools/validate_configs/test_configs/timeout_test.yaml index 95305e7923..70a9d37517 100644 --- a/tools/validate_configs/test_configs/timeout_test.yaml +++ b/tools/validate_configs/test_configs/timeout_test.yaml @@ -32,21 +32,21 @@ deployment_groups: source: modules/network/vpc - id: gcs - source: ./modules/file-system/pre-existing-network-storage + source: modules/file-system/pre-existing-network-storage settings: remote_mount: hpc-toolkit-service-catalog-solutions local_mount: /catalog fs_type: gcsfuse - id: compute-hpc-image - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, gcs] settings: machine_type: n2-standard-2 name_prefix: hpc-image - id: wait - source: ./community/modules/scripts/wait-for-startup + source: community/modules/scripts/wait-for-startup settings: instance_name: $(compute-hpc-image.name[0]) timeout: 25 diff --git a/tools/validate_configs/test_configs/ubuntu-ss.yaml b/tools/validate_configs/test_configs/ubuntu-ss.yaml index 67a15a8437..8f1d40a0ce 100644 --- a/tools/validate_configs/test_configs/ubuntu-ss.yaml +++ b/tools/validate_configs/test_configs/ubuntu-ss.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: appsfs source: modules/file-system/filestore @@ -42,7 +42,7 @@ deployment_groups: auto_delete_disk: true - id: spack-setup - source: 
./community/modules/scripts/spack-setup + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack @@ -61,7 +61,7 @@ deployment_groups: spack install fftw%intel@18.0.5 target=skylake ^intel-mpi@2018.4.274%intel@18.0.5 target=x86_64 - id: startup - source: ./modules/scripts/startup-script + source: modules/scripts/startup-script settings: install_stackdriver_agent: true runners: @@ -78,7 +78,7 @@ deployment_groups: - $(spack-execute.spack_runner) - id: instance - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1, startup, nfs, appsfs] settings: machine_type: e2-standard-4 diff --git a/tools/validate_configs/test_configs/vm-instance-local-ssd.yaml b/tools/validate_configs/test_configs/vm-instance-local-ssd.yaml index 7adcc33496..467e0641fa 100644 --- a/tools/validate_configs/test_configs/vm-instance-local-ssd.yaml +++ b/tools/validate_configs/test_configs/vm-instance-local-ssd.yaml @@ -26,10 +26,10 @@ deployment_groups: - group: primary modules: - id: network1 - source: ./modules/network/pre-existing-vpc + source: modules/network/pre-existing-vpc - id: multi-instance-multi-ssd - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1] settings: machine_type: n2-standard-16 @@ -37,7 +37,7 @@ deployment_groups: local_ssd_count: 2 - id: instance-ssd-interface-defined - source: ./modules/compute/vm-instance + source: modules/compute/vm-instance use: [network1] settings: machine_type: n2-standard-16 diff --git a/tools/validate_configs/test_configs/vm.yaml b/tools/validate_configs/test_configs/vm.yaml index 8de1ce67e2..4941721125 100644 --- a/tools/validate_configs/test_configs/vm.yaml +++ b/tools/validate_configs/test_configs/vm.yaml @@ -30,7 +30,7 @@ deployment_groups: - id: network1 source: modules/network/pre-existing-vpc - - source: ./modules/compute/vm-instance + - source: modules/compute/vm-instance id: compute_instances_family use: [network1] settings: @@ -47,7 +47,7 @@ deployment_groups: # project: $(vars.project_id) # family: myubuntu - - source: ./modules/compute/vm-instance + - source: modules/compute/vm-instance id: compute_instances_name use: [network1] settings: From ca66b8c6e3e55eeb26f90e928cc9c6a43ca46acd Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Wed, 9 Oct 2024 18:47:35 +0000 Subject: [PATCH 074/102] guest_accelerator modifications --- .../compute/htcondor-execute-point/gpu_definition.tf | 4 ++-- .../schedmd-slurm-gcp-v5-node-group/gpu_definition.tf | 4 ++-- .../schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf | 4 ++-- .../compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf | 4 ++-- .../schedmd-slurm-gcp-v5-controller/gpu_definition.tf | 4 ++-- .../scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/gpu_definition.tf | 4 ++-- .../scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf | 4 ++-- modules/compute/gke-node-pool/gpu_definition.tf | 4 ++-- modules/compute/gke-node-pool/main.tf | 6 +++--- modules/compute/gke-node-pool/reservation_definitions.tf | 2 +- modules/compute/vm-instance/gpu_definition.tf | 4 ++-- modules/compute/vm-instance/main.tf | 2 +- 13 files changed, 25 insertions(+), 25 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/gpu_definition.tf b/community/modules/compute/htcondor-execute-point/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/community/modules/compute/htcondor-execute-point/gpu_definition.tf +++ 
b/community/modules/compute/htcondor-execute-point/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - 
generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) 
var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/modules/compute/gke-node-pool/gpu_definition.tf b/modules/compute/gke-node-pool/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/modules/compute/gke-node-pool/gpu_definition.tf +++ b/modules/compute/gke-node-pool/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index f391532976..f7ef813496 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -23,7 +23,7 @@ locals { sa_email = var.service_account_email != null ? var.service_account_email : data.google_compute_default_service_account.default_sa.email preattached_gpu_machine_family = contains(["a2", "a3", "g2"], local.machine_family) - has_gpu = (local.guest_accelerator != null && length(local.guest_accelerator) > 0) || local.preattached_gpu_machine_family + has_gpu = (local.guest_accelerator != null && (length([for ga in local.guest_accelerator : ga if ga.count > 0]) > 0)) || local.preattached_gpu_machine_family gpu_taint = local.has_gpu ? 
[{ key = "nvidia.com/gpu" value = "present" @@ -89,13 +89,13 @@ resource "google_container_node_pool" "node_pool" { image_type = var.image_type dynamic "guest_accelerator" { - for_each = local.guest_accelerator + for_each = { for idx, ga in local.guest_accelerator : idx => ga if ga.count > 0 } content { type = coalesce(guest_accelerator.value.type, try(local.generated_guest_accelerator[0].type, "")) count = coalesce(try(guest_accelerator.value.count, 0) > 0 ? guest_accelerator.value.count : try(local.generated_guest_accelerator[0].count, "0")) gpu_driver_installation_config = coalescelist(try(guest_accelerator.value.gpu_driver_installation_config, []), [{ gpu_driver_version = "DEFAULT" }]) gpu_partition_size = try(guest_accelerator.value.gpu_partition_size, "") - gpu_sharing_config = try(guest_accelerator.value.gpu_sharing_config, []) + gpu_sharing_config = try(guest_accelerator.value.gpu_sharing_config, null) } } diff --git a/modules/compute/gke-node-pool/reservation_definitions.tf b/modules/compute/gke-node-pool/reservation_definitions.tf index d40cc5b01f..a75246b185 100644 --- a/modules/compute/gke-node-pool/reservation_definitions.tf +++ b/modules/compute/gke-node-pool/reservation_definitions.tf @@ -55,7 +55,7 @@ locals { }] nodepool_vm_properties = { "machine_type" : var.machine_type - "guest_accelerators" : { for acc in try(local.guest_accelerator, []) : coalesce(acc.type, try(local.generated_guest_accelerator[0].type, "")) => coalesce(acc.count, try(local.generated_guest_accelerator[0].count, 0)) }, + "guest_accelerators" : { for acc in try(local.guest_accelerator, []) : (acc.count > 0 ? coalesce(acc.type, try(local.generated_guest_accelerator[0].type, "")) : "") => acc.count if acc.count > 0 }, "local_ssds" : { "NVME" : coalesce(local.local_ssd_config.local_ssd_count_nvme_block, 0), "SCSI" : coalesce(local.local_ssd_config.local_ssd_count_ephemeral_storage, 0) diff --git a/modules/compute/vm-instance/gpu_definition.tf b/modules/compute/vm-instance/gpu_definition.tf index 6c5d96d286..c6c3944332 100644 --- a/modules/compute/vm-instance/gpu_definition.tf +++ b/modules/compute/vm-instance/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) } diff --git a/modules/compute/vm-instance/main.tf b/modules/compute/vm-instance/main.tf index 683fa77682..01207d701f 100644 --- a/modules/compute/vm-instance/main.tf +++ b/modules/compute/vm-instance/main.tf @@ -39,7 +39,7 @@ locals { # compact_placement : true when placement policy is provided and collocation set; false if unset compact_placement = try(var.placement_policy.collocation, null) != null - gpu_attached = contains(["a2", "g2"], local.machine_family) || length(local.guest_accelerator) > 0 + gpu_attached = contains(["a2", "g2"], local.machine_family) || (length([for ga in local.guest_accelerator : ga if ga.count > 0]) > 0) # both of these must be false 
if either compact placement or preemptible/spot instances are used # automatic restart is tolerant of GPUs while on host maintenance is not From 12f6d3fedbc59fdc4b86f10ac222b081e3dbb09e Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Fri, 11 Oct 2024 23:50:51 +0000 Subject: [PATCH 075/102] random_id replaced by module_id --- modules/compute/gke-node-pool/README.md | 6 ++---- modules/compute/gke-node-pool/main.tf | 8 +++----- modules/compute/gke-node-pool/metadata.yaml | 2 ++ modules/compute/gke-node-pool/variables.tf | 10 +++++++++- modules/compute/gke-node-pool/versions.tf | 4 ---- 5 files changed, 16 insertions(+), 14 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index ec0365556e..797327807c 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -248,7 +248,6 @@ limitations under the License. | [google](#requirement\_google) | ~> 5.0 | | [google-beta](#requirement\_google-beta) | ~> 5.0 | | [null](#requirement\_null) | ~> 3.0 | -| [random](#requirement\_random) | 3.6.3 | ## Providers @@ -257,7 +256,6 @@ limitations under the License. | [google](#provider\_google) | ~> 5.0 | | [google-beta](#provider\_google-beta) | ~> 5.0 | | [null](#provider\_null) | ~> 3.0 | -| [random](#provider\_random) | 3.6.3 | ## Modules @@ -279,7 +277,6 @@ limitations under the License. | [null_resource.enable_tcpx_in_workload](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [null_resource.enable_tcpxo_in_workload](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [null_resource.install_dependencies](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [random_id.nodepool_name_suffix](https://registry.terraform.io/providers/hashicorp/random/3.6.3/docs/resources/id) | resource | | [google_compute_default_service_account.default_sa](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | | [google_compute_reservation.specific_reservations](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_reservation) | data source | @@ -301,13 +298,14 @@ limitations under the License. | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | +| [ineternal\_filed\_to\_be\_used\_by\_secret\_coven\_of\_mages\_do\_not\_touch](#input\_ineternal\_filed\_to\_be\_used\_by\_secret\_coven\_of\_mages\_do\_not\_touch) | Populates with module id (unique blueprint-wide). | `string` | n/a | yes | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | -| [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type and a suffix with a random string. | `string` | `null` | no | +| [name](#input\_name) | The name of the node pool. If not set, automatically populated by machine type and module id (unique blueprint-wide) as suffix.
If setting manually, ensure a unique value across all gke-node-pools. | `string` | `null` | no | | [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no |
| [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes |
| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs to be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index a9b8784a38..6bd6f74274 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -33,20 +33,18 @@ locals { autoscale_set = var.autoscaling_total_min_nodes != 0 || var.autoscaling_total_max_nodes != 1000 static_node_set = var.static_node_count != null initial_node_set = try(var.initial_node_count > 0, false) + + module_unique_id = replace(lower(var.ineternal_filed_to_be_used_by_secret_coven_of_mages_do_not_touch), "/[^a-z0-9]/", "") } data "google_compute_default_service_account" "default_sa" { project = var.project_id } -resource "random_id" "nodepool_name_suffix" { - byte_length = 8 -} - resource "google_container_node_pool" "node_pool" { provider = google-beta - name = var.name == null ? "${var.machine_type}-${random_id.nodepool_name_suffix.hex}" : var.name + name = var.name == null ? "${var.machine_type}-${local.module_unique_id}" : var.name cluster = var.cluster_id node_locations = var.zones diff --git a/modules/compute/gke-node-pool/metadata.yaml b/modules/compute/gke-node-pool/metadata.yaml index bd1517ce8f..06ab5ae608 100644 --- a/modules/compute/gke-node-pool/metadata.yaml +++ b/modules/compute/gke-node-pool/metadata.yaml @@ -17,3 +17,5 @@ spec: requirements: services: - container.googleapis.com +ghpc: + inject_module_id: ineternal_filed_to_be_used_by_secret_coven_of_mages_do_not_touch diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 069b82393f..c85e524fcc 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -31,11 +31,19 @@ variable "zones" { } variable "name" { - description = "The name of the node pool. If left blank, will default to the machine type and a suffix with a random string." + description = <<-EOD + The name of the node pool. If not set, automatically populated by machine type and module id (unique blueprint-wide) as suffix. + If setting manually, ensure a unique value across all gke-node-pools. + EOD type = string default = null } +variable "ineternal_filed_to_be_used_by_secret_coven_of_mages_do_not_touch" { + description = "Populates with module id (unique blueprint-wide)." + type = string +} + variable "machine_type" { description = "The name of a Google Compute Engine machine type." 
type = string diff --git a/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf index d3a6076ed9..2a27bfc342 100644 --- a/modules/compute/gke-node-pool/versions.tf +++ b/modules/compute/gke-node-pool/versions.tf @@ -28,10 +28,6 @@ terraform { source = "hashicorp/null" version = "~> 3.0" } - random = { - source = "hashicorp/random" - version = "3.6.3" - } } provider_meta "google" { module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.40.0" From 5d1eed07a88f47f489c58fce02d2a99bc86d9f39 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 9 Oct 2024 20:54:04 +0000 Subject: [PATCH 076/102] Update Slurm-gcp v5 deprecation details --- .../schedmd-slurm-gcp-v5-controller/README.md | 7 +++++++ .../schedmd-slurm-gcp-v5-hybrid/README.md | 8 ++++++++ .../schedmd-slurm-gcp-v5-login/README.md | 7 +++++++ examples/README.md | 15 +++++++-------- 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index 501f2b0dba..b2a1bb503e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -1,5 +1,12 @@ ## Description +> [!NOTE] +> Slurm-gcp-v5-controller module is deprecated. See +> [this update](#completed-migration-to-slurm-gcp-v6) for specific recommendations +> and timelines. + +* [Completed Migration to Slurm-GCP v6](../../../../modules/README.md#completed-migration-to-slurm-gcp-v6) + This module creates a slurm controller node via the [SchedMD/slurm-gcp] [slurm\_controller\_instance] and [slurm\_instance\_template] modules. diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index 9822d36eab..bc58f82f9c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -1,4 +1,12 @@ ## Description + +> [!NOTE] +> Slurm-gcp-v5-hybrid module is deprecated. See +> [this update](#completed-migration-to-slurm-gcp-v6) for specific recommendations +> and timelines. + +* [Completed Migration to Slurm-GCP v6](../../../../modules/README.md#completed-migration-to-slurm-gcp-v6) + This module is a wrapper around the [slurm-controller-hybrid] module by SchedMD as part of the [slurm-gcp] github repository. The hybrid module serves to create the configurations needed to extend an on-premise slurm cluster to one with one diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index a43636cd8c..80d969ade6 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -1,5 +1,12 @@ ## Description +> [!NOTE] +> Slurm-gcp-v5-login module is deprecated. See +> [this update](#completed-migration-to-slurm-gcp-v6) for specific recommendations +> and timelines. + +* [Completed Migration to Slurm-GCP v6](../../../../modules/README.md#completed-migration-to-slurm-gcp-v6) + This module creates a login node for a Slurm cluster based on the [SchedMD/slurm-gcp] [slurm\_instance\_template] and [slurm\_login\_instance] terraform modules. 
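As noted elsewhere in this change, users who stay on the deprecated Slurm-GCP v5 modules after removal can reference specific git tags in the module source lines. A minimal sketch of such a pinned source (module IDs, repository path, and tag are illustrative, not prescribed by this patch):

```yaml
- id: slurm_login
  source: github.com/GoogleCloudPlatform/cluster-toolkit//community/modules/scheduler/schedmd-slurm-gcp-v5-login?ref=v1.37.0
  use: [network, slurm_controller]
```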
The login node is used in conjunction with the diff --git a/examples/README.md b/examples/README.md index 0275fd930c..7dec270823 100644 --- a/examples/README.md +++ b/examples/README.md @@ -2,7 +2,7 @@ > [!NOTE] > Migration to Slurm-GCP v6 is completed. See -> [this update](#ongoing-migration-to-slurm-gcp-v6) for specific recommendations +> [this update](#completed-migration-to-slurm-gcp-v6) for specific recommendations > and timelines. This directory contains a set of example blueprint files that can be fed into @@ -15,7 +15,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [Instructions](#instructions) * [(Optional) Setting up a remote terraform state](#optional-setting-up-a-remote-terraform-state) -* [Ongoing Migration to Slurm-GCP v6](#ongoing-migration-to-slurm-gcp-v6) +* [Completed Migration to Slurm-GCP v6](#completed-migration-to-slurm-gcp-v6) * [Blueprint Descriptions](#blueprint-descriptions) * [hpc-slurm-v5-legacy.yaml](#hpc-slurm-v5-legacyyaml-) ![core-badge] * [hpc-slurm.yaml](#hpc-slurmyaml-) ![core-badge] @@ -141,7 +141,7 @@ subcommands as well: [configuration block]: https://developer.hashicorp.com/terraform/language/settings/backends/configuration#using-a-backend-block [gcs]: https://developer.hashicorp.com/terraform/language/settings/backends/gcs -## Ongoing Migration to Slurm-GCP v6 +## Completed Migration to Slurm-GCP v6 [Slurm-GCP](https://github.com/GoogleCloudPlatform/slurm-gcp) is the set of scripts and tools that automate the installation, deployment, and certain @@ -150,16 +150,15 @@ Google Cloud Platform. It is recommended to use Slurm-GCP through the Cluster Toolkit where it is exposed as various modules. The Cluster Toolkit team has finished transitioning from Slurm-GCP v5 to Slurm-GCP v6 and -now Slurm-GCP v6 is the recommended option. Following this, blueprint naming would be -as follows: +as of 10/11/2024, Slurm-GCP v6 is the recommended option. Blueprint naming is as +follows: * Slurm v5: hpc-slurm-v5-legacy.yaml * Slurm v6: hpc-slurm.yaml > [!IMPORTANT] -> Three months after Slurm-gcp V6 becomes the recommended version, Slurm v5 -> modules will be marked as deprecated and will be maintained in our repo for -> another three months, at which point the modules will be removed from the Cluster +> Slurm-GCP v5 modules are now marked as deprecated and will be maintained in our +> repo till January 6, 2025. After that, the modules will be removed from the Cluster > Toolkit repo and regression tests will no longer run for V5. Those who choose > to not upgrade to V6 will still be able to use V5 modules by referencing > specific git tags in the module source lines. From f0d7803b1f0e709e584d72fe7d451183755d46ae Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Sat, 12 Oct 2024 00:31:26 +0000 Subject: [PATCH 077/102] module id variable name modified --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/main.tf | 4 ++-- modules/compute/gke-node-pool/metadata.yaml | 2 +- modules/compute/gke-node-pool/variables.tf | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 797327807c..efdcb22b95 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -298,8 +298,8 @@ limitations under the License. | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | -| [ineternal\_filed\_to\_be\_used\_by\_secret\_coven\_of\_mages\_do\_not\_touch](#input\_ineternal\_filed\_to\_be\_used\_by\_secret\_coven\_of\_mages\_do\_not\_touch) | Populates with module id (unique blueprint-wide). | `string` | n/a | yes | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | +| [internal\_ghpc\_module\_id](#input\_internal\_ghpc\_module\_id) | Populates with module id (unique blueprint-wide). | `string` | n/a | yes | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 6bd6f74274..6e16771a3a 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -34,7 +34,7 @@ locals { static_node_set = var.static_node_count != null initial_node_set = try(var.initial_node_count > 0, false) - module_unique_id = replace(lower(var.ineternal_filed_to_be_used_by_secret_coven_of_mages_do_not_touch), "/[^a-z0-9]/", "") + module_unique_id = replace(lower(var.internal_ghpc_module_id), "/[^a-z0-9\\-]/", "") } data "google_compute_default_service_account" "default_sa" { @@ -44,7 +44,7 @@ data "google_compute_default_service_account" "default_sa" { resource "google_container_node_pool" "node_pool" { provider = google-beta - name = var.name == null ? "${var.machine_type}-${local.module_unique_id}" : var.name + name = coalesce(var.name, "${var.machine_type}-${local.module_unique_id}") cluster = var.cluster_id node_locations = var.zones diff --git a/modules/compute/gke-node-pool/metadata.yaml b/modules/compute/gke-node-pool/metadata.yaml index 06ab5ae608..e980d595a2 100644 --- a/modules/compute/gke-node-pool/metadata.yaml +++ b/modules/compute/gke-node-pool/metadata.yaml @@ -18,4 +18,4 @@ spec: services: - container.googleapis.com ghpc: - inject_module_id: ineternal_filed_to_be_used_by_secret_coven_of_mages_do_not_touch + inject_module_id: internal_ghpc_module_id diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index c85e524fcc..a258cf8a47 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -39,7 +39,7 @@ variable "name" { default = null } -variable "ineternal_filed_to_be_used_by_secret_coven_of_mages_do_not_touch" { +variable "internal_ghpc_module_id" { description = "Populates with module id (unique blueprint-wide)." type = string } From e2ea01b946975e784bbc17ad60b82d4ff674d5c5 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Sat, 12 Oct 2024 21:23:04 +0000 Subject: [PATCH 078/102] module id variable decription updated --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/variables.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index efdcb22b95..d15f644b80 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -299,7 +299,7 @@ limitations under the License. | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | -| [internal\_ghpc\_module\_id](#input\_internal\_ghpc\_module\_id) | Populates with module id (unique blueprint-wide). 
| `string` | n/a | yes | +| [internal\_ghpc\_module\_id](#input\_internal\_ghpc\_module\_id) | DO NOT SET THIS MANUALLY. Automatically populates with module id (unique blueprint-wide). | `string` | n/a | yes | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default value is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or decided by GKE.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index a258cf8a47..f5f31abde0 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -40,7 +40,7 @@ variable "name" { } variable "internal_ghpc_module_id" { - description = "Populates with module id (unique blueprint-wide)." + description = "DO NOT SET THIS MANUALLY. Automatically populates with module id (unique blueprint-wide)." type = string } From d8b287953981153089071c6774b3c1b83e13becb Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Sun, 13 Oct 2024 19:29:21 +0000 Subject: [PATCH 079/102] fix comment --- examples/gke-storage-parallelstore.yaml | 4 +++- modules/file-system/gke-storage/README.md | 4 ++-- modules/file-system/gke-storage/variables.tf | 11 ++++++----- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/examples/gke-storage-parallelstore.yaml b/examples/gke-storage-parallelstore.yaml index daecc6657e..413e523da7 100644 --- a/examples/gke-storage-parallelstore.yaml +++ b/examples/gke-storage-parallelstore.yaml @@ -24,7 +24,7 @@ vars: authorized_cidr: /32 deployment_groups: -- group: primary +- group: setup modules: - id: network source: modules/network/vpc @@ -43,6 +43,8 @@ deployment_groups: settings: prefix_length: 24 +- group: primary + modules: - id: gke_cluster source: modules/scheduler/gke-cluster use: [network] diff --git a/modules/file-system/gke-storage/README.md b/modules/file-system/gke-storage/README.md index c578a4a0d8..17c718aa37 100644 --- a/modules/file-system/gke-storage/README.md +++ b/modules/file-system/gke-storage/README.md @@ -5,7 +5,7 @@ to dynamically provision GCP storage resources like Parallelstore. ### Example -The following example uses the `gke-storage` module to creates a Parallelstore Storage Class and Peresistent Volume Claim, +The following example uses the `gke-storage` module to creates a Parallelstore Storage Class and Persistent Volume Claim, then use them in a `gke-job-template` to dynamically provision the resource. ```yaml @@ -119,7 +119,7 @@ No resources. | [sc\_reclaim\_policy](#input\_sc\_reclaim\_policy) | Indicate whether to keep the dynamically provisioned PersistentVolumes of this storage class after the bound PersistentVolumeClaim is deleted.
[More details about reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming)
Supported values:
- Retain
- Delete | `string` | n/a | yes | | [sc\_topology\_zones](#input\_sc\_topology\_zones) | Zone location that allow the volumes to be dynamically provisioned. | `list(string)` | `null` | no | | [sc\_volume\_binding\_mode](#input\_sc\_volume\_binding\_mode) | Indicates when volume binding and dynamic provisioning should occur and how PersistentVolumeClaims should be provisioned and bound.
Supported values:
- Immediate
- WaitForFirstConsumer | `string` | `"WaitForFirstConsumer"` | no | -| [storage\_type](#input\_storage\_type) | The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview)
to be used. This module currently supports dynamic provisioning for the following storage options:
- Parallelstore
- Hyperdisk-balanced
- Hyperdisk-throughput
- Hyperdisk-extreme | `string` | n/a | yes | +| [storage\_type](#input\_storage\_type) | The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview)
to be used. This module currently supports dynamic provisioning for the following storage options:
- Parallelstore | `string` | n/a | yes | ## Outputs diff --git a/modules/file-system/gke-storage/variables.tf b/modules/file-system/gke-storage/variables.tf index 97ff1af21b..9ad3b839d8 100644 --- a/modules/file-system/gke-storage/variables.tf +++ b/modules/file-system/gke-storage/variables.tf @@ -34,15 +34,12 @@ variable "storage_type" { The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview) to used. This module currently support dynamic provisioning for the below storage options - Parallelstore - - Hyperdisk-balanced - - Hyperdisk-throughput - - Hyperdisk-extreme EOT type = string nullable = false validation { - condition = var.storage_type == null ? false : contains(["parallelstore", "hyperdisk-balanced", "hyperdisk-throughput", "hyperdisk-extreme"], lower(var.storage_type)) - error_message = "Allowed string values for var.storage_type are \"Parallelstore\", \"Hyperdisk-balanced\", \"Hyperdisk-throughput\", \"Hyperdisk-extreme\"." + condition = var.storage_type == null ? false : contains(["parallelstore"], lower(var.storage_type)) + error_message = "Allowed string values for var.storage_type are \"Parallelstore\"." } } @@ -110,6 +107,10 @@ variable "pv_mount_path" { description = "Path within the container at which the volume should be mounted. Must not contain ':'." type = string default = "/data" + validation { + condition = var.pv_mount_path == null ? true : !strcontains(var.pv_mount_path, ":") + error_message = "pv_mount_path must not contain ':', please correct it and retry" + } } variable "mount_options" { From b0217dece2627de4d6c28c9162fa2810d1142279 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 9 Oct 2024 20:28:19 +0000 Subject: [PATCH 080/102] Mark slurm-gcp v5 version of blueprints as deprecated --- examples/README.md | 40 ++++++++++++++++++++-------------------- modules/README.md | 22 +++++++++++----------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/examples/README.md b/examples/README.md index 7dec270823..9bd2c60b1c 100644 --- a/examples/README.md +++ b/examples/README.md @@ -17,36 +17,36 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [(Optional) Setting up a remote terraform state](#optional-setting-up-a-remote-terraform-state) * [Completed Migration to Slurm-GCP v6](#completed-migration-to-slurm-gcp-v6) * [Blueprint Descriptions](#blueprint-descriptions) - * [hpc-slurm-v5-legacy.yaml](#hpc-slurm-v5-legacyyaml-) ![core-badge] + * [hpc-slurm-v5-legacy.yaml](#hpc-slurm-v5-legacyyaml--) ![core-badge] ![deprecated-badge] * [hpc-slurm.yaml](#hpc-slurmyaml-) ![core-badge] - * [hpc-enterprise-slurm-v5-legacy.yaml](#hpc-enterprise-slurm-v5-legacyyaml-) ![core-badge] + * [hpc-enterprise-slurm-v5-legacy.yaml](#hpc-enterprise-slurm-v5-legacyyaml--) ![core-badge] ![deprecated-badge] * [hpc-enterprise-slurm.yaml](#hpc-enterprise-slurmyaml-) ![core-badge] * [hpc-slurm-static.yaml](#hpc-slurm-staticyaml-) ![core-badge] * [hpc-slurm6-tpu.yaml](#hpc-slurm6-tpuyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm6-tpu-maxtext.yaml](#hpc-slurm6-tpu-maxtextyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm6-apptainer.yaml](#hpc-slurm6-apptaineryaml--) ![community-badge] ![experimental-badge] - * [ml-slurm-v5-legacy.yaml](#ml-slurm-v5-legacyyaml-) ![core-badge] + * [ml-slurm-v5-legacy.yaml](#ml-slurm-v5-legacyyaml--) ![core-badge] ![deprecated-badge] * [ml-slurm.yaml](#ml-slurmyaml-) ![core-badge] - * [image-builder-v5-legacy.yaml](#image-builder-v5-legacyyaml-) 
![core-badge] + * [image-builder-v5-legacy.yaml](#image-builder-v5-legacyyaml--) ![core-badge] ![deprecated-badge] * [image-builder.yaml](#image-builderyaml--) ![core-badge] * [serverless-batch.yaml](#serverless-batchyaml-) ![core-badge] * [serverless-batch-mpi.yaml](#serverless-batch-mpiyaml-) ![core-badge] * [pfs-lustre.yaml](#pfs-lustreyaml-) ![core-badge] * [ps-slurm.yaml](#ps-slurmyaml--) ![core-badge] ![experimental-badge] * [pfs-parallelstore.yaml](#pfs-parallelstoreyaml--) ![core-badge] ![experimental-badge] - * [cae-slurm-v5-legacy.yaml](#cae-slurm-v5-legacyyaml-) ![core-badge] + * [cae-slurm-v5-legacy.yaml](#cae-slurm-v5-legacyyaml--) ![core-badge] ![deprecated-badge] * [cae-slurm.yaml](#cae-slurmyaml-) ![core-badge] * [hpc-build-slurm-image.yaml](#hpc-build-slurm-imageyaml--) ![community-badge] ![experimental-badge] - * [hpc-slurm-ubuntu2004-v5-legacy.yaml](#hpc-slurm-ubuntu2004-v5-legacyyaml-) ![community-badge] + * [hpc-slurm-ubuntu2004-v5-legacy.yaml](#hpc-slurm-ubuntu2004-v5-legacyyaml--) ![community-badge] ![deprecated-badge] * [hpc-slurm-ubuntu2004.yaml](#hpc-slurm-ubuntu2004yaml--) ![community-badge] * [pfs-daos.yaml](#pfs-daosyaml-) ![community-badge] * [hpc-slurm-daos.yaml](#hpc-slurm-daosyaml-) ![community-badge] - * [hpc-amd-slurm-v5-legacy.yaml](#hpc-amd-slurm-v5-legacyyaml-) ![community-badge] + * [hpc-amd-slurm-v5-legacy.yaml](#hpc-amd-slurm-v5-legacyyaml--) ![community-badge] ![deprecated-badge] * [hpc-amd-slurm.yaml](#hpc-amd-slurmyaml-) ![community-badge] * [hpc-slurm-sharedvpc.yaml](#hpc-slurm-sharedvpcyaml--) ![community-badge] ![experimental-badge] * [client-google-cloud-storage.yaml](#client-google-cloud-storageyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm-gromacs.yaml](#hpc-slurm-gromacsyaml--) ![community-badge] ![experimental-badge] - * [hpc-slurm-local-ssd-v5-legacy.yaml](#hpc-slurm-local-ssd-v5-legacyyaml--) ![community-badge] ![experimental-badge] + * [hpc-slurm-local-ssd-v5-legacy.yaml](#hpc-slurm-local-ssd-v5-legacyyaml---) ![community-badge] ![experimental-badge] ![deprecated-badge] * [hpc-slurm-local-ssd.yaml](#hpc-slurm-local-ssdyaml--) ![community-badge] ![experimental-badge] * [hcls-blueprint.yaml](#hcls-blueprintyaml-) ![core-badge] * [hpc-gke.yaml](#hpc-gkeyaml--) ![core-badge] ![experimental-badge] @@ -54,14 +54,14 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [storage-gke](#storage-gkeyaml--) ![core-badge] ![experimental-badge] * [gke-a3-megagpu](#gke-a3-megagpuyaml--) ![core-badge] ![experimental-badge] * [gke-a3-highgpu](#gke-a3-highgpuyaml--) ![core-badge] ![experimental-badge] - * [htc-slurm-v5-legacy.yaml](#htc-slurm-v5-legacyyaml--) ![community-badge] ![experimental-badge] + * [htc-slurm-v5-legacy.yaml](#htc-slurm-v5-legacyyaml---) ![community-badge] ![experimental-badge] ![deprecated-badge] * [htc-slurm.yaml](#htc-slurmyaml-) ![community-badge] * [htc-htcondor.yaml](#htc-htcondoryaml--) ![community-badge] ![experimental-badge] * [fsi-montecarlo-on-batch.yaml](#fsi-montecarlo-on-batchyaml-) ![community-badge] ![experimental-badge] * [tutorial-starccm-slurm.yaml](#tutorial-starccm-slurmyaml--) ![community-badge] ![experimental-badge] * [tutorial-starccm.yaml](#tutorial-starccmyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm-ramble-gromacs.yaml](#hpc-slurm-ramble-gromacsyaml--) ![community-badge] ![experimental-badge] - * [hpc-slurm-chromedesktop-v5-legacy.yaml](#hpc-slurm-chromedesktop-v5-legacyyaml--) ![community-badge] ![experimental-badge] + * 
[hpc-slurm-chromedesktop-v5-legacy.yaml](#hpc-slurm-chromedesktop-v5-legacyyaml---) ![community-badge] ![experimental-badge] ![deprecated-badge] * [flux-cluster](#flux-clusteryaml--) ![community-badge] ![experimental-badge] * [tutorial-fluent.yaml](#tutorial-fluentyaml--) ![community-badge] ![experimental-badge] * [omnia-cluster.yaml](#omnia-clusteryaml---) ![community-badge] ![experimental-badge] ![deprecated-badge] @@ -210,7 +210,7 @@ Toolkit team, partners, etc.) and are labeled with the community badge Blueprints that are still in development and less stable are also labeled with the experimental badge (![experimental-badge]). -### [hpc-slurm-v5-legacy.yaml] ![core-badge] +### [hpc-slurm-v5-legacy.yaml] ![core-badge] ![deprecated-badge] > **Warning**: The variables `enable_reconfigure`, > `enable_cleanup_compute`, and `enable_cleanup_subscriptions`, if set to @@ -319,7 +319,7 @@ For this example the following is needed in the selected region: * Compute Engine API: Resource policies: **one for each job in parallel** - _only needed for the `compute` partition_ -### [hpc-enterprise-slurm-v5-legacy.yaml] ![core-badge] +### [hpc-enterprise-slurm-v5-legacy.yaml] ![core-badge] ![deprecated-badge] This advanced blueprint creates a cluster with Slurm with several performance tunings enabled, along with tiered file systems for higher performance. Some of @@ -551,7 +551,7 @@ This blueprint creates a custom [Apptainer](https:https://apptainer.org) enabled [hpc-slurm6-apptainer.yaml]: ../community/examples/hpc-slurm6-apptainer.yaml -### [ml-slurm-v5-legacy.yaml] ![core-badge] +### [ml-slurm-v5-legacy.yaml] ![core-badge] ![deprecated-badge] This blueprint provisions an HPC cluster running the Slurm scheduler with the machine learning frameworks PyTorch and TensorFlow pre-installed on every @@ -649,7 +649,7 @@ timestamp for uniqueness. [ml-slurm.yaml]: ../examples/ml-slurm.yaml -### [image-builder-v5-legacy.yaml] ![core-badge] +### [image-builder-v5-legacy.yaml] ![core-badge] ![deprecated-badge] This blueprint uses the [Packer template module][pkr] to create a custom VM image and uses it to provision an HPC cluster using the Slurm scheduler. By @@ -1058,7 +1058,7 @@ For this example the following is needed in the selected region: [pfs-parallelstore.yaml]: ./pfs-parallelstore.yaml [Parallelstore]: ../modules/file-system/parallelstore/README.md -### [cae-slurm-v5-legacy.yaml] ![core-badge] +### [cae-slurm-v5-legacy.yaml] ![core-badge] ![deprecated-badge] The Computer Aided Engineering (CAE) blueprint captures a reference architecture where the right cloud components are assembled to optimally cater to the @@ -1143,7 +1143,7 @@ The blueprint contains 3 groups: [hpc-build-slurm-image.yaml]: ../community/examples/hpc-build-slurm-image.yaml -### [hpc-slurm-ubuntu2004-v5-legacy.yaml] ![community-badge] +### [hpc-slurm-ubuntu2004-v5-legacy.yaml] ![community-badge] ![deprecated-badge] > **Warning**: The variables `enable_reconfigure`, > `enable_cleanup_compute`, and `enable_cleanup_subscriptions`, if set to @@ -1230,7 +1230,7 @@ examples][intel-examples-readme]. [hpc-slurm-daos.yaml]: ../community/examples/intel/hpc-slurm-daos.yaml -### [hpc-amd-slurm-v5-legacy.yaml] ![community-badge] +### [hpc-amd-slurm-v5-legacy.yaml] ![community-badge] ![deprecated-badge] This example provisions a Slurm cluster using AMD VM machine types. It automates the initial setup of Spack, including a script that can be used to @@ -1398,7 +1398,7 @@ the nodes are provisioned. All nodes mount a filestore instance on `/home`. 
[omnia-github]: https://github.com/dellhpc/omnia [omnia-cluster.yaml]: ../community/examples/omnia-cluster.yaml -### [hpc-slurm-local-ssd-v5-legacy.yaml] ![community-badge] ![experimental-badge] +### [hpc-slurm-local-ssd-v5-legacy.yaml] ![community-badge] ![experimental-badge] ![deprecated-badge] This blueprint demonstrates the use of Slurm and Filestore, with the definition of a partition which deploys compute nodes that have local ssd drives deployed. @@ -1594,7 +1594,7 @@ walks through the use of this blueprint. [htc-htcondor.yaml]: ../community/examples/htc-htcondor.yaml [hpcvmimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm -### [htc-slurm-v5-legacy.yaml] ![community-badge] ![experimental-badge] +### [htc-slurm-v5-legacy.yaml] ![community-badge] ![experimental-badge] ![deprecated-badge] This blueprint provisions a cluster using the Slurm scheduler in a configuration tuned for the execution of many short-duration, loosely-coupled (non-MPI) jobs. @@ -1659,7 +1659,7 @@ tutorial. [tutorial-fluent.yaml]: ../community/examples/tutorial-fluent.yaml -### [hpc-slurm-chromedesktop-v5-legacy.yaml] ![community-badge] ![experimental-badge] +### [hpc-slurm-chromedesktop-v5-legacy.yaml] ![community-badge] ![experimental-badge] ![deprecated-badge] This example shows how to use the `chrome-remote-desktop` module with a Slurm partition to be able to `salloc` a GPU accelerated remote desktop. diff --git a/modules/README.md b/modules/README.md index defba11446..c562c111cd 100644 --- a/modules/README.md +++ b/modules/README.md @@ -35,17 +35,17 @@ Modules that are still in development and less stable are labeled with the ### Compute * **[vm-instance]** ![core-badge] : Creates one or more VM instances. -* **[schedmd-slurm-gcp-v5-partition]** ![community-badge] : +* **[schedmd-slurm-gcp-v5-partition]** ![community-badge] ![deprecated-badge] : Creates a partition to be used by a [slurm-controller][schedmd-slurm-gcp-v5-controller]. -* **[schedmd-slurm-gcp-v5-node-group]** ![community-badge] : +* **[schedmd-slurm-gcp-v5-node-group]** ![community-badge] ![deprecated-badge]: Creates a node group to be used by the [schedmd-slurm-gcp-v5-partition] module. -* **[schedmd-slurm-gcp-v6-partition]** ![community-badge] ![experimental-badge]: +* **[schedmd-slurm-gcp-v6-partition]** ![core-badge] : Creates a partition to be used by a [slurm-controller][schedmd-slurm-gcp-v6-controller]. -* **[schedmd-slurm-gcp-v6-nodeset]** ![community-badge] ![experimental-badge]: +* **[schedmd-slurm-gcp-v6-nodeset]** ![core-badge] : Creates a nodeset to be used by the [schedmd-slurm-gcp-v6-partition] module. -* **[schedmd-slurm-gcp-v6-nodeset-tpu]** ![community-badge] ![experimental-badge]: +* **[schedmd-slurm-gcp-v6-nodeset-tpu]** ![core-badge] : Creates a TPU nodeset to be used by the [schedmd-slurm-gcp-v6-partition] module. -* **[schedmd-slurm-gcp-v6-nodeset-dynamic]** ![community-badge] ![experimental-badge]: +* **[schedmd-slurm-gcp-v6-nodeset-dynamic]** ![core-badge] ![experimental-badge]: Creates a dynamic nodeset to be used by the [schedmd-slurm-gcp-v6-partition] module and instance template. * **[gke-node-pool]** ![core-badge] ![experimental-badge] : Creates a Kubernetes node pool using GKE. @@ -194,15 +194,15 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca * **[gke-cluster]** ![core-badge] ![experimental-badge] : Creates a Kubernetes cluster using GKE. * **[pre-existing-gke-cluster]** ![core-badge] ![experimental-badge] : Retrieves an existing GKE cluster. 
Substitute for ([gke-cluster]) module. -* **[schedmd-slurm-gcp-v5-controller]** ![community-badge] : +* **[schedmd-slurm-gcp-v5-controller]** ![community-badge] ![deprecated-badge] : Creates a Slurm controller node using [slurm-gcp-version-5]. -* **[schedmd-slurm-gcp-v5-login]** ![community-badge] : +* **[schedmd-slurm-gcp-v5-login]** ![community-badge] ![deprecated-badge] : Creates a Slurm login node using [slurm-gcp-version-5]. -* **[schedmd-slurm-gcp-v5-hybrid]** ![community-badge] ![experimental-badge] : +* **[schedmd-slurm-gcp-v5-hybrid]** ![community-badge] ![experimental-badge] ![deprecated-badge] : Creates hybrid Slurm partition configuration files using [slurm-gcp-version-5]. -* **[schedmd-slurm-gcp-v6-controller]** ![community-badge] ![experimental-badge]: +* **[schedmd-slurm-gcp-v6-controller]** ![core-badge] : Creates a Slurm controller node using [slurm-gcp-version-6]. -* **[schedmd-slurm-gcp-v6-login]** ![community-badge] ![experimental-badge]: +* **[schedmd-slurm-gcp-v6-login]** ![core-badge] : Creates a Slurm login node using [slurm-gcp-version-6]. * **[htcondor-setup]** ![community-badge] ![experimental-badge] : Creates the base infrastructure for an HTCondor pool (service accounts and Cloud Storage bucket). From 7f21690841f3247d2711e3c9df4f7cbbcc6c2775 Mon Sep 17 00:00:00 2001 From: Akiki Liang Date: Thu, 10 Oct 2024 18:39:29 +0000 Subject: [PATCH 081/102] Update a3-high NeMo version 23.11 to 24.07 --- .../machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile | 4 ++-- .../machine-learning/a3-highgpu-8g/nemo-framework/README.md | 4 ++-- .../a3-highgpu-8g/nemo-framework/setup_nemo.sh | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile b/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile index 3f6196a45e..c693c2d7cb 100644 --- a/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile +++ b/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG NEMOFW_VERSION=23.11 -FROM nvcr.io/nvidia/nemo:${NEMOFW_VERSION}.framework +ARG NEMOFW_VERSION=24.07 +FROM nvcr.io/nvidia/nemo:${NEMOFW_VERSION} ENV USE_TCPX=yes ENV NCCL_NET=GPUDirectTCPX_v7 diff --git a/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md b/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md index 37ecdd9600..4440dff882 100644 --- a/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md +++ b/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md @@ -3,7 +3,7 @@ README 1. Set up NeMo Framework Container - This makes a few environment variable modifications to the [nvcr.io/nvidia/nemo:23.11.framework](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) + This makes a few environment variable modifications to the [nvcr.io/nvidia/nemo:24.07](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) container, and submits a Slurm job to copy the framework launcher scripts and a few other auxiliary files into your working directory. 
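As a rough sketch of that setup step (assuming `setup_nemo.sh` is submitted with `sbatch` from the working directory on the cluster; names follow the script and Dockerfile in this patch):

```bash
# Sketch only: submit the build/import job defined in setup_nemo.sh and wait
# for it to finish. The job builds the NeMo container, imports it with enroot,
# and copies the launcher files into the submission directory.
sbatch setup_nemo.sh
squeue --me   # wait until the job has completed

# Artifacts expected in the working directory afterwards:
#   nemofw+tcpx-24.07.sqsh                 - enroot image imported from the local build
#   launcher_scripts/  auto_configurator/  - copied out of the container
#   requirements.txt
ls nemofw+tcpx-24.07.sqsh requirements.txt launcher_scripts auto_configurator
```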
@@ -45,7 +45,7 @@ README launcher_scripts_path=${PWD} \ stages=[training] \ env_vars.TRANSFORMERS_OFFLINE=0 \ - container=../nemofw+tcpx-23.11.sqsh \ + container=../nemofw+tcpx-24.07.sqsh \ container_mounts=[${HOME}/.cache,"/var/lib/tcpx/lib64","/run/tcpx-\${SLURM_JOB_ID}:/run/tcpx"] \ cluster.srun_args=["--container-writable"] \ training.model.data.data_impl=mock \ diff --git a/examples/machine-learning/a3-highgpu-8g/nemo-framework/setup_nemo.sh b/examples/machine-learning/a3-highgpu-8g/nemo-framework/setup_nemo.sh index 008c0f21c2..5692b0342b 100644 --- a/examples/machine-learning/a3-highgpu-8g/nemo-framework/setup_nemo.sh +++ b/examples/machine-learning/a3-highgpu-8g/nemo-framework/setup_nemo.sh @@ -18,7 +18,7 @@ #SBATCH --partition=a3 #SBATCH --exclusive -: "${NEMOFW_VERSION:=23.11}" +: "${NEMOFW_VERSION:=24.07}" srun docker build --build-arg="NEMOFW_VERSION=${NEMOFW_VERSION}" -t nemofw:tcpx-"${NEMOFW_VERSION}" . srun rm -f nemofw+tcpx-"${NEMOFW_VERSION}".sqsh @@ -27,4 +27,4 @@ srun enroot import dockerd://nemofw:tcpx-"${NEMOFW_VERSION}" srun \ --container-mounts="${PWD}":/workspace/mount_dir,/var/tmp:/var/tmp \ --container-image=./nemofw+tcpx-"${NEMOFW_VERSION}".sqsh \ - bash -c "cp -r /opt/NeMo-Megatron-Launcher/requirements.txt /opt/NeMo-Megatron-Launcher/launcher_scripts /opt/NeMo-Megatron-Launcher/auto_configurator /workspace/mount_dir/" + bash -c "cp -r /opt/NeMo-Framework-Launcher/requirements.txt /opt/NeMo-Framework-Launcher/launcher_scripts /opt/NeMo-Framework-Launcher/auto_configurator /workspace/mount_dir/" From b2c0de6e3af73ab82cd15c7e67bff9bc690ba73e Mon Sep 17 00:00:00 2001 From: Akiki Liang Date: Mon, 14 Oct 2024 19:25:20 +0000 Subject: [PATCH 082/102] update container_mounts to work with hydra quote rules --- .../machine-learning/a3-highgpu-8g/nemo-framework/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md b/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md index 4440dff882..9eb9252106 100644 --- a/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md +++ b/examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md @@ -46,7 +46,7 @@ README stages=[training] \ env_vars.TRANSFORMERS_OFFLINE=0 \ container=../nemofw+tcpx-24.07.sqsh \ - container_mounts=[${HOME}/.cache,"/var/lib/tcpx/lib64","/run/tcpx-\${SLURM_JOB_ID}:/run/tcpx"] \ + container_mounts='['${HOME}/.cache',"/var/lib/tcpx/lib64","/run/tcpx-\${SLURM_JOB_ID}:/run/tcpx"]' \ cluster.srun_args=["--container-writable"] \ training.model.data.data_impl=mock \ training.model.data.data_prefix=[] \ From 64646bcd885a659de2db4d2bb02c40fbdd654130 Mon Sep 17 00:00:00 2001 From: Akiki Liang Date: Mon, 14 Oct 2024 19:28:38 +0000 Subject: [PATCH 083/102] update a3-high scripts with latest recommended values --- .../a3-highgpu-8g/nccl-tests/run-nccl-tests.sh | 8 +++----- .../nccl-tests/run-topological-nccl-tests.sh | 8 +++----- .../a3-highgpu-8g/nemo-framework/Dockerfile | 9 ++++----- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/examples/machine-learning/a3-highgpu-8g/nccl-tests/run-nccl-tests.sh b/examples/machine-learning/a3-highgpu-8g/nccl-tests/run-nccl-tests.sh index cbc80a3763..988dc5df3e 100644 --- a/examples/machine-learning/a3-highgpu-8g/nccl-tests/run-nccl-tests.sh +++ b/examples/machine-learning/a3-highgpu-8g/nccl-tests/run-nccl-tests.sh @@ -51,8 +51,6 @@ if [[ ${USE_TCPX} = "yes" ]]; then export NCCL_PROTO=Simple export NCCL_NSOCKS_PERTHREAD=4 export NCCL_SOCKET_NTHREADS=1 - 
export NCCL_MAX_NCHANNELS=12 - export NCCL_MIN_NCHANNELS=12 export NCCL_DYNAMIC_CHUNK_SIZE=524288 export NCCL_P2P_NET_CHUNKSIZE=524288 export NCCL_P2P_PCI_CHUNKSIZE=524288 @@ -62,9 +60,9 @@ if [[ ${USE_TCPX} = "yes" ]]; then export NCCL_NET_GDR_LEVEL=PIX export NCCL_P2P_PXN_LEVEL=0 export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=${UDS_PATH} - export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=1000000 - export NCCL_GPUDIRECTTCPX_FORCE_ACK=0 - export NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000 + export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 + export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177" + export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191" export LD_LIBRARY_PATH=/var/lib/tcpx/lib64:${LD_LIBRARY_PATH} else diff --git a/examples/machine-learning/a3-highgpu-8g/nccl-tests/run-topological-nccl-tests.sh b/examples/machine-learning/a3-highgpu-8g/nccl-tests/run-topological-nccl-tests.sh index 4177a3e184..d42cda9404 100644 --- a/examples/machine-learning/a3-highgpu-8g/nccl-tests/run-topological-nccl-tests.sh +++ b/examples/machine-learning/a3-highgpu-8g/nccl-tests/run-topological-nccl-tests.sh @@ -52,8 +52,6 @@ if [[ ${USE_TCPX} = "yes" ]]; then export NCCL_PROTO=Simple export NCCL_NSOCKS_PERTHREAD=4 export NCCL_SOCKET_NTHREADS=1 - export NCCL_MAX_NCHANNELS=12 - export NCCL_MIN_NCHANNELS=12 export NCCL_DYNAMIC_CHUNK_SIZE=524288 export NCCL_P2P_NET_CHUNKSIZE=524288 export NCCL_P2P_PCI_CHUNKSIZE=524288 @@ -63,9 +61,9 @@ if [[ ${USE_TCPX} = "yes" ]]; then export NCCL_NET_GDR_LEVEL=PIX export NCCL_P2P_PXN_LEVEL=0 export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=${UDS_PATH} - export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=1000000 - export NCCL_GPUDIRECTTCPX_FORCE_ACK=0 - export NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000 + export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 + export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177" + export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191" export LD_LIBRARY_PATH=/var/lib/tcpx/lib64:${LD_LIBRARY_PATH} else diff --git a/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile b/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile index c693c2d7cb..a4264709bc 100644 --- a/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile +++ b/examples/machine-learning/a3-highgpu-8g/nemo-framework/Dockerfile @@ -25,8 +25,6 @@ ENV NCCL_ALGO=Ring ENV NCCL_PROTO=Simple ENV NCCL_NSOCKS_PERTHREAD=4 ENV NCCL_SOCKET_NTHREADS=1 -ENV NCCL_MAX_NCHANNELS=12 -ENV NCCL_MIN_NCHANNELS=12 ENV NCCL_DYNAMIC_CHUNK_SIZE=524288 ENV NCCL_P2P_NET_CHUNKSIZE=524288 ENV NCCL_P2P_PCI_CHUNKSIZE=524288 @@ -36,9 +34,10 @@ ENV CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ENV NCCL_NET_GDR_LEVEL=PIX ENV NCCL_P2P_PXN_LEVEL=0 ENV NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=/run/tcpx -ENV NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=1000000 -ENV NCCL_GPUDIRECTTCPX_FORCE_ACK=0 -ENV NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000 +ENV NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 +ENV NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177" +ENV 
NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191" + RUN echo "/var/lib/tcpx/lib64" >> /etc/ld.so.conf.d/tcpx.conf && ldconfig ENV LD_LIBRARY_PATH=/var/lib/tcpx/lib64:$LD_LIBRARY_PATH From d32ca70d4ac176925b7f6642e343dcf411c386d0 Mon Sep 17 00:00:00 2001 From: Akiki Liang Date: Mon, 14 Oct 2024 19:30:00 +0000 Subject: [PATCH 084/102] mount /var/tmp for build nccl tests --- .../a3-highgpu-8g/nccl-tests/build-nccl-tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/machine-learning/a3-highgpu-8g/nccl-tests/build-nccl-tests.sh b/examples/machine-learning/a3-highgpu-8g/nccl-tests/build-nccl-tests.sh index 6d669de4a9..11f18eee21 100644 --- a/examples/machine-learning/a3-highgpu-8g/nccl-tests/build-nccl-tests.sh +++ b/examples/machine-learning/a3-highgpu-8g/nccl-tests/build-nccl-tests.sh @@ -25,7 +25,7 @@ set -x CONTAINER_IMAGE=./nvidia+pytorch+23.10-py3.sqsh # Install nccl-tests using openmpi from within pytorch container -srun --container-mounts="$PWD:/nccl" \ +srun --container-mounts="$PWD:/nccl,/var/tmp:/var/tmp" \ --container-image=${CONTAINER_IMAGE} \ --container-name="nccl" \ bash -c " From 6f2bc8ab63ee3f3c76f44eb10952678a45be883f Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Fri, 11 Oct 2024 18:04:24 +0000 Subject: [PATCH 085/102] Add mount parallelstore service to mount parallelstore for every reboot --- .../parallelstore/scripts/mount-daos.sh | 25 ++++++++++++++++++- .../scripts/mount-daos.sh | 25 ++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/modules/file-system/parallelstore/scripts/mount-daos.sh b/modules/file-system/parallelstore/scripts/mount-daos.sh index 2b09f2e6d4..bb64c9a4d3 100644 --- a/modules/file-system/parallelstore/scripts/mount-daos.sh +++ b/modules/file-system/parallelstore/scripts/mount-daos.sh @@ -48,6 +48,7 @@ if { [ "${OS_ID}" = "rocky" ] || [ "${OS_ID}" = "rhel" ]; } && { [ "${OS_VERSION mkdir -p /var/log/daos_agent chown daos_agent:daos_agent /var/log/daos_agent sed -i "s/#.*log_file:.*/log_file: \/var\/log\/daos_agent\/daos_agent.log/g" $daos_config + systemctl enable daos_agent.service systemctl start daos_agent.service elif { [ "${OS_ID}" = "ubuntu" ] && [ "${OS_VERSION}" = "22.04" ]; } || { [ "${OS_ID}" = "debian" ] && [ "${OS_VERSION_MAJOR}" = "12" ]; }; then mkdir -p /var/run/daos_agent @@ -73,7 +74,7 @@ for i in {1..10}; do # shellcheck disable=SC2086 dfuse -m "$local_mount" --pool default-pool --container default-container --multi-user $mount_options && break - echo "dfuse failed, retrying in 1 seconds (attempt $i/5)..." + echo "dfuse failed, retrying in 1 seconds (attempt $i/10)..." sleep 1 done @@ -81,4 +82,26 @@ if ! 
mountpoint -q "$local_mount"; then exit 1 fi +# Store the mounting logic in a variable +mount_command='for i in {1..10}; do /bin/dfuse -m '$local_mount' --pool default-pool --container default-container --multi-user '$mount_options' --foreground && break; echo \"dfuse, failed, retrying in 1 second (attempt '$i'/10)\"; sleep 1; done' + +# --- Begin: Add systemd service creation --- +cat >/usr/lib/systemd/system/mount_parallelstore.service </usr/lib/systemd/system/mount_parallelstore.service < Date: Mon, 14 Oct 2024 23:35:50 +0000 Subject: [PATCH 086/102] Create and use non-default service accounts in GKE --- examples/gke-a3-highgpu.yaml | 16 ++++- examples/gke-a3-megagpu.yaml | 16 ++++- examples/hpc-gke.yaml | 16 ++++- examples/ml-gke.yaml | 17 +++++- examples/storage-gke.yaml | 30 +++++++++- modules/compute/gke-node-pool/README.md | 7 --- modules/compute/gke-node-pool/main.tf | 45 -------------- modules/scheduler/gke-cluster/README.md | 8 +-- modules/scheduler/gke-cluster/main.tf | 48 ++------------- .../daily-tests/blueprints/ml-gke-e2e.yaml | 58 +++++++++++++++++-- .../daily-tests/builds/gke-a3-megagpu.yaml | 1 + .../daily-tests/builds/gke-storage.yaml | 1 + tools/cloud-build/daily-tests/builds/gke.yaml | 1 + .../daily-tests/builds/ml-gke-e2e.yaml | 1 + .../daily-tests/builds/ml-gke.yaml | 1 + 15 files changed, 148 insertions(+), 118 deletions(-) diff --git a/examples/gke-a3-highgpu.yaml b/examples/gke-a3-highgpu.yaml index 44f5a8ff33..f7f4018b0d 100644 --- a/examples/gke-a3-highgpu.yaml +++ b/examples/gke-a3-highgpu.yaml @@ -40,6 +40,18 @@ deployment_groups: - range_name: services ip_cidr_range: 10.0.32.0/20 + - id: gke_service_account + source: community/modules/project/service-account + settings: + name: gke-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: gpunets source: modules/network/multivpc settings: @@ -50,7 +62,7 @@ deployment_groups: - id: gke_cluster source: modules/scheduler/gke-cluster - use: [network1, gpunets] + use: [network1, gpunets, gke_service_account] settings: enable_private_endpoint: false # Allows for access from authorized public IPs master_authorized_networks: @@ -60,7 +72,7 @@ deployment_groups: - id: a3_highgpu_pool source: modules/compute/gke-node-pool - use: [gke_cluster, gpunets] + use: [gke_cluster, gpunets, gke_service_account] settings: machine_type: a3-highgpu-8g autoscaling_total_min_nodes: 2 diff --git a/examples/gke-a3-megagpu.yaml b/examples/gke-a3-megagpu.yaml index 56ea759b5d..30edb3974c 100644 --- a/examples/gke-a3-megagpu.yaml +++ b/examples/gke-a3-megagpu.yaml @@ -40,6 +40,18 @@ deployment_groups: - range_name: services ip_cidr_range: 10.0.32.0/20 + - id: gke_service_account + source: community/modules/project/service-account + settings: + name: gke-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: gpunets source: modules/network/multivpc settings: @@ -50,7 +62,7 @@ deployment_groups: - id: gke_cluster source: modules/scheduler/gke-cluster - use: [network1, gpunets] + use: [network1, gpunets, gke_service_account] settings: enable_private_endpoint: false # Allows for access from authorized public IPs master_authorized_networks: @@ -60,7 +72,7 @@ deployment_groups: - id: a3_megagpu_pool source: modules/compute/gke-node-pool - use: [gke_cluster, gpunets] + use: 
[gke_cluster, gpunets, gke_service_account] settings: machine_type: a3-megagpu-8g autoscaling_total_min_nodes: 2 diff --git a/examples/hpc-gke.yaml b/examples/hpc-gke.yaml index dccdee033b..f927fd8169 100644 --- a/examples/hpc-gke.yaml +++ b/examples/hpc-gke.yaml @@ -35,16 +35,28 @@ deployment_groups: - range_name: services ip_cidr_range: 10.0.32.0/20 + - id: gke_service_account + source: community/modules/project/service-account + settings: + name: gke-service-account + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: gke_cluster source: modules/scheduler/gke-cluster - use: [network1] + use: [network1, gke_service_account] settings: enable_private_endpoint: false # Allows for access from authorized public IPs outputs: [instructions] - id: compute_pool source: modules/compute/gke-node-pool - use: [gke_cluster] + use: [gke_cluster, gke_service_account] - id: job-template source: modules/compute/gke-job-template diff --git a/examples/ml-gke.yaml b/examples/ml-gke.yaml index 5aedd354fb..cbce0a6c1a 100644 --- a/examples/ml-gke.yaml +++ b/examples/ml-gke.yaml @@ -40,19 +40,32 @@ deployment_groups: - range_name: services ip_cidr_range: 10.0.32.0/20 + - id: gke_service_account + source: community/modules/project/service-account + settings: + name: gke-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: gke_cluster source: modules/scheduler/gke-cluster - use: [network1] + use: [network1, gke_service_account] settings: enable_private_endpoint: false # Allows for access from authorized public IPs master_authorized_networks: - display_name: deployment-machine cidr_block: $(vars.authorized_cidr) + configure_workload_identity_sa: true outputs: [instructions] - id: g2_pool source: modules/compute/gke-node-pool - use: [gke_cluster] + use: [gke_cluster, gke_service_account] settings: disk_type: pd-balanced machine_type: g2-standard-4 diff --git a/examples/storage-gke.yaml b/examples/storage-gke.yaml index cd46c2d9c3..00c3d60290 100644 --- a/examples/storage-gke.yaml +++ b/examples/storage-gke.yaml @@ -38,9 +38,33 @@ deployment_groups: - range_name: services ip_cidr_range: 10.0.32.0/20 + - id: gke_service_account + source: community/modules/project/service-account + settings: + name: gke-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + + - id: local_ssd_pool_service_account + source: community/modules/project/service-account + settings: + name: ssd-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: gke_cluster source: modules/scheduler/gke-cluster - use: [network1] + use: [network1, gke_service_account] settings: enable_filestore_csi: true enable_gcsfuse_csi: true @@ -53,7 +77,7 @@ deployment_groups: - id: debug_pool source: modules/compute/gke-node-pool - use: [gke_cluster] + use: [gke_cluster, gke_service_account] settings: name: debug zones: [$(vars.zone)] @@ -118,7 +142,7 @@ deployment_groups: - id: local-ssd-pool source: modules/compute/gke-node-pool - use: [gke_cluster] + use: [gke_cluster, local_ssd_pool_service_account] settings: name: local-ssd 
machine_type: n2d-standard-2 diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index d15f644b80..880e1834e4 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -268,16 +268,9 @@ limitations under the License. | Name | Type | |------|------| | [google-beta_google_container_node_pool.node_pool](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_container_node_pool) | resource | -| [google_project_iam_member.node_service_account_artifact_registry](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_gcr](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_log_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_metric_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_monitoring_viewer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_resource_metadata_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | | [null_resource.enable_tcpx_in_workload](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [null_resource.enable_tcpxo_in_workload](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [null_resource.install_dependencies](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [google_compute_default_service_account.default_sa](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | | [google_compute_reservation.specific_reservations](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_reservation) | data source | ## Inputs diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 6e16771a3a..d14801ccf3 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -20,8 +20,6 @@ locals { } locals { - sa_email = var.service_account_email != null ? var.service_account_email : data.google_compute_default_service_account.default_sa.email - preattached_gpu_machine_family = contains(["a2", "a3", "g2"], local.machine_family) has_gpu = (local.guest_accelerator != null && length(local.guest_accelerator) > 0) || local.preattached_gpu_machine_family gpu_taint = local.has_gpu ? 
[{ @@ -37,10 +35,6 @@ locals { module_unique_id = replace(lower(var.internal_ghpc_module_id), "/[^a-z0-9\\-]/", "") } -data "google_compute_default_service_account" "default_sa" { - project = var.project_id -} - resource "google_container_node_pool" "node_pool" { provider = google-beta @@ -239,45 +233,6 @@ resource "google_container_node_pool" "node_pool" { } } -# For container logs to show up under Cloud Logging and GKE metrics to show up -# on Cloud Monitoring console, some project level roles are needed for the -# node_service_account -resource "google_project_iam_member" "node_service_account_log_writer" { - project = var.project_id - role = "roles/logging.logWriter" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_metric_writer" { - project = var.project_id - role = "roles/monitoring.metricWriter" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_monitoring_viewer" { - project = var.project_id - role = "roles/monitoring.viewer" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_resource_metadata_writer" { - project = var.project_id - role = "roles/stackdriver.resourceMetadata.writer" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_gcr" { - project = var.project_id - role = "roles/storage.objectViewer" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_artifact_registry" { - project = var.project_id - role = "roles/artifactregistry.reader" - member = "serviceAccount:${local.sa_email}" -} - resource "null_resource" "install_dependencies" { provisioner "local-exec" { command = "pip3 install pyyaml argparse" diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 583af203da..2a96a90c5d 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -131,14 +131,8 @@ limitations under the License. 
|------|------| | [google-beta_google_container_cluster.gke_cluster](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_container_cluster) | resource | | [google-beta_google_container_node_pool.system_node_pools](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_container_node_pool) | resource | -| [google_project_iam_member.node_service_account_artifact_registry](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_gcr](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_log_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_metric_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_monitoring_viewer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | -| [google_project_iam_member.node_service_account_resource_metadata_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | | [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | -| [google_compute_default_service_account.default_sa](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | +| [google_project.project](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/project) | data source | ## Inputs diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 480d5b7d58..ac39cebe73 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -29,7 +29,8 @@ locals { security_group = var.authenticator_security_group }] - sa_email = var.service_account_email != null ? 
var.service_account_email : data.google_compute_default_service_account.default_sa.email + default_sa_email = "${data.google_project.project.number}-compute@developer.gserviceaccount.com" + sa_email = coalesce(var.service_account_email, local.default_sa_email) # additional VPCs enable multi networking derived_enable_multi_networking = coalesce(var.enable_multi_networking, length(var.additional_networks) > 0) @@ -38,8 +39,8 @@ locals { derived_enable_dataplane_v2 = coalesce(var.enable_dataplane_v2, local.derived_enable_multi_networking) } -data "google_compute_default_service_account" "default_sa" { - project = var.project_id +data "google_project" "project" { + project_id = var.project_id } resource "google_container_cluster" "gke_cluster" { @@ -267,45 +268,6 @@ resource "google_container_node_pool" "system_node_pools" { } } -# For container logs to show up under Cloud Logging and GKE metrics to show up -# on Cloud Monitoring console, some project level roles are needed for the -# node_service_account -resource "google_project_iam_member" "node_service_account_log_writer" { - project = var.project_id - role = "roles/logging.logWriter" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_metric_writer" { - project = var.project_id - role = "roles/monitoring.metricWriter" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_monitoring_viewer" { - project = var.project_id - role = "roles/monitoring.viewer" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_resource_metadata_writer" { - project = var.project_id - role = "roles/stackdriver.resourceMetadata.writer" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_gcr" { - project = var.project_id - role = "roles/storage.objectViewer" - member = "serviceAccount:${local.sa_email}" -} - -resource "google_project_iam_member" "node_service_account_artifact_registry" { - project = var.project_id - role = "roles/artifactregistry.reader" - member = "serviceAccount:${local.sa_email}" -} - data "google_client_config" "default" {} provider "kubernetes" { @@ -327,7 +289,7 @@ module "workload_identity" { # https://github.com/terraform-google-modules/terraform-google-kubernetes-engine/issues/1059 depends_on = [ - data.google_compute_default_service_account.default_sa, + data.google_project.project, google_container_cluster.gke_cluster ] } diff --git a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml index 6e64a667a1..20f5ff19f5 100644 --- a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml +++ b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml @@ -40,9 +40,21 @@ deployment_groups: - range_name: services ip_cidr_range: 10.0.32.0/20 + - id: gke_service_account + source: community/modules/project/service-account + settings: + name: gke-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: gke_cluster source: modules/scheduler/gke-cluster - use: [network1] + use: [network1, gke_service_account] settings: enable_private_endpoint: false # Allows for access from authorized public IPs master_authorized_networks: @@ -52,7 +64,7 @@ deployment_groups: - id: g2_latest_driver source: modules/compute/gke-node-pool - use: 
[gke_cluster] + use: [gke_cluster, gke_service_account] settings: name: g2-latest-driver machine_type: g2-standard-4 @@ -80,9 +92,21 @@ deployment_groups: ] outputs: [instructions] + - id: n1_service_account + source: community/modules/project/service-account + settings: + name: n1-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: n1_pool_default source: modules/compute/gke-node-pool - use: [gke_cluster] + use: [gke_cluster, n1_service_account] settings: name: n1-pool-default disk_type: pd-balanced @@ -108,9 +132,21 @@ deployment_groups: ] outputs: [instructions] + - id: n1_full_service_account + source: community/modules/project/service-account + settings: + name: n1-full-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: n1_pool_full_spec source: modules/compute/gke-node-pool - use: [gke_cluster] + use: [gke_cluster, n1_full_service_account] settings: name: n1-pool-full-spec disk_type: pd-balanced @@ -141,9 +177,21 @@ deployment_groups: ] outputs: [instructions] + - id: default_settings_service_account + source: community/modules/project/service-account + settings: + name: ds-sa + project_roles: + - logging.logWriter + - monitoring.metricWriter + - monitoring.viewer + - stackdriver.resourceMetadata.writer + - storage.objectViewer + - artifactregistry.reader + - id: default_settings_pool source: modules/compute/gke-node-pool - use: [gke_cluster] + use: [gke_cluster, default_settings_service_account] settings: name: default-settings-pool diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml index 118704e7ea..05c5ce1097 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml @@ -16,6 +16,7 @@ tags: - m.gke-cluster - m.gke-node-pool +- m.service-account - m.vpc - m.multivpc - m.kubectl-apply diff --git a/tools/cloud-build/daily-tests/builds/gke-storage.yaml b/tools/cloud-build/daily-tests/builds/gke-storage.yaml index 16d8b92587..1e4a11998a 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-storage.yaml @@ -17,6 +17,7 @@ tags: - m.cloud-storage-bucket - m.filestore - m.gke-cluster +- m.service-account - m.gke-job-template - m.gke-node-pool - m.gke-persistent-volume diff --git a/tools/cloud-build/daily-tests/builds/gke.yaml b/tools/cloud-build/daily-tests/builds/gke.yaml index 709a2b5c1b..b73409a94f 100644 --- a/tools/cloud-build/daily-tests/builds/gke.yaml +++ b/tools/cloud-build/daily-tests/builds/gke.yaml @@ -17,6 +17,7 @@ tags: - m.gke-cluster - m.gke-job-template - m.gke-node-pool +- m.service-account - m.vpc - gke diff --git a/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml index 4b04ceb7d0..caeeee66fa 100644 --- a/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml @@ -17,6 +17,7 @@ tags: - m.gke-cluster - m.gke-job-template - m.gke-node-pool +- m.service-account - m.vpc - gke diff --git a/tools/cloud-build/daily-tests/builds/ml-gke.yaml b/tools/cloud-build/daily-tests/builds/ml-gke.yaml index c9ae96850f..a3b83c6fa8 100644 --- a/tools/cloud-build/daily-tests/builds/ml-gke.yaml 
+++ b/tools/cloud-build/daily-tests/builds/ml-gke.yaml @@ -17,6 +17,7 @@ tags: - m.gke-cluster - m.gke-job-template - m.gke-node-pool +- m.service-account - m.vpc - gke From 69d67465825647e88b0fc96b9ee4030e314f1166 Mon Sep 17 00:00:00 2001 From: ighosh98 Date: Tue, 15 Oct 2024 07:25:09 +0000 Subject: [PATCH 087/102] GKE A3 high integration test --- .../daily-tests/builds/gke-a3-highgpu.yaml | 66 +++++++++++++++++++ .../daily-tests/tests/gke-a3-highgpu.yml | 43 ++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml create mode 100644 tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml new file mode 100644 index 0000000000..2ad20f6b8d --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml @@ -0,0 +1,66 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +tags: +- m.gke-cluster +- m.gke-node-pool +- m.vpc +- m.multivpc +- m.service-account +- m.kubectl-apply +- gke + +timeout: 14400s # 4hr +steps: +- id: gke-a3-highgpu + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + EXAMPLE_BP=examples/gke-a3-highgpu.yaml + + # Replacing the static subnet name to prevent collisions + sed -i "s/gke-subnet-a3-highgpu/gke-subnet-a3-highgpu-$${BUILD_ID_SHORT}/" $${EXAMPLE_BP} + + # adding vm to act as remote node + echo ' - id: remote-node' >> $${EXAMPLE_BP} + echo ' source: modules/compute/vm-instance' >> $${EXAMPLE_BP} + echo ' use: [network1]' >> $${EXAMPLE_BP} + echo ' settings:' >> $${EXAMPLE_BP} + echo ' machine_type: e2-standard-2' >> $${EXAMPLE_BP} + echo ' name_prefix: remote-node' >> $${EXAMPLE_BP} + echo ' add_deployment_name_before_prefix: true' >> $${EXAMPLE_BP} + echo '' + echo ' - id: job_template_hostname' >> $${EXAMPLE_BP} + echo ' source: modules/compute/gke-job-template' >> $${EXAMPLE_BP} + echo ' use: [a3_highgpu_pool]' >> $${EXAMPLE_BP} + echo ' settings:' >> $${EXAMPLE_BP} + echo ' image: nvidia/cuda:11.0.3-runtime-ubuntu20.04' >> $${EXAMPLE_BP} + echo ' command:' >> $${EXAMPLE_BP} + echo ' - nvidia-smi' >> $${EXAMPLE_BP} + echo ' node_count: 1' >> $${EXAMPLE_BP} + echo ' outputs: [instructions]' >> $${EXAMPLE_BP} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml" diff --git a/tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml b/tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml new file mode 100644 index 0000000000..26b894a6fe --- /dev/null 
+++ b/tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml @@ -0,0 +1,43 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +# region, zone must be defined +# in build file with --extra-vars flag! +test_name: gke-a3high +deployment_name: gke-a3high-{{ build }} +workspace: /workspace +blueprint_yaml: "{{ workspace }}/examples/gke-a3-highgpu.yaml" +network: "gke-a3high-net-{{ build }}" +region: us-west1 +zone: us-west1-a +remote_node: "{{ deployment_name }}-remote-node-0" +reservation_affinity: + consume_reservation_type: SPECIFIC_RESERVATION + specific_reservations: + - name: a3-reservation-0 + project: "{{ project }}" +cli_deployment_vars: + region: "{{ region }}" + zone: "{{ zone }}" + reservation_affinity: "{{ reservation_affinity }}" + autoscaling_total_max_nodes: 2 + authorized_cidr: "{{ build_ip.stdout }}/32" + network_name: "{{ network }}" + local_ssd_count_nvme_block: 16 +custom_vars: + project: "{{ project }}" +post_deploy_tests: +- test-validation/test-gke-job.yml From 31f13b9f13f57004acf157d09703f7484a2788f9 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 15 Oct 2024 09:51:14 -0400 Subject: [PATCH 088/102] Update modules/scripts/startup-script/README.md Co-authored-by: Tom Downes --- modules/scripts/startup-script/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index b9cae7fdee..3d2f7cbcda 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -162,6 +162,7 @@ curl -sSO https://dl.google.com/cloudagents/add-logging-agent-repo.sh sudo bash add-logging-agent-repo.sh --also-install sudo service stackdriver-agent start ``` + #### Cloud Ops Agent Installation If an image or machine already has the Stackdriver Agent installed and you would From 92cddfdf09306f5bd41848d08f14595785cc8bd0 Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 15 Oct 2024 09:51:23 -0400 Subject: [PATCH 089/102] Update modules/scripts/startup-script/README.md Co-authored-by: Tom Downes --- modules/scripts/startup-script/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index 3d2f7cbcda..a32c2c5509 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -186,7 +186,7 @@ sudo bash add-google-cloud-ops-agent-repo.sh --also-install sudo service google-cloud-ops-agent start ``` -As a reminder, this should be in a startup script, which should run on all +As a reminder, this should be in a startup script, which should run on all Compute nodes via the `compute_startup_script` on the controller. 
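For illustration, a minimal blueprint sketch of this wiring (the module ID, file name, and controller stanza below are placeholders, not part of this patch; the `startup-script` module renders its runners into a `startup_script` output that the controller can consume):

```yaml
  # Sketch: wrap the agent removal/installation commands in a startup-script
  # runner, then hand the rendered script to the controller's
  # compute_startup_script so it runs on every compute node.
  - id: agent_swap_script                       # hypothetical module ID
    source: modules/scripts/startup-script
    settings:
      runners:
      - type: shell
        destination: swap-monitoring-agent.sh   # hypothetical file name
        content: |
          #!/bin/bash
          # agent removal and installation commands from the sections above

  # On the controller module (exact source depends on your blueprint):
  #   settings:
  #     compute_startup_script: $(agent_swap_script.startup_script)
```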
#### Testing Installation From 8eedf55701fec2bca8533c0210bf0242821ce758 Mon Sep 17 00:00:00 2001 From: abbas1902 Date: Thu, 10 Oct 2024 18:37:14 +0000 Subject: [PATCH 090/102] improve dws_flex ux --- .../schedmd-slurm-gcp-v6-nodeset/README.md | 1 + .../schedmd-slurm-gcp-v6-nodeset/main.tf | 1 + .../schedmd-slurm-gcp-v6-nodeset/outputs.tf | 9 +++++++ .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 27 +++++++++++++++++++ .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/slurm_files/scripts/resume.py | 8 ++++++ .../partition.tf | 1 + .../variables.tf | 4 +++ docs/slurm-dws-flex.md | 18 +++++-------- 9 files changed, 58 insertions(+), 13 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 117e0ca0e5..115ac451e7 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -169,6 +169,7 @@ No modules. | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-standard"` | no | +| [dws\_flex](#input\_dws\_flex) | If set and `enabled = true`, will utilize the DWS Flex Start to provision nodes.
See: https://cloud.google.com/blog/products/compute/introducing-dynamic-workload-scheduler
Options:
- enabled: Enable DWS Flex Start
- max\_run\_duration: Maximum duration in seconds for the job to run; it should not exceed 1,209,600 (2 weeks).

Limitations:
- CAN NOT be used with reservations;
- CAN NOT be used with placement groups; |
object({
enabled = optional(bool, true)
max_run_duration = optional(number, 1209600) # 2 weeks
})
|
{
"enabled": false
}
| no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_maintenance\_reservation](#input\_enable\_maintenance\_reservation) | Enables slurm reservation for scheduled maintenance. | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 224ca76f80..217328277b 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -57,6 +57,7 @@ locals { node_count_dynamic_max = var.node_count_dynamic_max node_conf = var.node_conf nodeset_name = local.name + dws_flex = var.dws_flex disk_auto_delete = var.disk_auto_delete disk_labels = merge(local.labels, var.disk_labels) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf index dc2f3b0c40..671d542584 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf @@ -44,4 +44,13 @@ output "nodeset" { condition = !var.enable_placement || var.node_count_static == 0 || var.node_count_dynamic_max == 0 error_message = "Cannot use placement with static and auto-scaling nodes in the same node set." } + precondition { + condition = var.reservation_name == "" || !var.dws_flex.enabled + error_message = "Cannot use reservations with DWS Flex." + } + + precondition { + condition = !var.enable_placement || !var.dws_flex.enabled + error_message = "Cannot use DWS Flex with `enable_placement`." + } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index aeb2435bd0..536659f136 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -512,3 +512,30 @@ variable "enable_maintenance_reservation" { description = "Enables slurm reservation for scheduled maintenance." default = false } + +variable "dws_flex" { + description = <<-EOD + If set and `enabled = true`, will utilize the DWS Flex Start to provision nodes. + See: https://cloud.google.com/blog/products/compute/introducing-dynamic-workload-scheduler + Options: + - enable: Enable DWS Flex Start + - max_run_duration: Maximum duration in seconds for the job to run, should not exceed 1,209,600 (2 weeks). + + Limitations: + - CAN NOT be used with reservations; + - CAN NOT be used with placement groups; + + EOD + + type = object({ + enabled = optional(bool, true) + max_run_duration = optional(number, 1209600) # 2 weeks + }) + default = { + enabled = false + } + validation { + condition = var.dws_flex.max_run_duration >= 30 && var.dws_flex.max_run_duration <= 1209600 + error_message = "Max duration must be more than 30 seconds, and cannot be more than two weeks." + } +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index a9d801d8c7..1720eb67a5 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -313,7 +313,7 @@ limitations under the License. | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, true)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, true)
gpu = optional(object({
count = number
type = string
}))
dws_flex = object({
enabled = bool
max_run_duration = number
})
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index bad3f662f0..1bc1150c58 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -98,11 +98,19 @@ def instance_properties(nodeset, model, placement_group, labels=None): if nodeset.maintenance_interval: props.scheduling.maintenanceInterval = nodeset.maintenance_interval + if nodeset.dws_flex.enabled: + update_props_dws(props,nodeset.dws_flex) + # Override with properties explicit specified in the nodeset props.update(nodeset.get("instance_properties") or {}) return props +def update_props_dws(props:dict,dws_flex:dict) -> None: + props.scheduling.onHostMaintenance = "TERMINATE" + props.scheduling.instanceTerminationAction = "DELETE" + props.scheduling.maxRunDuration['seconds'] = dws_flex.max_run_duration + props.reservationAffinity['consumeReservationType'] = "NO_RESERVATION" def per_instance_properties(node): props = NSDict() diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 849844808a..7254551072 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -83,6 +83,7 @@ locals { nodesets = [for name, ns in local.nodeset_map : { nodeset_name = ns.nodeset_name node_conf = ns.node_conf + dws_flex = ns.dws_flex instance_template = module.slurm_nodeset_template[ns.nodeset_name].self_link node_count_dynamic_max = ns.node_count_dynamic_max node_count_static = ns.node_count_static diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 2fc7bebb4b..95e5c20d0a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -212,6 +212,10 @@ variable "nodeset" { count = number type = string })) + dws_flex = object({ + enabled = bool + max_run_duration = number + }) labels = optional(map(string), {}) machine_type = optional(string) maintenance_interval = optional(string) diff --git a/docs/slurm-dws-flex.md b/docs/slurm-dws-flex.md index dfa65b6015..8b1c38bb01 100644 --- a/docs/slurm-dws-flex.md +++ b/docs/slurm-dws-flex.md @@ -13,25 +13,19 @@ With Dynamic Workload Scheduler in Flex Start mode, you submit a GPU capacity re > The project needs to be allowlisted for private preview access. > Fill out the [form](https://docs.google.com/forms/d/1etaaXMW9jJUTTxfUC7TIIMttLWT5H-3Q8_3-sG6vwKk/edit). -In order to make use of DWS Flex Start mode with SlurmGCP, you must specify a proper set of `instance_properties` in the `schedmd-slurm-gcp-v6-nodeset` module. See the example below: +In order to make use of DWS Flex Start mode with SlurmGCP, you must use the `dws_flex` variable in the `schedmd-slurm-gcp-v6-nodeset` module. From there you can specify the desired maximum duration (in seconds) with `max_run_duration`. 
See the example below: ```yaml - id: flex_nodeset source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset use: [network] settings: - instance_properties: - reservationAffinity: - consumeReservationType: NO_RESERVATION - scheduling: - maxRunDuration: { seconds: $(2 * 60 * 60) } # 2 hours - onHostMaintenance: TERMINATE - instanceTerminationAction: DELETE + dws_flex: + max_run_duration: 3600 # 1 hour + enable_placement: false # the rest of the settings, e.g. node_count_static, machine_type, additional_disks, etc. ``` -**All** fields in `instance_properties` should match provided values, except for `maxRunDuration`, which should be set to the desired duration in seconds (up to 604800 = 7 days). - > [!WARNING] -> The use of the `instance_properties` setting directly overrides bulkInsert API parameters. While the documented sample -> was tested at the time of publication, it is not regression tested and may cease to work based on changes in the bulkInsert API. +> DWS Flex Start cannot be used in tandem with a reservation or placement policy +> While this feature was tested at the time of publication, it is not regression tested and may cease to work based on changes in the bulkInsert API. From 2fa185ccf920f53017374284599d329778ca0f26 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 15 Oct 2024 13:30:26 -0500 Subject: [PATCH 091/102] Update Slurm-GCP to 6.8.2 Brings in new default NVIDIA driver 550.90.12 which solves several known issues, including NCCL Timeout errors. https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-550-90-12/index.html --- community/examples/hpc-build-slurm-image.yaml | 2 +- .../README.md | 2 +- .../main.tf | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 18 +++++++++--------- .../controller.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- .../partition.tf | 4 ++-- .../schedmd-slurm-gcp-v6-login/README.md | 8 ++++---- .../a3-highgpu-8g/ml-slurm-a3-1-image.yaml | 2 +- .../a3-megagpu-8g/slurm-a3mega-image.yaml | 2 +- modules/README.md | 2 +- 11 files changed, 25 insertions(+), 25 deletions(-) diff --git a/community/examples/hpc-build-slurm-image.yaml b/community/examples/hpc-build-slurm-image.yaml index 45e6bd1612..a1fa81767e 100644 --- a/community/examples/hpc-build-slurm-image.yaml +++ b/community/examples/hpc-build-slurm-image.yaml @@ -23,7 +23,7 @@ vars: image_build_machine_type: n2d-standard-16 build_from_image_family: hpc-rocky-linux-8 build_from_image_project: cloud-hpc-image-public - build_from_git_ref: 6.7.0 + build_from_git_ref: 6.8.2 built_image_family: my-custom-slurm built_instance_image: family: $(vars.built_image_family) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index 4d790fe703..d251dff2af 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -74,7 +74,7 @@ modules. 
For support with the underlying modules, see the instructions in the | Name | Source | Version | |------|--------|---------| -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index 3f0ee54af8..7ca868a049 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -56,7 +56,7 @@ locals { } module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 37b5da93da..9f4933a1fa 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -11,9 +11,9 @@ The [user guide][slurm-ug] provides detailed instructions on customizing and enhancing the Slurm on GCP cluster as well as recommendations on configuring the controller for optimal performance at different scales. -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0 -[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0/terraform/slurm_cluster/modules/slurm_instance_template +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2 +[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions @@ -238,13 +238,13 @@ limitations under the License. 
| [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | | [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.1 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.2 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.1 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.1 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.8.1 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.2 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.8.2 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 9b105d7f39..1ce6ed158f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -43,7 +43,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" project_id = var.project_id region = var.region @@ -99,7 +99,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.1" + source = 
"github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.2" access_config = var.enable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index de97810316..998a8e0867 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -56,7 +56,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.2" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 7254551072..0d05c71f91 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local template module module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" for_each = local.nodeset_map project_id = var.project_id @@ -102,7 +102,7 @@ locals { # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.8.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.8.2" for_each = local.nodeset_tpu_map project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index 0afd0bfee7..4ad20a6352 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -5,9 +5,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). 
-[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0 -[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0/terraform/slurm_cluster/modules/slurm_instance_template +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2 +[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -53,7 +53,7 @@ modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. [slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/7 -[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0#slurm-on-google-cloud-platform +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2#slurm-on-google-cloud-platform ## Requirements diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml index b817972331..705e1299eb 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -94,7 +94,7 @@ deployment_groups: set -e -o pipefail ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.1 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.2 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index 67f33cde7d..dfc4d4ab4c 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -108,7 +108,7 @@ deployment_groups: apt-get install -y git ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.1 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.2 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/modules/README.md b/modules/README.md index b0575ec3db..722449e6e6 100644 --- a/modules/README.md +++ b/modules/README.md @@ -230,7 +230,7 @@ Pub/Sub subscription. 
Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca [schedmd-slurm-gcp-v5-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md [schedmd-slurm-gcp-v5-hybrid]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md [slurm-gcp-version-5]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 -[slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.7.0 +[slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2 [pbspro-client]: ../community/modules/scheduler/pbspro-client/README.md [pbspro-server]: ../community/modules/scheduler/pbspro-server/README.md From 786b5c27e7286281e501089ed390ffebd920370a Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 15 Oct 2024 18:41:12 +0000 Subject: [PATCH 092/102] Fix deprecation link and add deprecation notice in v5 compute modules --- .../compute/schedmd-slurm-gcp-v5-node-group/README.md | 5 +++++ .../schedmd-slurm-gcp-v5-partition-dynamic/README.md | 5 +++++ .../compute/schedmd-slurm-gcp-v5-partition/README.md | 5 +++++ .../scheduler/schedmd-slurm-gcp-v5-controller/README.md | 6 ++---- .../modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md | 6 ++---- .../modules/scheduler/schedmd-slurm-gcp-v5-login/README.md | 6 ++---- 6 files changed, 21 insertions(+), 12 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md index d4cc3fcda3..bc54d36396 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md @@ -1,5 +1,10 @@ ## Description +> [!NOTE] +> Slurm-gcp-v5-node-group module is deprecated. See +> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) +> for specific recommendations and timelines. + This module creates a node group data structure intended to be input to the [schedmd-slurm-gcp-v5-partition](../schedmd-slurm-gcp-v5-partition/) module. diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md index f7ad53f382..cecea973e1 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md @@ -1,5 +1,10 @@ ## Description +> [!NOTE] +> Slurm-gcp-v5-partition-dynamic module is deprecated. See +> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) +> for specific recommendations and timelines. + This module creates a dynamic compute partition that can be used as input to the [schedmd-slurm-gcp-v5-controller](../../scheduler/schedmd-slurm-gcp-v5-controller/README.md). This will configure the slurm partition to contain nodes with the corresponding feature. diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md index 1ae1d0b50f..f9fcc59bed 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md @@ -1,5 +1,10 @@ ## Description +> [!NOTE] +> Slurm-gcp-v5-partition module is deprecated. See +> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) +> for specific recommendations and timelines. 
+ This module creates a compute partition that can be used as input to the [schedmd-slurm-gcp-v5-controller](../../scheduler/schedmd-slurm-gcp-v5-controller/README.md). diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index b2a1bb503e..b9ae2ce50c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -2,10 +2,8 @@ > [!NOTE] > Slurm-gcp-v5-controller module is deprecated. See -> [this update](#completed-migration-to-slurm-gcp-v6) for specific recommendations -> and timelines. - -* [Completed Migration to Slurm-GCP v6](../../../../modules/README.md#completed-migration-to-slurm-gcp-v6) +> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) +> for specific recommendations and timelines. This module creates a slurm controller node via the [SchedMD/slurm-gcp] [slurm\_controller\_instance] and [slurm\_instance\_template] modules. diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index 82fa0b9771..56cbc33b07 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -2,10 +2,8 @@ > [!NOTE] > Slurm-gcp-v5-hybrid module is deprecated. See -> [this update](#completed-migration-to-slurm-gcp-v6) for specific recommendations -> and timelines. - -* [Completed Migration to Slurm-GCP v6](../../../../modules/README.md#completed-migration-to-slurm-gcp-v6) +> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) +> for specific recommendations and timelines. This module is a wrapper around the [slurm-controller-hybrid] module by SchedMD as part of the [slurm-gcp] github repository. The hybrid module serves to create diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 80d969ade6..44b337ec78 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -2,10 +2,8 @@ > [!NOTE] > Slurm-gcp-v5-login module is deprecated. See -> [this update](#completed-migration-to-slurm-gcp-v6) for specific recommendations -> and timelines. - -* [Completed Migration to Slurm-GCP v6](../../../../modules/README.md#completed-migration-to-slurm-gcp-v6) +> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) +> for specific recommendations and timelines. This module creates a login node for a Slurm cluster based on the [SchedMD/slurm-gcp] [slurm\_instance\_template] and [slurm\_login\_instance] From 0eb20fe431546ba6c208321c8e5fe81cc3b8d88c Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Tue, 15 Oct 2024 22:32:45 +0000 Subject: [PATCH 093/102] fix readme --- modules/compute/gke-node-pool/README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index e6d3bdb42c..880e1834e4 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -277,7 +277,7 @@ limitations under the License. 
| Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | @@ -288,27 +288,27 @@ limitations under the License. | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | | [internal\_ghpc\_module\_id](#input\_internal\_ghpc\_module\_id) | DO NOT SET THIS MANUALLY. Automatically populates with module id (unique blueprint-wide). | `string` | n/a | yes | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If not set, automatically populated by machine type and module id (unique blueprint-wide) as suffix.
If setting manually, ensure a unique value across all gke-node-pools. | `string` | `null` | no | | [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | -| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | -| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | +| [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | -| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | -| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled, extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads; therefore, it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | +| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | +| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled, extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads; therefore, it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | | [total\_max\_nodes](#input\_total\_max\_nodes) | DEPRECATED: Use autoscaling\_total\_max\_nodes. | `number` | `null` | no | From d8306aed8bcc96aa0dc4a58425399a6ba9f1d27a Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 15 Oct 2024 19:07:40 -0700 Subject: [PATCH 094/102] Make spack and ramble bucket names look like startup-script bucket names --- community/modules/scripts/ramble-setup/main.tf | 2 +- community/modules/scripts/spack-setup/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scripts/ramble-setup/main.tf b/community/modules/scripts/ramble-setup/main.tf index b1b445fc22..8a5efddfe6 100644 --- a/community/modules/scripts/ramble-setup/main.tf +++ b/community/modules/scripts/ramble-setup/main.tf @@ -73,7 +73,7 @@ locals { } bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}"), 0, 8) - bucket_name = "ramble-scripts-${local.bucket_md5}" + bucket_name = "${var.deployment_name}-ramble-scripts-${local.bucket_md5}" runners = [local.install_ramble_deps_runner, local.install_ramble_runner, local.python_reqs_runner] combined_runner = { diff --git a/community/modules/scripts/spack-setup/main.tf b/community/modules/scripts/spack-setup/main.tf index 6a1eb21312..eff6a8f9b8 100644 --- a/community/modules/scripts/spack-setup/main.tf +++ b/community/modules/scripts/spack-setup/main.tf @@ -80,7 +80,7 @@ locals { } bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}.${local.script_content}"), 0, 8) - bucket_name = "spack-scripts-${local.bucket_md5}" + bucket_name = "${var.deployment_name}-spack-scripts-${local.bucket_md5}" runners = [local.install_spack_deps_runner, local.install_spack_runner] combined_runner = { From 8e57bcc3ca5127c7222934560ee9317927ee3a63 Mon Sep 17 00:00:00 2001 From: annuay Date: Wed, 16 Oct 2024 19:44:05 +0000 Subject: [PATCH 095/102] share SA across node pools --- examples/storage-gke.yaml | 14 +------ .../daily-tests/blueprints/ml-gke-e2e.yaml | 42 ++----------------- 2 files changed, 4 insertions(+), 52 deletions(-) diff --git a/examples/storage-gke.yaml b/examples/storage-gke.yaml index 00c3d60290..a257f97c49 100644 --- a/examples/storage-gke.yaml +++ b/examples/storage-gke.yaml @@ -50,18 +50,6 @@ deployment_groups: - storage.objectViewer - artifactregistry.reader - - id: local_ssd_pool_service_account - source: community/modules/project/service-account - settings: - name: ssd-sa - project_roles: - - logging.logWriter - - monitoring.metricWriter - - monitoring.viewer - - stackdriver.resourceMetadata.writer - - storage.objectViewer - - artifactregistry.reader - - id: gke_cluster source: modules/scheduler/gke-cluster use: [network1, gke_service_account] @@ -142,7 +130,7 @@ deployment_groups: - id: local-ssd-pool source: modules/compute/gke-node-pool - use: [gke_cluster, local_ssd_pool_service_account] + use: [gke_cluster, gke_service_account] settings: name: local-ssd machine_type: n2d-standard-2 diff --git a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml index 20f5ff19f5..d7be384115 100644 --- a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml +++ b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml @@ -92,21 
+92,9 @@ deployment_groups: ] outputs: [instructions] - - id: n1_service_account - source: community/modules/project/service-account - settings: - name: n1-sa - project_roles: - - logging.logWriter - - monitoring.metricWriter - - monitoring.viewer - - stackdriver.resourceMetadata.writer - - storage.objectViewer - - artifactregistry.reader - - id: n1_pool_default source: modules/compute/gke-node-pool - use: [gke_cluster, n1_service_account] + use: [gke_cluster, gke_service_account] settings: name: n1-pool-default disk_type: pd-balanced @@ -132,21 +120,9 @@ deployment_groups: ] outputs: [instructions] - - id: n1_full_service_account - source: community/modules/project/service-account - settings: - name: n1-full-sa - project_roles: - - logging.logWriter - - monitoring.metricWriter - - monitoring.viewer - - stackdriver.resourceMetadata.writer - - storage.objectViewer - - artifactregistry.reader - - id: n1_pool_full_spec source: modules/compute/gke-node-pool - use: [gke_cluster, n1_full_service_account] + use: [gke_cluster, gke_service_account] settings: name: n1-pool-full-spec disk_type: pd-balanced @@ -177,21 +153,9 @@ deployment_groups: ] outputs: [instructions] - - id: default_settings_service_account - source: community/modules/project/service-account - settings: - name: ds-sa - project_roles: - - logging.logWriter - - monitoring.metricWriter - - monitoring.viewer - - stackdriver.resourceMetadata.writer - - storage.objectViewer - - artifactregistry.reader - - id: default_settings_pool source: modules/compute/gke-node-pool - use: [gke_cluster, default_settings_service_account] + use: [gke_cluster, gke_service_account] settings: name: default-settings-pool From 2a488b81521d59dd70e6ad5eb81251535a7b2d09 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 17 Oct 2024 00:11:53 +0000 Subject: [PATCH 096/102] SlurmGCP. Improve reservation_name parsing logic + tests --- .../slurm_files/scripts/tests/common.py | 4 +- .../slurm_files/scripts/tests/test_util.py | 85 ++++++++++++++++++- .../modules/slurm_files/scripts/util.py | 14 +-- 3 files changed, 96 insertions(+), 7 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py index 8db9add6c3..2272aeef99 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py @@ -29,11 +29,13 @@ class Placeholder: @dataclass class TstNodeset: - nodeset_name: str + nodeset_name: str = "cantor" node_count_static: int = 0 node_count_dynamic_max: int = 0 node_conf: dict[str, Any] = field(default_factory=dict) instance_template: Optional[str] = None + reservation_name: Optional[str] = "" + zone_policy_allow: Optional[list[str]] = field(default_factory=list) @dataclass class TstCfg: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py index 4dd3c8a17b..676b3593aa 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -13,7 +13,8 @@ # limitations under the License. 
import pytest -import common # needed to import util +from mock import Mock +from common import TstNodeset, TstCfg # needed to import util import util from google.api_core.client_options import ClientOptions # noqa: E402 @@ -130,3 +131,85 @@ def test_create_client_options( ud_mock.return_value = "googleapis.com" ep_mock.return_value = ep_ver assert util.create_client_options(api).__repr__() == expected.__repr__() + + + +@pytest.mark.parametrize( + "nodeset,err", + [ + (TstNodeset(reservation_name="projects/x/reservations/y"), AssertionError), # no zones + (TstNodeset( + reservation_name="projects/x/reservations/y", + zone_policy_allow=["eine", "zwei"]), AssertionError), # multiples zones + (TstNodeset( + reservation_name="robin", + zone_policy_allow=["eine"]), ValueError), # invalid name + (TstNodeset( + reservation_name="projects/reservations/y", + zone_policy_allow=["eine"]), ValueError), # invalid name + (TstNodeset( + reservation_name="projects/x/zones/z/reservations/y", + zone_policy_allow=["eine"]), ValueError), # invalid name + ] +) +def test_nodeset_reservation_err(nodeset, err): + lkp = util.Lookup(TstCfg()) + lkp._get_reservation = Mock() + with pytest.raises(err): + lkp.nodeset_reservation(nodeset) + lkp._get_reservation.assert_not_called() + +@pytest.mark.parametrize( + "nodeset,policies,expected", + [ + (TstNodeset(), [], None), # no reservation + (TstNodeset( + reservation_name="projects/bobin/reservations/robin", + zone_policy_allow=["eine"]), + [], + util.ReservationDetails( + project="bobin", + zone="eine", + name="robin", + policies=[], + bulk_insert_name="projects/bobin/reservations/robin")), + (TstNodeset( + reservation_name="projects/bobin/reservations/robin", + zone_policy_allow=["eine"]), + ["seven/wanders", "five/red/apples", "yum"], + util.ReservationDetails( + project="bobin", + zone="eine", + name="robin", + policies=["wanders", "apples", "yum"], + bulk_insert_name="projects/bobin/reservations/robin")), + (TstNodeset( + reservation_name="projects/bobin/reservations/robin/reservationBlocks/cheese-brie-6", + zone_policy_allow=["eine"]), + [], + util.ReservationDetails( + project="bobin", + zone="eine", + name="robin", + policies=[], + reservation_block="cheese-brie-6", + bulk_insert_name="projects/bobin/reservations/robin/reservationBlocks/cheese-brie-6")), + + ]) + +def test_nodeset_reservation_ok(nodeset, policies, expected): + lkp = util.Lookup(TstCfg()) + lkp._get_reservation = Mock() + + if not expected: + assert lkp.nodeset_reservation(nodeset) is None + lkp._get_reservation.assert_not_called() + return + + lkp._get_reservation.return_value = { + "resourcePolicies": {i: p for i, p in enumerate(policies)}, + } + assert lkp.nodeset_reservation(nodeset) == expected + lkp._get_reservation.assert_called_once_with(expected.project, expected.zone, expected.name) + + diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index eaf455e8dd..f93c9db4b9 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -1447,8 +1447,10 @@ def delete_node(self, nodename): class ReservationDetails: project: str zone: str + name: str policies: List[str] # names (not URLs) of resource policies - bulk_insert_name: str # name in format suitable for bulk insert (currently identical to 
user supplied name) + bulk_insert_name: str # name in format suitable for bulk insert (currently identical to user supplied name in long format) + reservation_block: Optional[str] = None class Lookup: """Wrapper class for cached data access""" @@ -1754,13 +1756,13 @@ def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]: assert len(zones) == 1, "Only single zone is supported if using a reservation" zone = zones[0] - try: - _, project, _, name = nodeset.reservation_name.split("/") - except ValueError: + regex = re.compile(r'^projects/(?P[^/]+)/reservations/(?P[^/]+)(/reservationBlocks/(?P[^/]+))?$') + if not (match := regex.match(nodeset.reservation_name)): raise ValueError( - f"Invalid reservation name: '{nodeset.reservation_name}', expected format is 'projects/PROJECT/reservations/NAME'" + f"Invalid reservation name: '{nodeset.reservation_name}', expected format is 'projects/PROJECT/reservations/NAME[/reservationBlocks/BLOCK]'" ) + project, name, block = match.group("project", "reservation", "block") reservation = self._get_reservation(project, zone, name) # Converts policy URLs to names, e.g.: @@ -1770,7 +1772,9 @@ def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]: return ReservationDetails( project=project, zone=zone, + name=name, policies=policies, + reservation_block=block, bulk_insert_name=nodeset.reservation_name) @lru_cache(maxsize=1) From 5835429de35c2314ed2b243626cb1d65c1ef5482 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 17 Oct 2024 01:32:49 +0000 Subject: [PATCH 097/102] SlurmGCP. Slurm reservation handling simplification --- .../modules/slurm_files/scripts/tests/test_util.py | 5 ++--- .../modules/slurm_files/scripts/util.py | 8 +++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py index 676b3593aa..14b7a7bf62 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -184,7 +184,7 @@ def test_nodeset_reservation_err(nodeset, err): policies=["wanders", "apples", "yum"], bulk_insert_name="projects/bobin/reservations/robin")), (TstNodeset( - reservation_name="projects/bobin/reservations/robin/reservationBlocks/cheese-brie-6", + reservation_name="projects/bobin/reservations/robin/snek/cheese-brie-6", zone_policy_allow=["eine"]), [], util.ReservationDetails( @@ -192,8 +192,7 @@ def test_nodeset_reservation_err(nodeset, err): zone="eine", name="robin", policies=[], - reservation_block="cheese-brie-6", - bulk_insert_name="projects/bobin/reservations/robin/reservationBlocks/cheese-brie-6")), + bulk_insert_name="projects/bobin/reservations/robin/snek/cheese-brie-6")), ]) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index f93c9db4b9..8467e300e2 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -1450,7 +1450,6 @@ class ReservationDetails: name: str policies: List[str] # names (not URLs) of resource 
policies bulk_insert_name: str # name in format suitable for bulk insert (currently identical to user supplied name in long format) - reservation_block: Optional[str] = None class Lookup: """Wrapper class for cached data access""" @@ -1756,13 +1755,13 @@ def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]: assert len(zones) == 1, "Only single zone is supported if using a reservation" zone = zones[0] - regex = re.compile(r'^projects/(?P[^/]+)/reservations/(?P[^/]+)(/reservationBlocks/(?P[^/]+))?$') + regex = re.compile(r'^projects/(?P[^/]+)/reservations/(?P[^/]+)(/.*)?$') if not (match := regex.match(nodeset.reservation_name)): raise ValueError( - f"Invalid reservation name: '{nodeset.reservation_name}', expected format is 'projects/PROJECT/reservations/NAME[/reservationBlocks/BLOCK]'" + f"Invalid reservation name: '{nodeset.reservation_name}', expected format is 'projects/PROJECT/reservations/NAME'" ) - project, name, block = match.group("project", "reservation", "block") + project, name = match.group("project", "reservation") reservation = self._get_reservation(project, zone, name) # Converts policy URLs to names, e.g.: @@ -1774,7 +1773,6 @@ def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]: zone=zone, name=name, policies=policies, - reservation_block=block, bulk_insert_name=nodeset.reservation_name) @lru_cache(maxsize=1) From d68ee1efbcc08278351044ed2c62b06e7d2f1293 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Wed, 16 Oct 2024 19:30:00 -0700 Subject: [PATCH 098/102] Don't allow bucket_name to exceed 63 chars --- community/modules/scripts/ramble-setup/main.tf | 7 +++++-- community/modules/scripts/spack-setup/main.tf | 8 ++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/community/modules/scripts/ramble-setup/main.tf b/community/modules/scripts/ramble-setup/main.tf index 8a5efddfe6..205c980c03 100644 --- a/community/modules/scripts/ramble-setup/main.tf +++ b/community/modules/scripts/ramble-setup/main.tf @@ -72,8 +72,11 @@ locals { "destination" = "install_ramble.yml" } - bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}"), 0, 8) - bucket_name = "${var.deployment_name}-ramble-scripts-${local.bucket_md5}" + bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}"), 0, 8) + # Max bucket name length is 63, so truncate deployment_name if necessary. + # The string "-ramble-scripts-" is 16 characters and bucket_md5 is 8 characters, + # leaving 63-16-8=39 chars for deployment_name. + bucket_name = "${substr(var.deployment_name, 0, 39)}-ramble-scripts-${local.bucket_md5}" runners = [local.install_ramble_deps_runner, local.install_ramble_runner, local.python_reqs_runner] combined_runner = { diff --git a/community/modules/scripts/spack-setup/main.tf b/community/modules/scripts/spack-setup/main.tf index eff6a8f9b8..b705ccc06c 100644 --- a/community/modules/scripts/spack-setup/main.tf +++ b/community/modules/scripts/spack-setup/main.tf @@ -79,8 +79,12 @@ locals { "destination" = "install_spack.yml" } - bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}.${local.script_content}"), 0, 8) - bucket_name = "${var.deployment_name}-spack-scripts-${local.bucket_md5}" + bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}.${local.script_content}"), 0, 8) + # Max bucket name length is 63, so truncate deployment_name if necessary. + # The string "-spack-scripts-" is 15 characters and bucket_md5 is 8 characters, + # leaving 63-15-8=40 chars for deployment_name. 
Using 39 so it has the same prefix as the + # ramble-setup module's GCS bucket. + bucket_name = "${substr(var.deployment_name, 0, 39)}-spack-scripts-${local.bucket_md5}" runners = [local.install_spack_deps_runner, local.install_spack_runner] combined_runner = { From 6d412e7f6ce23e3e4cb8d5af0ea21b9c72ea7d62 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Tue, 15 Oct 2024 18:49:38 +0000 Subject: [PATCH 099/102] updating version constraint --- pkg/config/expand.go | 4 ++-- pkg/config/expand_test.go | 4 ++-- .../igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../golden_copies/expectations/igc_pkr/zero/versions.tf | 4 ++-- .../igc_tf/.ghpc/artifacts/expanded_blueprint.yaml | 8 ++++---- .../golden_copies/expectations/igc_tf/one/versions.tf | 4 ++-- .../golden_copies/expectations/igc_tf/zero/versions.tf | 4 ++-- .../merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/merge_flatten/zero/versions.tf | 4 ++-- .../.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/versioned_blueprint/primary/versions.tf | 4 ++-- 11 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index a58ce74a41..b79babfbe5 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -199,11 +199,11 @@ func getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { return map[string]TerraformProvider{ "google": { Source: "hashicorp/google", - Version: ">= 4.84.0, < 6.7.0", + Version: ">= 4.84.0, < 6.8.0", Configuration: gglConf}, "google-beta": { Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 6.7.0", + Version: ">= 4.84.0, < 6.8.0", Configuration: gglConf}} } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 59495832d4..5abdd6620d 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -93,10 +93,10 @@ func (s *zeroSuite) TestExpandProviders(c *C) { c.Check(g.TerraformProviders, DeepEquals, map[string]PR{ "google": TerraformProvider{ Source: "hashicorp/google", - Version: ">= 4.84.0, < 6.7.0"}, + Version: ">= 4.84.0, < 6.8.0"}, "google-beta": TerraformProvider{ Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 6.7.0"}}) + Version: ">= 4.84.0, < 6.8.0"}}) } { // no def PR, group PR diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml index ba265ba2ee..c3f9926b11 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml @@ -38,14 +38,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf index 3534fd124e..3dd3e12681 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { 
google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml index 5736fbba16..d9c215a457 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml @@ -44,14 +44,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) @@ -79,14 +79,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index 3534fd124e..3dd3e12681 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index 3534fd124e..3dd3e12681 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index c21a1bb32f..46614b02e6 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -39,14 +39,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - 
version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf index 3534fd124e..3dd3e12681 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml index ad79aee614..2c5e9ca64a 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml @@ -47,14 +47,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 6.7.0' + version: '>= 4.84.0, < 6.8.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf index 3534fd124e..3dd3e12681 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.7.0" + version = ">= 4.84.0, < 6.8.0" } } } From e3293120867aad46b961fd0328dd2adf96a40616 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 17 Oct 2024 17:10:14 -0500 Subject: [PATCH 100/102] Refactor default value for mountpoint in local SSD solution --- modules/scripts/startup-script/files/setup-raid.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/scripts/startup-script/files/setup-raid.yml b/modules/scripts/startup-script/files/setup-raid.yml index 429d2b5594..585c6023f3 100644 --- a/modules/scripts/startup-script/files/setup-raid.yml +++ b/modules/scripts/startup-script/files/setup-raid.yml @@ -23,6 +23,7 @@ fstype: ext4 interface: nvme mode: '0755' + mountpoint: /mnt/{{ raid_name }} tasks: - name: Get local SSD devices ansible.builtin.find: @@ -61,7 +62,7 @@ - name: Mount RAID array ansible.posix.mount: src: "{{ array_dev }}" - path: '{{ mountpoint | default("/mnt/" + raid_name) }}' + path: "{{ mountpoint }}" fstype: "{{ fstype }}" # the nofail option is critical as it enables the boot process to complete on machines # that were powered off and had local SSD contents discarded; without this option @@ -71,6 +72,6 @@ - 
name: Set mount permissions ansible.builtin.file: - path: '{{ mountpoint | default("/mnt/" + raid_name) }}' + path: "{{ mountpoint }}" state: directory mode: "{{ mode }}" From fa3f3a6271fa4b7126f4b783fb34eae6cbb5456d Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 17 Oct 2024 17:10:14 -0500 Subject: [PATCH 101/102] Ensure local SSD solutions works upon reboot of Slurm nodes When the local SSD mountpoint has not been mounted use SystemD to create the RAID array and format it. This addresses the known behavior of the Slurm-GCP solution in which it does not re-run startup-scripts upon a power off/on (or reboot) cycle. During a typical power off/on cycle, the local SSD contents are discarded and the disks must be re-assembled and formatted. --- .../startup-script/files/setup-raid.yml | 41 ++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/modules/scripts/startup-script/files/setup-raid.yml b/modules/scripts/startup-script/files/setup-raid.yml index 585c6023f3..d7590069a8 100644 --- a/modules/scripts/startup-script/files/setup-raid.yml +++ b/modules/scripts/startup-script/files/setup-raid.yml @@ -41,23 +41,34 @@ name: mdadm state: present - - name: Force RAID array if only 1 local SSD - ansible.builtin.shell: mdadm --create {{ array_dev }} --name={{ raid_name }} --homehost=any --level=0 --raid-devices=1 /dev/disk/by-id/google-local-nvme-ssd-0 --force - args: - creates: "{{ array_dev }}" - when: local_ssd_devices.files | length == 1 + # this service will act during the play and upon reboots to ensure that local + # SSD volumes are always assembled into a RAID and re-formatted if necessary; + # there are many scenarios where a VM can be stopped or migrated during + # maintenance and the contents of local SSD will be discarded + - name: Install service to create local SSD RAID and format it + ansible.builtin.copy: + dest: /etc/systemd/system/create-localssd-raid.service + mode: 0644 + content: | + [Unit] + After=local-fs.target + Before=slurmd.service + ConditionPathIsMountPoint=!{{ mountpoint }} - - name: Create RAID array - ansible.builtin.shell: mdadm --create {{ array_dev }} --name={{ raid_name }} --homehost=any --level=0 --raid-devices={{ local_ssd_devices.files | length }} /dev/disk/by-id/google-local-nvme-ssd-* - args: - creates: "{{ array_dev }}" - when: local_ssd_devices.files | length >= 2 + [Service] + Type=oneshot + ExecStart=/usr/bin/bash -c "/usr/sbin/mdadm --create {{ array_dev }} --name={{ raid_name }} --homehost=any --level=0 --raid-devices={{ local_ssd_devices.files | length }} /dev/disk/by-id/google-local-nvme-ssd-*{{ " --force" if local_ssd_devices.files | length == 1 else "" }}" + ExecStartPost=/usr/sbin/mkfs -t {{ fstype }}{{ " -m 0" if fstype == "ext4" else "" }} {{ array_dev }} - - name: Format filesystem - community.general.filesystem: - fstype: "{{ fstype }}" - device: "{{ array_dev }}" - opts: '{{ "-m 0" if fstype == "ext4" else "" }}' + [Install] + WantedBy=slurmd.service + + - name: Create RAID array and format + ansible.builtin.systemd: + name: create-localssd-raid.service + state: started + enabled: true + daemon_reload: true - name: Mount RAID array ansible.posix.mount: From 06a71f424dba5e6944383411751869bbbacce8c1 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 23 Oct 2024 20:51:54 +0000 Subject: [PATCH 102/102] Update version number to v1.41.0 as part of release process --- cmd/root.go | 2 +- community/examples/tutorial-starccm-slurm.yaml | 2 +- community/modules/compute/htcondor-execute-point/versions.tf | 2 +- 
community/modules/compute/mig/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-node-group/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-partition/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf | 2 +- .../modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-partition/versions.tf | 2 +- .../modules/database/slurm-cloudsql-federation/versions.tf | 4 ++-- .../modules/file-system/cloud-storage-bucket/versions.tf | 2 +- community/modules/file-system/nfs-server/versions.tf | 2 +- community/modules/files/fsi-montecarlo-on-batch/versions.tf | 4 ++-- community/modules/network/private-service-access/versions.tf | 4 ++-- community/modules/project/service-enablement/versions.tf | 2 +- community/modules/pubsub/bigquery-sub/versions.tf | 4 ++-- community/modules/pubsub/topic/versions.tf | 2 +- community/modules/scheduler/htcondor-access-point/versions.tf | 2 +- .../modules/scheduler/htcondor-central-manager/versions.tf | 2 +- community/modules/scheduler/htcondor-pool-secrets/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v5-controller/versions.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v6-controller/versions.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf | 2 +- community/modules/scripts/wait-for-startup/versions.tf | 2 +- community/modules/scripts/windows-startup-script/versions.tf | 2 +- .../hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml | 2 +- modules/compute/gke-node-pool/versions.tf | 2 +- modules/compute/vm-instance/versions.tf | 4 ++-- modules/file-system/filestore/versions.tf | 4 ++-- modules/file-system/gke-persistent-volume/versions.tf | 2 +- modules/file-system/gke-storage/versions.tf | 2 +- modules/monitoring/dashboard/versions.tf | 2 +- modules/network/firewall-rules/versions.tf | 2 +- modules/network/pre-existing-subnetwork/versions.tf | 2 +- modules/network/pre-existing-vpc/versions.tf | 2 +- modules/scheduler/batch-login-node/versions.tf | 2 +- modules/scheduler/gke-cluster/versions.tf | 2 +- modules/scheduler/pre-existing-gke-cluster/versions.tf | 2 +- modules/scripts/startup-script/versions.tf | 2 +- 41 files changed, 47 insertions(+), 47 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index 03717b99d7..e58b8a743d 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -53,7 +53,7 @@ HPC deployments on the Google Cloud Platform.`, logging.Fatal("cmd.Help function failed: %s", err) } }, - Version: "v1.40.0", + Version: "v1.41.0", Annotations: annotation, } ) diff --git a/community/examples/tutorial-starccm-slurm.yaml b/community/examples/tutorial-starccm-slurm.yaml index b74eb44d33..9e64014ea7 100644 --- a/community/examples/tutorial-starccm-slurm.yaml +++ b/community/examples/tutorial-starccm-slurm.yaml @@ -16,7 +16,7 @@ blueprint_name: starccm-on-slurm toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit -toolkit_modules_version: v1.40.0 +toolkit_modules_version: v1.41.0 vars: project_id: ## Set GCP Project ID Here ## diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index efd3cab932..3f320827a1 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = 
"blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.41.0" } } diff --git a/community/modules/compute/mig/versions.tf b/community/modules/compute/mig/versions.tf index a6e80e8b0e..8e5b3caa45 100644 --- a/community/modules/compute/mig/versions.tf +++ b/community/modules/compute/mig/versions.tf @@ -22,6 +22,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:mig/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:mig/v1.41.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index 1b9cd77ff6..51f49882a1 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.41.0" } required_version = ">= 1.1" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf index 6265b12713..4f00828f19 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.41.0" } required_version = ">= 0.13.0" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf index 4e98f061c8..9e7273093a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.41.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf index 66b6296071..f519a18161 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.41.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index eef0010b85..242244c5f7 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.40.0" + module_name = 
"blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.41.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf index 25ec7739f0..17489d3f93 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.41.0" } } diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index 7c6a50bb46..1e92271e3a 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.41.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.41.0" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index e3b1236384..0a6664171a 100644 --- a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.41.0" } required_version = ">= 0.14.0" } diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index d58278f078..5251b527b0 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.41.0" } required_version = ">= 0.14.0" diff --git a/community/modules/files/fsi-montecarlo-on-batch/versions.tf b/community/modules/files/fsi-montecarlo-on-batch/versions.tf index 93e7b8b841..469e310bc0 100644 --- a/community/modules/files/fsi-montecarlo-on-batch/versions.tf +++ b/community/modules/files/fsi-montecarlo-on-batch/versions.tf @@ -35,9 +35,9 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.41.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.41.0" } } diff --git a/community/modules/network/private-service-access/versions.tf b/community/modules/network/private-service-access/versions.tf index 8b7f0fb043..efb0f8f2d1 100644 --- a/community/modules/network/private-service-access/versions.tf +++ 
b/community/modules/network/private-service-access/versions.tf @@ -30,11 +30,11 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.41.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.41.0" } required_version = ">= 1.2" diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index 25f653ee43..974520409d 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.41.0" } required_version = ">= 0.14.0" diff --git a/community/modules/pubsub/bigquery-sub/versions.tf b/community/modules/pubsub/bigquery-sub/versions.tf index 66bc2f104c..5597272dca 100644 --- a/community/modules/pubsub/bigquery-sub/versions.tf +++ b/community/modules/pubsub/bigquery-sub/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.41.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.41.0" } required_version = ">= 1.0" } diff --git a/community/modules/pubsub/topic/versions.tf b/community/modules/pubsub/topic/versions.tf index 40c5aedf9d..2a3e2fb59b 100644 --- a/community/modules/pubsub/topic/versions.tf +++ b/community/modules/pubsub/topic/versions.tf @@ -27,6 +27,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:topic/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:topic/v1.41.0" } } diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index cf4956236b..3d452c24bb 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.41.0" } required_version = ">= 1.1" diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index 8b3a1fc44c..432b506666 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.40.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.41.0" } required_version = ">= 1.1.0" diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 5ba656f88c..103fe43a30 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ 
b/community/modules/scheduler/htcondor-pool-secrets/versions.tf
@@ -26,7 +26,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.41.0"
   }
 
   required_version = ">= 1.3.0"
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf
index f6581de261..d9e1f9b600 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.41.0"
   }
   required_version = ">= 1.1"
 }
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf
index 91831924a1..c52321d462 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.41.0"
   }
   required_version = ">= 1.1"
 }
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf
index 3314b7828f..c1fc007bf0 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf
@@ -24,6 +24,6 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.41.0"
   }
 }
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf
index 37229bb041..dbcebd21c1 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf
@@ -24,6 +24,6 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.41.0"
   }
 }
diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf
index 3e9954b5ee..e60ec22c3c 100644
--- a/community/modules/scripts/wait-for-startup/versions.tf
+++ b/community/modules/scripts/wait-for-startup/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.41.0"
   }
 
   required_version = ">= 0.14.0"
diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf
index 777a8e68ca..1a2aa18a3b 100644
--- a/community/modules/scripts/windows-startup-script/versions.tf
+++ b/community/modules/scripts/windows-startup-script/versions.tf
@@ -16,7 +16,7 @@
 
 terraform {
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.41.0"
   }
 
   required_version = ">= 0.14.0"
diff --git a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml
index 0220352d35..813a90f0b6 100644
--- a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml
+++ b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml
@@ -16,7 +16,7 @@
 blueprint_name: hpc-cluster-hybrid-v5
 
 toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
-toolkit_modules_version: v1.40.0
+toolkit_modules_version: v1.41.0
 
 vars:
   project_id: ## <>
diff --git a/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf
index 2a27bfc342..0f4cb13c2f 100644
--- a/modules/compute/gke-node-pool/versions.tf
+++ b/modules/compute/gke-node-pool/versions.tf
@@ -30,6 +30,6 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.41.0"
   }
 }
diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf
index 228e58fe84..2d35e5c50e 100644
--- a/modules/compute/vm-instance/versions.tf
+++ b/modules/compute/vm-instance/versions.tf
@@ -31,10 +31,10 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.41.0"
   }
   provider_meta "google-beta" {
-    module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.41.0"
   }
 
   required_version = ">= 1.3.0"
diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf
index 593345e994..3454ca00c6 100644
--- a/modules/file-system/filestore/versions.tf
+++ b/modules/file-system/filestore/versions.tf
@@ -26,10 +26,10 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.41.0"
   }
   provider_meta "google-beta" {
-    module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.41.0"
   }
 
   required_version = ">= 0.14.0"
diff --git a/modules/file-system/gke-persistent-volume/versions.tf b/modules/file-system/gke-persistent-volume/versions.tf
index c0f5298369..b87efd8a16 100644
--- a/modules/file-system/gke-persistent-volume/versions.tf
+++ b/modules/file-system/gke-persistent-volume/versions.tf
@@ -29,6 +29,6 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.41.0"
   }
 }
diff --git a/modules/file-system/gke-storage/versions.tf b/modules/file-system/gke-storage/versions.tf
index 78d62b235d..27f82792ab 100644
--- a/modules/file-system/gke-storage/versions.tf
+++ b/modules/file-system/gke-storage/versions.tf
@@ -16,6 +16,6 @@ terraform {
   required_version = ">= 1.0"
 
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.41.0"
   }
 }
diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf
index 1db6bd5151..dbf59fa86f 100644
--- a/modules/monitoring/dashboard/versions.tf
+++ b/modules/monitoring/dashboard/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.41.0"
   }
 
   required_version = ">= 0.14.0"
diff --git a/modules/network/firewall-rules/versions.tf b/modules/network/firewall-rules/versions.tf
index 2daef71bf4..5312b04355 100644
--- a/modules/network/firewall-rules/versions.tf
+++ b/modules/network/firewall-rules/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.41.0"
   }
 
   required_version = ">= 1.3"
diff --git a/modules/network/pre-existing-subnetwork/versions.tf b/modules/network/pre-existing-subnetwork/versions.tf
index 8d9b9f0578..7a38f30404 100644
--- a/modules/network/pre-existing-subnetwork/versions.tf
+++ b/modules/network/pre-existing-subnetwork/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.41.0"
   }
 
   required_version = ">= 0.14.0"
diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf
index 2794d4d0b0..c9f1ec5992 100644
--- a/modules/network/pre-existing-vpc/versions.tf
+++ b/modules/network/pre-existing-vpc/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.41.0"
   }
 
   required_version = ">= 0.14.0"
diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf
index b58cb8fb08..599294a84e 100644
--- a/modules/scheduler/batch-login-node/versions.tf
+++ b/modules/scheduler/batch-login-node/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.41.0"
   }
 
   required_version = ">= 0.14.0"
diff --git a/modules/scheduler/gke-cluster/versions.tf b/modules/scheduler/gke-cluster/versions.tf
index ad17fe1c43..67c30a9e84 100644
--- a/modules/scheduler/gke-cluster/versions.tf
+++ b/modules/scheduler/gke-cluster/versions.tf
@@ -34,6 +34,6 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.41.0"
   }
 }
diff --git a/modules/scheduler/pre-existing-gke-cluster/versions.tf b/modules/scheduler/pre-existing-gke-cluster/versions.tf
index 3ad8745340..328bdda8e1 100644
--- a/modules/scheduler/pre-existing-gke-cluster/versions.tf
+++ b/modules/scheduler/pre-existing-gke-cluster/versions.tf
@@ -23,7 +23,7 @@ terraform {
   }
 
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.41.0"
   }
 
   required_version = ">= 1.3"
diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf
index 0d44be1243..c954c7e6fa 100644
--- a/modules/scripts/startup-script/versions.tf
+++ b/modules/scripts/startup-script/versions.tf
@@ -30,7 +30,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.40.0"
+    module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.41.0"
   }
 
   required_version = ">= 1.3"