From 3265a94032d745ca65e03bd125e91ccc55d5ad6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Legrand?= Date: Wed, 3 Jul 2024 12:27:03 +0200 Subject: [PATCH 1/7] Adding TPU limits for GKE cluster node auto-provisioning (NAP) --- modules/gke-cluster-standard/README.md | 42 +++++++++++------------ modules/gke-cluster-standard/main.tf | 13 +++++++ modules/gke-cluster-standard/variables.tf | 5 +++ 3 files changed, 39 insertions(+), 21 deletions(-) diff --git a/modules/gke-cluster-standard/README.md b/modules/gke-cluster-standard/README.md index 40a6cdc6f0..27cb36a41c 100644 --- a/modules/gke-cluster-standard/README.md +++ b/modules/gke-cluster-standard/README.md @@ -310,28 +310,28 @@ module "cluster-1" { | name | description | type | required | default | |---|---|:---:|:---:|:---:| -| [location](variables.tf#L237) | Cluster zone or region. | string | ✓ | | -| [name](variables.tf#L371) | Cluster name. | string | ✓ | | -| [project_id](variables.tf#L410) | Cluster project id. | string | ✓ | | -| [vpc_config](variables.tf#L421) | VPC-level configuration. | object({…}) | ✓ | | +| [location](variables.tf#L242) | Cluster zone or region. | string | ✓ | | +| [name](variables.tf#L376) | Cluster name. | string | ✓ | | +| [project_id](variables.tf#L415) | Cluster project id. | string | ✓ | | +| [vpc_config](variables.tf#L426) | VPC-level configuration. | object({…}) | ✓ | | | [backup_configs](variables.tf#L17) | Configuration for Backup for GKE. | object({…}) | | {} | -| [cluster_autoscaling](variables.tf#L39) | Enable and configure limits for Node Auto-Provisioning with Cluster Autoscaler. | object({…}) | | null | -| [default_nodepool](variables.tf#L118) | Enable default nodepool. | object({…}) | | {} | -| [deletion_protection](variables.tf#L136) | Whether or not to allow Terraform to destroy the cluster. Unless this field is set to false in Terraform state, a terraform destroy or terraform apply that would delete the cluster will fail. 
| bool | | true | -| [description](variables.tf#L143) | Cluster description. | string | | null | -| [enable_addons](variables.tf#L149) | Addons enabled in the cluster (true means enabled). | object({…}) | | {…} | -| [enable_features](variables.tf#L174) | Enable cluster-level features. Certain features allow configuration. | object({…}) | | {…} | -| [issue_client_certificate](variables.tf#L224) | Enable issuing client certificate. | bool | | false | -| [labels](variables.tf#L230) | Cluster resource labels. | map(string) | | {} | -| [logging_config](variables.tf#L242) | Logging configuration. | object({…}) | | {} | -| [maintenance_config](variables.tf#L263) | Maintenance window configuration. | object({…}) | | {…} | -| [max_pods_per_node](variables.tf#L286) | Maximum number of pods per node in this cluster. | number | | 110 | -| [min_master_version](variables.tf#L292) | Minimum version of the master, defaults to the version of the most recent official release. | string | | null | -| [monitoring_config](variables.tf#L298) | Monitoring configuration. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} | -| [node_config](variables.tf#L376) | Node-level configuration. | object({…}) | | {} | -| [node_locations](variables.tf#L389) | Zones in which the cluster's nodes are located. | list(string) | | [] | -| [private_cluster_config](variables.tf#L396) | Private cluster configuration. | object({…}) | | null | -| [release_channel](variables.tf#L415) | Release channel for GKE upgrades. | string | | null | +| [cluster_autoscaling](variables.tf#L39) | Enable and configure limits for Node Auto-Provisioning with Cluster Autoscaler. | object({…}) | | null | +| [default_nodepool](variables.tf#L123) | Enable default nodepool. | object({…}) | | {} | +| [deletion_protection](variables.tf#L141) | Whether or not to allow Terraform to destroy the cluster. 
Unless this field is set to false in Terraform state, a terraform destroy or terraform apply that would delete the cluster will fail. | bool | | true | +| [description](variables.tf#L148) | Cluster description. | string | | null | +| [enable_addons](variables.tf#L154) | Addons enabled in the cluster (true means enabled). | object({…}) | | {…} | +| [enable_features](variables.tf#L179) | Enable cluster-level features. Certain features allow configuration. | object({…}) | | {…} | +| [issue_client_certificate](variables.tf#L229) | Enable issuing client certificate. | bool | | false | +| [labels](variables.tf#L235) | Cluster resource labels. | map(string) | | {} | +| [logging_config](variables.tf#L247) | Logging configuration. | object({…}) | | {} | +| [maintenance_config](variables.tf#L268) | Maintenance window configuration. | object({…}) | | {…} | +| [max_pods_per_node](variables.tf#L291) | Maximum number of pods per node in this cluster. | number | | 110 | +| [min_master_version](variables.tf#L297) | Minimum version of the master, defaults to the version of the most recent official release. | string | | null | +| [monitoring_config](variables.tf#L303) | Monitoring configuration. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} | +| [node_config](variables.tf#L381) | Node-level configuration. | object({…}) | | {} | +| [node_locations](variables.tf#L394) | Zones in which the cluster's nodes are located. | list(string) | | [] | +| [private_cluster_config](variables.tf#L401) | Private cluster configuration. | object({…}) | | null | +| [release_channel](variables.tf#L420) | Release channel for GKE upgrades. 
| string | | null | ## Outputs diff --git a/modules/gke-cluster-standard/main.tf b/modules/gke-cluster-standard/main.tf index 8cea6cebc7..bbd4a1d993 100644 --- a/modules/gke-cluster-standard/main.tf +++ b/modules/gke-cluster-standard/main.tf @@ -233,6 +233,19 @@ resource "google_container_cluster" "cluster" { maximum = gpu_resources.value.max } } + dynamic "resource_limits" { + for_each = ( + try(local.cas.tpu_resources, null) == null + ? [] + : local.cas.tpu_resources + ) + iterator = tpu_resources + content { + resource_type = tpu_resources.value.resource_type + minimum = tpu_resources.value.min + maximum = tpu_resources.value.max + } + } } } dynamic "database_encryption" { diff --git a/modules/gke-cluster-standard/variables.tf b/modules/gke-cluster-standard/variables.tf index 63e16df12f..aa85b04f55 100644 --- a/modules/gke-cluster-standard/variables.tf +++ b/modules/gke-cluster-standard/variables.tf @@ -85,6 +85,11 @@ variable "cluster_autoscaling" { min = number max = number }))) + tpu_resources = optional(list(object({ + resource_type = string + min = number + max = number + }))) }) default = null validation { From 995c796307f82f61c26c22f3fa2d3ca9d05aa13a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Legrand?= Date: Tue, 9 Jul 2024 00:10:25 +0200 Subject: [PATCH 2/7] rework of the cluster autoscaling configuration --- modules/gke-cluster-standard/README.md | 37 +++++++++++++++++++++++ modules/gke-cluster-standard/main.tf | 25 ++++----------- modules/gke-cluster-standard/variables.tf | 13 +++----- 3 files changed, 47 insertions(+), 28 deletions(-) diff --git a/modules/gke-cluster-standard/README.md b/modules/gke-cluster-standard/README.md index 27cb36a41c..2210c3ecb6 100644 --- a/modules/gke-cluster-standard/README.md +++ b/modules/gke-cluster-standard/README.md @@ -305,6 +305,43 @@ module "cluster-1" { } # tftest modules=1 resources=1 ``` + +### Node auto-provisioning with GPUs and TPUs + +You can use the `var.cluster_autoscaling` block to configure 
node auto-provisioning for the GKE cluster. The example below configures limits for CPU, memory, GPUs and TPUs. + +```hcl +module "cluster-1" { + source = "./fabric/modules/gke-cluster-standard" + project_id = var.project_id + name = "cluster-1" + location = "europe-west1-b" + vpc_config = { + network = var.vpc.self_link + subnetwork = var.subnet.self_link + secondary_range_blocks = {} + } + cluster_autoscaling = { + cpu_limits = { + max = 48 + } + mem_limits = { + max = 182 + } + # Can be GPUs or TPUs + accelerator_resources = [{ + resource_type = "nvidia-l4" + max = 2 + }, + { + resource_type = "tpu-v5-lite-podslice" + max = 2 + } + ] + } +} +# tftest modules=1 resources=1 +``` ## Variables diff --git a/modules/gke-cluster-standard/main.tf b/modules/gke-cluster-standard/main.tf index bbd4a1d993..af836c542a 100644 --- a/modules/gke-cluster-standard/main.tf +++ b/modules/gke-cluster-standard/main.tf @@ -222,28 +222,15 @@ resource "google_container_cluster" "cluster" { } dynamic "resource_limits" { for_each = ( - try(local.cas.gpu_resources, null) == null + try(local.cas.accelerator_resources, null) == null ? [] - : local.cas.gpu_resources + : local.cas.accelerator_resources ) - iterator = gpu_resources + iterator = accelerator_resources content { - resource_type = gpu_resources.value.resource_type - minimum = gpu_resources.value.min - maximum = gpu_resources.value.max - } - } - dynamic "resource_limits" { - for_each = ( - try(local.cas.tpu_resources, null) == null - ? 
[] - : local.cas.tpu_resources - ) - iterator = tpu_resources - content { - resource_type = tpu_resources.value.resource_type - minimum = tpu_resources.value.min - maximum = tpu_resources.value.max + resource_type = accelerator_resources.value.resource_type + minimum = accelerator_resources.value.min + maximum = accelerator_resources.value.max } } } diff --git a/modules/gke-cluster-standard/variables.tf b/modules/gke-cluster-standard/variables.tf index aa85b04f55..8dbf810141 100644 --- a/modules/gke-cluster-standard/variables.tf +++ b/modules/gke-cluster-standard/variables.tf @@ -73,21 +73,16 @@ variable "cluster_autoscaling" { # add validation rule to ensure only one is present if upgrade settings is defined })) cpu_limits = optional(object({ - min = number + min = optional(number, 0) max = number })) mem_limits = optional(object({ - min = number + min = optional(number, 0) max = number })) - gpu_resources = optional(list(object({ + accelerator_resources = optional(list(object({ resource_type = string - min = number - max = number - }))) - tpu_resources = optional(list(object({ - resource_type = string - min = number + min = optional(number, 0) max = number }))) }) From 130181bae6d9b918b56d4cca0eb4ed03efbad13a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Legrand?= Date: Tue, 9 Jul 2024 00:12:09 +0200 Subject: [PATCH 3/7] updated README --- modules/gke-cluster-standard/README.md | 43 +++++++++++++------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/modules/gke-cluster-standard/README.md b/modules/gke-cluster-standard/README.md index 2210c3ecb6..b4308bb696 100644 --- a/modules/gke-cluster-standard/README.md +++ b/modules/gke-cluster-standard/README.md @@ -15,6 +15,7 @@ This module offers a way to create and manage Google Kubernetes Engine (GKE) [St - [Cloud DNS](#cloud-dns) - [Backup for GKE](#backup-for-gke) - [Automatic creation of new secondary ranges](#automatic-creation-of-new-secondary-ranges) + - [Node auto-provisioning 
with GPUs and TPUs](#node-auto-provisioning-with-gpus-and-tpus) - [Variables](#variables) - [Outputs](#outputs) @@ -347,28 +348,28 @@ module "cluster-1" { | name | description | type | required | default | |---|---|:---:|:---:|:---:| -| [location](variables.tf#L242) | Cluster zone or region. | string | ✓ | | -| [name](variables.tf#L376) | Cluster name. | string | ✓ | | -| [project_id](variables.tf#L415) | Cluster project id. | string | ✓ | | -| [vpc_config](variables.tf#L426) | VPC-level configuration. | object({…}) | ✓ | | +| [location](variables.tf#L237) | Cluster zone or region. | string | ✓ | | +| [name](variables.tf#L371) | Cluster name. | string | ✓ | | +| [project_id](variables.tf#L410) | Cluster project id. | string | ✓ | | +| [vpc_config](variables.tf#L421) | VPC-level configuration. | object({…}) | ✓ | | | [backup_configs](variables.tf#L17) | Configuration for Backup for GKE. | object({…}) | | {} | -| [cluster_autoscaling](variables.tf#L39) | Enable and configure limits for Node Auto-Provisioning with Cluster Autoscaler. | object({…}) | | null | -| [default_nodepool](variables.tf#L123) | Enable default nodepool. | object({…}) | | {} | -| [deletion_protection](variables.tf#L141) | Whether or not to allow Terraform to destroy the cluster. Unless this field is set to false in Terraform state, a terraform destroy or terraform apply that would delete the cluster will fail. | bool | | true | -| [description](variables.tf#L148) | Cluster description. | string | | null | -| [enable_addons](variables.tf#L154) | Addons enabled in the cluster (true means enabled). | object({…}) | | {…} | -| [enable_features](variables.tf#L179) | Enable cluster-level features. Certain features allow configuration. | object({…}) | | {…} | -| [issue_client_certificate](variables.tf#L229) | Enable issuing client certificate. | bool | | false | -| [labels](variables.tf#L235) | Cluster resource labels. | map(string) | | {} | -| [logging_config](variables.tf#L247) | Logging configuration. 
| object({…}) | | {} | -| [maintenance_config](variables.tf#L268) | Maintenance window configuration. | object({…}) | | {…} | -| [max_pods_per_node](variables.tf#L291) | Maximum number of pods per node in this cluster. | number | | 110 | -| [min_master_version](variables.tf#L297) | Minimum version of the master, defaults to the version of the most recent official release. | string | | null | -| [monitoring_config](variables.tf#L303) | Monitoring configuration. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} | -| [node_config](variables.tf#L381) | Node-level configuration. | object({…}) | | {} | -| [node_locations](variables.tf#L394) | Zones in which the cluster's nodes are located. | list(string) | | [] | -| [private_cluster_config](variables.tf#L401) | Private cluster configuration. | object({…}) | | null | -| [release_channel](variables.tf#L420) | Release channel for GKE upgrades. | string | | null | +| [cluster_autoscaling](variables.tf#L39) | Enable and configure limits for Node Auto-Provisioning with Cluster Autoscaler. | object({…}) | | null | +| [default_nodepool](variables.tf#L118) | Enable default nodepool. | object({…}) | | {} | +| [deletion_protection](variables.tf#L136) | Whether or not to allow Terraform to destroy the cluster. Unless this field is set to false in Terraform state, a terraform destroy or terraform apply that would delete the cluster will fail. | bool | | true | +| [description](variables.tf#L143) | Cluster description. | string | | null | +| [enable_addons](variables.tf#L149) | Addons enabled in the cluster (true means enabled). | object({…}) | | {…} | +| [enable_features](variables.tf#L174) | Enable cluster-level features. Certain features allow configuration. | object({…}) | | {…} | +| [issue_client_certificate](variables.tf#L224) | Enable issuing client certificate. | bool | | false | +| [labels](variables.tf#L230) | Cluster resource labels. 
| map(string) | | {} | +| [logging_config](variables.tf#L242) | Logging configuration. | object({…}) | | {} | +| [maintenance_config](variables.tf#L263) | Maintenance window configuration. | object({…}) | | {…} | +| [max_pods_per_node](variables.tf#L286) | Maximum number of pods per node in this cluster. | number | | 110 | +| [min_master_version](variables.tf#L292) | Minimum version of the master, defaults to the version of the most recent official release. | string | | null | +| [monitoring_config](variables.tf#L298) | Monitoring configuration. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} | +| [node_config](variables.tf#L376) | Node-level configuration. | object({…}) | | {} | +| [node_locations](variables.tf#L389) | Zones in which the cluster's nodes are located. | list(string) | | [] | +| [private_cluster_config](variables.tf#L396) | Private cluster configuration. | object({…}) | | null | +| [release_channel](variables.tf#L415) | Release channel for GKE upgrades. | string | | null | ## Outputs From 1f9fb15ab6f37fa385a9e6127f4a68c38de8172b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Legrand?= Date: Tue, 9 Jul 2024 00:23:02 +0200 Subject: [PATCH 4/7] adding queued_provisioning (DWS) attribute --- modules/gke-nodepool/README.md | 16 ++++++++-------- modules/gke-nodepool/main.tf | 7 +++++++ modules/gke-nodepool/variables.tf | 1 + 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/modules/gke-nodepool/README.md b/modules/gke-nodepool/README.md index 453fff8d62..983c8a5041 100644 --- a/modules/gke-nodepool/README.md +++ b/modules/gke-nodepool/README.md @@ -143,7 +143,7 @@ module "cluster-1-nodepool-gpu-1" { |---|---|:---:|:---:|:---:| | [cluster_name](variables.tf#L23) | Cluster name. | string | ✓ | | | [location](variables.tf#L48) | Cluster location. | string | ✓ | | -| [project_id](variables.tf#L181) | Cluster project id. | string | ✓ | | +| [project_id](variables.tf#L182) | Cluster project id. 
| string | ✓ | | | [cluster_id](variables.tf#L17) | Cluster id. Optional, but providing cluster_id is recommended to prevent cluster misconfiguration in some of the edge cases. | string | | null | | [gke_version](variables.tf#L28) | Kubernetes nodes version. Ignored if auto_upgrade is set in management_config. | string | | null | | [k8s_labels](variables.tf#L34) | Kubernetes labels applied to each node. | map(string) | | {} | @@ -153,13 +153,13 @@ module "cluster-1-nodepool-gpu-1" { | [node_config](variables.tf#L65) | Node-level configuration. | object({…}) | | {…} | | [node_count](variables.tf#L124) | Number of nodes per instance group. Initial value can only be changed by recreation, current is ignored when autoscaling is used. | object({…}) | | {…} | | [node_locations](variables.tf#L136) | Node locations. | list(string) | | null | -| [nodepool_config](variables.tf#L142) | Nodepool-level configuration. | object({…}) | | null | -| [pod_range](variables.tf#L168) | Pod secondary range configuration. | object({…}) | | null | -| [reservation_affinity](variables.tf#L186) | Configuration of the desired reservation which instances could take capacity from. | object({…}) | | null | -| [service_account](variables.tf#L196) | Nodepool service account. If this variable is set to null, the default GCE service account will be used. If set and email is null, a service account will be created. If scopes are null a default will be used. | object({…}) | | {} | -| [sole_tenant_nodegroup](variables.tf#L207) | Sole tenant node group. | string | | null | -| [tags](variables.tf#L213) | Network tags applied to nodes. | list(string) | | null | -| [taints](variables.tf#L219) | Kubernetes taints applied to all nodes. | map(object({…})) | | {} | +| [nodepool_config](variables.tf#L142) | Nodepool-level configuration. | object({…}) | | null | +| [pod_range](variables.tf#L169) | Pod secondary range configuration. 
| object({…}) | | null | +| [reservation_affinity](variables.tf#L187) | Configuration of the desired reservation which instances could take capacity from. | object({…}) | | null | +| [service_account](variables.tf#L197) | Nodepool service account. If this variable is set to null, the default GCE service account will be used. If set and email is null, a service account will be created. If scopes are null a default will be used. | object({…}) | | {} | +| [sole_tenant_nodegroup](variables.tf#L208) | Sole tenant node group. | string | | null | +| [tags](variables.tf#L214) | Network tags applied to nodes. | list(string) | | null | +| [taints](variables.tf#L220) | Kubernetes taints applied to all nodes. | map(object({…})) | | {} | ## Outputs diff --git a/modules/gke-nodepool/main.tf b/modules/gke-nodepool/main.tf index b94ef697c8..61a11ad283 100644 --- a/modules/gke-nodepool/main.tf +++ b/modules/gke-nodepool/main.tf @@ -137,6 +137,13 @@ resource "google_container_node_pool" "nodepool" { } } + dynamic "queued_provisioning" { + for_each = try(var.nodepool_config.queued_provisioning, null) != null ? 
[""] : [] + content { + enabled = var.nodepool_config.queued_provisioning + } + } + node_config { boot_disk_kms_key = var.node_config.boot_disk_kms_key disk_size_gb = var.node_config.disk_size_gb diff --git a/modules/gke-nodepool/variables.tf b/modules/gke-nodepool/variables.tf index c970c5b1bd..bfbddfe4d9 100644 --- a/modules/gke-nodepool/variables.tf +++ b/modules/gke-nodepool/variables.tf @@ -157,6 +157,7 @@ variable "nodepool_config" { policy_name = optional(string) tpu_topology = optional(string) })) + queued_provisioning = optional(bool) upgrade_settings = optional(object({ max_surge = number max_unavailable = number From 5158683e411f15ab28faff8403b372603e970bbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Legrand?= Date: Tue, 9 Jul 2024 19:15:09 +0200 Subject: [PATCH 5/7] Adding support for DWS for GKE nodepools --- modules/gke-nodepool/README.md | 50 ++++++++++++++++++++++++++++++- modules/gke-nodepool/main.tf | 2 +- modules/gke-nodepool/variables.tf | 2 +- 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/modules/gke-nodepool/README.md b/modules/gke-nodepool/README.md index 983c8a5041..6290c241eb 100644 --- a/modules/gke-nodepool/README.md +++ b/modules/gke-nodepool/README.md @@ -136,6 +136,54 @@ module "cluster-1-nodepool-gpu-1" { } # tftest modules=1 resources=2 inventory=guest-accelerator.yaml ``` + +### Dynamic Workload Scheduler (DWS) & node pool configuration +This example uses Dynamic Workload Scheduler (DWS) to configure a GPU nodepool. 
+ +```hcl +module "cluster-1-nodepool-gpu-1" { + source = "./fabric/modules/gke-nodepool" + project_id = "myproject" + cluster_name = "cluster-1" + location = "europe-west4-a" + name = "nodepool-gpu-1" + k8s_labels = { environment = "dev" } + service_account = { + create = true + email = "nodepool-gpu-1" # optional + oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"] + } + nnode_config = { + machine_type = "g2-standard-4" + disk_size_gb = 50 + disk_type = "pd-ssd" + ephemeral_ssd_count = 1 + gvnic = true + spot = true + guest_accelerator = { + type = "nvidia-l4" + count = 1 + gpu_driver = { + version = "LATEST" + } + } + } + nodepool_config = { + autoscaling = { + max_node_count = 10 + min_node_count = 0 + } + queued_provisioning = true + } + node_count = { + initial = 0 + } + reservation_affinity = { + consume_reservation_type = "NO_RESERVATION" + } +} +# tftest modules=1 resources=2 inventory=guest-accelerator.yaml +``` ## Variables @@ -153,7 +201,7 @@ module "cluster-1-nodepool-gpu-1" { | [node_config](variables.tf#L65) | Node-level configuration. | object({…}) | | {…} | | [node_count](variables.tf#L124) | Number of nodes per instance group. Initial value can only be changed by recreation, current is ignored when autoscaling is used. | object({…}) | | {…} | | [node_locations](variables.tf#L136) | Node locations. | list(string) | | null | -| [nodepool_config](variables.tf#L142) | Nodepool-level configuration. | object({…}) | | null | +| [nodepool_config](variables.tf#L142) | Nodepool-level configuration. | object({…}) | | null | | [pod_range](variables.tf#L169) | Pod secondary range configuration. | object({…}) | | null | | [reservation_affinity](variables.tf#L187) | Configuration of the desired reservation which instances could take capacity from. | object({…}) | | null | | [service_account](variables.tf#L197) | Nodepool service account. If this variable is set to null, the default GCE service account will be used. 
If set and email is null, a service account will be created. If scopes are null a default will be used. | object({…}) | | {} | diff --git a/modules/gke-nodepool/main.tf b/modules/gke-nodepool/main.tf index 61a11ad283..5544a60066 100644 --- a/modules/gke-nodepool/main.tf +++ b/modules/gke-nodepool/main.tf @@ -138,7 +138,7 @@ resource "google_container_node_pool" "nodepool" { } dynamic "queued_provisioning" { - for_each = try(var.nodepool_config.queued_provisioning, null) != null ? [""] : [] + for_each = try(var.nodepool_config.queued_provisioning, false) ? [""] : [] content { enabled = var.nodepool_config.queued_provisioning } diff --git a/modules/gke-nodepool/variables.tf b/modules/gke-nodepool/variables.tf index bfbddfe4d9..1796674593 100644 --- a/modules/gke-nodepool/variables.tf +++ b/modules/gke-nodepool/variables.tf @@ -157,7 +157,7 @@ variable "nodepool_config" { policy_name = optional(string) tpu_topology = optional(string) })) - queued_provisioning = optional(bool) + queued_provisioning = optional(bool, false) upgrade_settings = optional(object({ max_surge = number max_unavailable = number From 3b6c5237e9c15eb1dd5349ddf20c347a1add5104 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Legrand?= Date: Wed, 10 Jul 2024 14:55:00 +0200 Subject: [PATCH 6/7] typo --- modules/gke-nodepool/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/gke-nodepool/README.md b/modules/gke-nodepool/README.md index 6290c241eb..2d1392a713 100644 --- a/modules/gke-nodepool/README.md +++ b/modules/gke-nodepool/README.md @@ -153,7 +153,7 @@ module "cluster-1-nodepool-gpu-1" { email = "nodepool-gpu-1" # optional oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"] } - nnode_config = { + node_config = { machine_type = "g2-standard-4" disk_size_gb = 50 disk_type = "pd-ssd" From b83e7fdc0a9cb92f77878c35d9a9005ed4128f6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Legrand?= Date: Wed, 10 Jul 2024 14:58:45 +0200 Subject: 
[PATCH 7/7] adding test for DWS --- modules/gke-nodepool/README.md | 6 +-- tests/modules/gke_nodepool/examples/dws.yaml | 39 ++++++++++++++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 tests/modules/gke_nodepool/examples/dws.yaml diff --git a/modules/gke-nodepool/README.md b/modules/gke-nodepool/README.md index 2d1392a713..002bd832e7 100644 --- a/modules/gke-nodepool/README.md +++ b/modules/gke-nodepool/README.md @@ -141,12 +141,12 @@ module "cluster-1-nodepool-gpu-1" { This example uses Dynamic Workload Scheduler (DWS) to configure a GPU nodepool. ```hcl -module "cluster-1-nodepool-gpu-1" { +module "cluster-1-nodepool-dws" { source = "./fabric/modules/gke-nodepool" project_id = "myproject" cluster_name = "cluster-1" location = "europe-west4-a" - name = "nodepool-gpu-1" + name = "nodepool-dws" k8s_labels = { environment = "dev" } service_account = { create = true @@ -182,7 +182,7 @@ module "cluster-1-nodepool-gpu-1" { consume_reservation_type = "NO_RESERVATION" } } -# tftest modules=1 resources=2 inventory=guest-accelerator.yaml +# tftest modules=1 resources=2 inventory=dws.yaml ``` ## Variables diff --git a/tests/modules/gke_nodepool/examples/dws.yaml b/tests/modules/gke_nodepool/examples/dws.yaml new file mode 100644 index 0000000000..59f0ca4870 --- /dev/null +++ b/tests/modules/gke_nodepool/examples/dws.yaml @@ -0,0 +1,39 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +values: + module.cluster-1-nodepool-dws.google_container_node_pool.nodepool: + cluster: cluster-1 + location: europe-west4-a + name: nodepool-dws + node_config: + - boot_disk_kms_key: null + disk_size_gb: 50 + disk_type: pd-ssd + ephemeral_storage_config: + - local_ssd_count: 1 + ephemeral_storage_local_ssd_config: [] + guest_accelerator: + - count: 1 + gpu_driver_installation_config: + - gpu_driver_version: LATEST + gpu_partition_size: null + gpu_sharing_config: null + type: nvidia-l4 + gvnic: [] + machine_type: g2-standard-4 + project: myproject + +counts: + google_container_node_pool: 1