diff --git a/modules/gke-cluster-standard/README.md b/modules/gke-cluster-standard/README.md index 2419567950..53b57e8f80 100644 --- a/modules/gke-cluster-standard/README.md +++ b/modules/gke-cluster-standard/README.md @@ -310,28 +310,28 @@ module "cluster-1" { | name | description | type | required | default | |---|---|:---:|:---:|:---:| -| [location](variables.tf#L153) | Cluster zone or region. | string | ✓ | | -| [name](variables.tf#L264) | Cluster name. | string | ✓ | | -| [project_id](variables.tf#L290) | Cluster project id. | string | ✓ | | -| [vpc_config](variables.tf#L313) | VPC-level configuration. | object({…}) | ✓ | | +| [location](variables.tf#L154) | Cluster zone or region. | string | ✓ | | +| [name](variables.tf#L265) | Cluster name. | string | ✓ | | +| [project_id](variables.tf#L291) | Cluster project id. | string | ✓ | | +| [vpc_config](variables.tf#L314) | VPC-level configuration. | object({…}) | ✓ | | | [backup_configs](variables.tf#L17) | Configuration for Backup for GKE. | object({…}) | | {} | | [cluster_autoscaling](variables.tf#L38) | Enable and configure limits for Node Auto-Provisioning with Cluster Autoscaler. | object({…}) | | null | | [deletion_protection](variables.tf#L59) | Whether or not to allow Terraform to destroy the cluster. Unless this field is set to false in Terraform state, a terraform destroy or terraform apply that would delete the cluster will fail. | bool | | true | | [description](variables.tf#L66) | Cluster description. | string | | null | -| [enable_addons](variables.tf#L72) | Addons enabled in the cluster (true means enabled). | object({…}) | | {…} | -| [enable_features](variables.tf#L95) | Enable cluster-level features. Certain features allow configuration. | object({…}) | | {…} | -| [issue_client_certificate](variables.tf#L141) | Enable issuing client certificate. | bool | | false | -| [labels](variables.tf#L147) | Cluster resource labels. 
| map(string) | | null | -| [logging_config](variables.tf#L158) | Logging configuration. | object({…}) | | {} | -| [maintenance_config](variables.tf#L179) | Maintenance window configuration. | object({…}) | | {…} | -| [max_pods_per_node](variables.tf#L202) | Maximum number of pods per node in this cluster. | number | | 110 | -| [min_master_version](variables.tf#L208) | Minimum version of the master, defaults to the version of the most recent official release. | string | | null | -| [monitoring_config](variables.tf#L214) | Monitoring configuration. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} | -| [node_locations](variables.tf#L269) | Zones in which the cluster's nodes are located. | list(string) | | [] | -| [private_cluster_config](variables.tf#L276) | Private cluster configuration. | object({…}) | | null | -| [release_channel](variables.tf#L295) | Release channel for GKE upgrades. | string | | null | -| [service_account](variables.tf#L301) | Service account used for the default node pool, only useful if the default GCE service account has been disabled. | string | | null | -| [tags](variables.tf#L307) | Network tags applied to nodes. | list(string) | | null | +| [enable_addons](variables.tf#L72) | Addons enabled in the cluster (true means enabled). | object({…}) | | {…} | +| [enable_features](variables.tf#L96) | Enable cluster-level features. Certain features allow configuration. | object({…}) | | {…} | +| [issue_client_certificate](variables.tf#L142) | Enable issuing client certificate. | bool | | false | +| [labels](variables.tf#L148) | Cluster resource labels. | map(string) | | null | +| [logging_config](variables.tf#L159) | Logging configuration. | object({…}) | | {} | +| [maintenance_config](variables.tf#L180) | Maintenance window configuration. | object({…}) | | {…} | +| [max_pods_per_node](variables.tf#L203) | Maximum number of pods per node in this cluster. 
| number | | 110 | +| [min_master_version](variables.tf#L209) | Minimum version of the master, defaults to the version of the most recent official release. | string | | null | +| [monitoring_config](variables.tf#L215) | Monitoring configuration. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} | +| [node_locations](variables.tf#L270) | Zones in which the cluster's nodes are located. | list(string) | | [] | +| [private_cluster_config](variables.tf#L277) | Private cluster configuration. | object({…}) | | null | +| [release_channel](variables.tf#L296) | Release channel for GKE upgrades. | string | | null | +| [service_account](variables.tf#L302) | Service account used for the default node pool, only useful if the default GCE service account has been disabled. | string | | null | +| [tags](variables.tf#L308) | Network tags applied to nodes. | list(string) | | null | ## Outputs diff --git a/modules/gke-cluster-standard/main.tf b/modules/gke-cluster-standard/main.tf index 90115501de..f5d8fe7511 100644 --- a/modules/gke-cluster-standard/main.tf +++ b/modules/gke-cluster-standard/main.tf @@ -83,6 +83,9 @@ resource "google_container_cluster" "cluster" { gcp_filestore_csi_driver_config { enabled = var.enable_addons.gcp_filestore_csi_driver } + gcs_fuse_csi_driver_config { + enabled = var.enable_addons.gcs_fuse_csi_driver + } kalm_config { enabled = var.enable_addons.kalm } diff --git a/modules/gke-cluster-standard/variables.tf b/modules/gke-cluster-standard/variables.tf index c88bb773e9..221f6b8a55 100644 --- a/modules/gke-cluster-standard/variables.tf +++ b/modules/gke-cluster-standard/variables.tf @@ -77,6 +77,7 @@ variable "enable_addons" { dns_cache = optional(bool, false) gce_persistent_disk_csi_driver = optional(bool, false) gcp_filestore_csi_driver = optional(bool, false) + gcs_fuse_csi_driver = optional(bool, false) horizontal_pod_autoscaling = optional(bool, false) http_load_balancing = optional(bool, false) istio = 
optional(object({ diff --git a/modules/gke-nodepool/README.md b/modules/gke-nodepool/README.md index 9e70b8e19d..e7d2191132 100644 --- a/modules/gke-nodepool/README.md +++ b/modules/gke-nodepool/README.md @@ -103,6 +103,39 @@ module "cluster-1-nodepool-1" { } # tftest modules=1 resources=2 inventory=config.yaml ``` +### GPU Node & node pool configuration + +```hcl +module "cluster-1-nodepool-gpu-1" { + source = "./fabric/modules/gke-nodepool" + project_id = "myproject" + cluster_name = "cluster-1" + location = "europe-west4-a" + name = "nodepool-gpu-1" + labels = { environment = "dev" } + service_account = { + create = true + email = "nodepool-gpu-1" # optional + oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"] + } + node_config = { + machine_type = "a2-highgpu-1g" + disk_size_gb = 50 + disk_type = "pd-ssd" + ephemeral_ssd_count = 1 + gvnic = true + spot = true + guest_accelerator = { + type = "nvidia-tesla-a100" + count = 1 + gpu_driver = { + version = "LATEST" + } + } + } +} +# tftest modules=1 resources=2 inventory=guest-accelerator.yaml +``` ## Variables @@ -110,22 +143,22 @@ module "cluster-1-nodepool-1" { |---|---|:---:|:---:|:---:| | [cluster_name](variables.tf#L23) | Cluster name. | string | ✓ | | | [location](variables.tf#L41) | Cluster location. | string | ✓ | | -| [project_id](variables.tf#L150) | Cluster project id. | string | ✓ | | +| [project_id](variables.tf#L166) | Cluster project id. | string | ✓ | | | [cluster_id](variables.tf#L17) | Cluster id. Optional, but providing cluster_id is recommended to prevent cluster misconfiguration in some of the edge cases. | string | | null | | [gke_version](variables.tf#L28) | Kubernetes nodes version. Ignored if auto_upgrade is set in management_config. | string | | null | | [labels](variables.tf#L34) | Kubernetes labels applied to each node. | map(string) | | {} | | [max_pods_per_node](variables.tf#L46) | Maximum number of pods per node. 
| number | | null | | [name](variables.tf#L52) | Optional nodepool name. | string | | null | -| [node_config](variables.tf#L58) | Node-level configuration. | object({…}) | | {…} | -| [node_count](variables.tf#L97) | Number of nodes per instance group. Initial value can only be changed by recreation, current is ignored when autoscaling is used. | object({…}) | | {…} | -| [node_locations](variables.tf#L109) | Node locations. | list(string) | | null | -| [nodepool_config](variables.tf#L115) | Nodepool-level configuration. | object({…}) | | null | -| [pod_range](variables.tf#L137) | Pod secondary range configuration. | object({…}) | | null | -| [reservation_affinity](variables.tf#L155) | Configuration of the desired reservation which instances could take capacity from. | object({…}) | | null | -| [service_account](variables.tf#L165) | Nodepool service account. If this variable is set to null, the default GCE service account will be used. If set and email is null, a service account will be created. If scopes are null a default will be used. | object({…}) | | {} | -| [sole_tenant_nodegroup](variables.tf#L176) | Sole tenant node group. | string | | null | -| [tags](variables.tf#L182) | Network tags applied to nodes. | list(string) | | null | -| [taints](variables.tf#L188) | Kubernetes taints applied to all nodes. | map(object({…})) | | {} | +| [node_config](variables.tf#L58) | Node-level configuration. | object({…}) | | {…} | +| [node_count](variables.tf#L113) | Number of nodes per instance group. Initial value can only be changed by recreation, current is ignored when autoscaling is used. | object({…}) | | {…} | +| [node_locations](variables.tf#L125) | Node locations. | list(string) | | null | +| [nodepool_config](variables.tf#L131) | Nodepool-level configuration. | object({…}) | | null | +| [pod_range](variables.tf#L153) | Pod secondary range configuration. 
| object({…}) | | null | +| [reservation_affinity](variables.tf#L171) | Configuration of the desired reservation which instances could take capacity from. | object({…}) | | null | +| [service_account](variables.tf#L181) | Nodepool service account. If this variable is set to null, the default GCE service account will be used. If set and email is null, a service account will be created. If scopes are null a default will be used. | object({…}) | | {} | +| [sole_tenant_nodegroup](variables.tf#L192) | Sole tenant node group. | string | | null | +| [tags](variables.tf#L198) | Network tags applied to nodes. | list(string) | | null | +| [taints](variables.tf#L204) | Kubernetes taints applied to all nodes. | map(object({…})) | | {} | ## Outputs diff --git a/modules/gke-nodepool/main.tf b/modules/gke-nodepool/main.tf index a5dff043cd..f5a104bd54 100644 --- a/modules/gke-nodepool/main.tf +++ b/modules/gke-nodepool/main.tf @@ -1,5 +1,5 @@ /** - * Copyright 2022 Google LLC + * Copyright 2023 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -165,7 +165,29 @@ resource "google_container_node_pool" "nodepool" { content { count = var.node_config.guest_accelerator.count type = var.node_config.guest_accelerator.type - gpu_partition_size = var.node_config.guest_accelerator.gpu_partition_size + gpu_partition_size = var.node_config.guest_accelerator.gpu_driver == null ? null : var.node_config.guest_accelerator.gpu_driver.partition_size + + # only emit the sharing block when time-sharing is requested, so it never carries a null strategy / null client count + dynamic "gpu_sharing_config" { + for_each = try(var.node_config.guest_accelerator.gpu_driver.max_shared_clients_per_gpu, null) != null ? [""] : [] + content { + gpu_sharing_strategy = var.node_config.guest_accelerator.gpu_driver.max_shared_clients_per_gpu != null ? 
"TIME_SHARING" : null + max_shared_clients_per_gpu = var.node_config.guest_accelerator.gpu_driver.max_shared_clients_per_gpu + } + } + + dynamic "gpu_driver_installation_config" { + for_each = var.node_config.guest_accelerator.gpu_driver != null ? [""] : [] + content { + gpu_driver_version = var.node_config.guest_accelerator.gpu_driver.version + } + } + } + } + dynamic "local_nvme_ssd_block_config" { + for_each = coalesce(var.node_config.local_nvme_ssd_count, 0) > 0 ? [""] : [] + content { + local_ssd_count = var.node_config.local_nvme_ssd_count } } dynamic "gvnic" { diff --git a/modules/gke-nodepool/variables.tf b/modules/gke-nodepool/variables.tf index 4cedb26a94..17cfd88c5a 100644 --- a/modules/gke-nodepool/variables.tf +++ b/modules/gke-nodepool/variables.tf @@ -64,12 +64,17 @@ variable "node_config" { ephemeral_ssd_count = optional(number) gcfs = optional(bool, false) guest_accelerator = optional(object({ - count = number - type = string - gpu_partition_size = optional(string) + count = number + type = string + gpu_driver = optional(object({ + version = string + partition_size = optional(string) + max_shared_clients_per_gpu = optional(number) + })) })) - gvnic = optional(bool, false) - image_type = optional(string) + local_nvme_ssd_count = optional(number) + gvnic = optional(bool, false) + image_type = optional(string) kubelet_config = optional(object({ cpu_manager_policy = string cpu_cfs_quota = optional(bool) @@ -92,6 +97,17 @@ variable "node_config" { default = { disk_type = "pd-balanced" } + validation { + condition = ( + alltrue([ + for k, v in var.node_config.guest_accelerator[*].gpu_driver : contains([ + "GPU_DRIVER_VERSION_UNSPECIFIED", "INSTALLATION_DISABLED", + "DEFAULT", "LATEST" + ], v.version) if v != null # gpu_driver is optional, skip accelerators without it + ]) + ) + error_message = "Invalid GPU driver version." 
+ } } variable "node_count" { diff --git a/tests/modules/gke_nodepool/examples/guest-accelerator.yaml b/tests/modules/gke_nodepool/examples/guest-accelerator.yaml new file mode 100644 index 0000000000..5f125ef6f5 --- /dev/null +++ b/tests/modules/gke_nodepool/examples/guest-accelerator.yaml @@ -0,0 +1,40 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +values: + module.cluster-1-nodepool-gpu-1.google_container_node_pool.nodepool: + cluster: cluster-1 + location: europe-west4-a + name: nodepool-gpu-1 + node_config: + - boot_disk_kms_key: null + disk_size_gb: 50 + disk_type: pd-ssd + ephemeral_storage_config: + - local_ssd_count: 1 + ephemeral_storage_local_ssd_config: [] + guest_accelerator: + - count: 1 + gpu_driver_installation_config: + - gpu_driver_version: LATEST + gpu_partition_size: null + gpu_sharing_config: [] + type: nvidia-tesla-a100 + gvnic: [] + machine_type: a2-highgpu-1g + spot: true + project: myproject + +counts: + google_container_node_pool: 1