diff --git a/modules/gke-cluster-standard/README.md b/modules/gke-cluster-standard/README.md
index 2419567950..53b57e8f80 100644
--- a/modules/gke-cluster-standard/README.md
+++ b/modules/gke-cluster-standard/README.md
@@ -310,28 +310,28 @@ module "cluster-1" {
| name | description | type | required | default |
|---|---|:---:|:---:|:---:|
-| [location](variables.tf#L153) | Cluster zone or region. | string
| ✓ | |
-| [name](variables.tf#L264) | Cluster name. | string
| ✓ | |
-| [project_id](variables.tf#L290) | Cluster project id. | string
| ✓ | |
-| [vpc_config](variables.tf#L313) | VPC-level configuration. | object({…})
| ✓ | |
+| [location](variables.tf#L154) | Cluster zone or region. | string
| ✓ | |
+| [name](variables.tf#L265) | Cluster name. | string
| ✓ | |
+| [project_id](variables.tf#L291) | Cluster project id. | string
| ✓ | |
+| [vpc_config](variables.tf#L314) | VPC-level configuration. | object({…})
| ✓ | |
| [backup_configs](variables.tf#L17) | Configuration for Backup for GKE. | object({…})
| | {}
|
| [cluster_autoscaling](variables.tf#L38) | Enable and configure limits for Node Auto-Provisioning with Cluster Autoscaler. | object({…})
| | null
|
| [deletion_protection](variables.tf#L59) | Whether or not to allow Terraform to destroy the cluster. Unless this field is set to false in Terraform state, a terraform destroy or terraform apply that would delete the cluster will fail. | bool
| | true
|
| [description](variables.tf#L66) | Cluster description. | string
| | null
|
-| [enable_addons](variables.tf#L72) | Addons enabled in the cluster (true means enabled). | object({…})
| | {…}
|
-| [enable_features](variables.tf#L95) | Enable cluster-level features. Certain features allow configuration. | object({…})
| | {…}
|
-| [issue_client_certificate](variables.tf#L141) | Enable issuing client certificate. | bool
| | false
|
-| [labels](variables.tf#L147) | Cluster resource labels. | map(string)
| | null
|
-| [logging_config](variables.tf#L158) | Logging configuration. | object({…})
| | {}
|
-| [maintenance_config](variables.tf#L179) | Maintenance window configuration. | object({…})
| | {…}
|
-| [max_pods_per_node](variables.tf#L202) | Maximum number of pods per node in this cluster. | number
| | 110
|
-| [min_master_version](variables.tf#L208) | Minimum version of the master, defaults to the version of the most recent official release. | string
| | null
|
-| [monitoring_config](variables.tf#L214) | Monitoring configuration. Google Cloud Managed Service for Prometheus is enabled by default. | object({…})
| | {}
|
-| [node_locations](variables.tf#L269) | Zones in which the cluster's nodes are located. | list(string)
| | []
|
-| [private_cluster_config](variables.tf#L276) | Private cluster configuration. | object({…})
| | null
|
-| [release_channel](variables.tf#L295) | Release channel for GKE upgrades. | string
| | null
|
-| [service_account](variables.tf#L301) | Service account used for the default node pool, only useful if the default GCE service account has been disabled. | string
| | null
|
-| [tags](variables.tf#L307) | Network tags applied to nodes. | list(string)
| | null
|
+| [enable_addons](variables.tf#L72) | Addons enabled in the cluster (true means enabled). | object({…})
| | {…}
|
+| [enable_features](variables.tf#L96) | Enable cluster-level features. Certain features allow configuration. | object({…})
| | {…}
|
+| [issue_client_certificate](variables.tf#L142) | Enable issuing client certificate. | bool
| | false
|
+| [labels](variables.tf#L148) | Cluster resource labels. | map(string)
| | null
|
+| [logging_config](variables.tf#L159) | Logging configuration. | object({…})
| | {}
|
+| [maintenance_config](variables.tf#L180) | Maintenance window configuration. | object({…})
| | {…}
|
+| [max_pods_per_node](variables.tf#L203) | Maximum number of pods per node in this cluster. | number
| | 110
|
+| [min_master_version](variables.tf#L209) | Minimum version of the master, defaults to the version of the most recent official release. | string
| | null
|
+| [monitoring_config](variables.tf#L215) | Monitoring configuration. Google Cloud Managed Service for Prometheus is enabled by default. | object({…})
| | {}
|
+| [node_locations](variables.tf#L270) | Zones in which the cluster's nodes are located. | list(string)
| | []
|
+| [private_cluster_config](variables.tf#L277) | Private cluster configuration. | object({…})
| | null
|
+| [release_channel](variables.tf#L296) | Release channel for GKE upgrades. | string
| | null
|
+| [service_account](variables.tf#L302) | Service account used for the default node pool, only useful if the default GCE service account has been disabled. | string
| | null
|
+| [tags](variables.tf#L308) | Network tags applied to nodes. | list(string)
| | null
|
## Outputs
diff --git a/modules/gke-cluster-standard/main.tf b/modules/gke-cluster-standard/main.tf
index 90115501de..f5d8fe7511 100644
--- a/modules/gke-cluster-standard/main.tf
+++ b/modules/gke-cluster-standard/main.tf
@@ -83,6 +83,9 @@ resource "google_container_cluster" "cluster" {
gcp_filestore_csi_driver_config {
enabled = var.enable_addons.gcp_filestore_csi_driver
}
+ gcs_fuse_csi_driver_config {
+ enabled = var.enable_addons.gcs_fuse_csi_driver
+ }
kalm_config {
enabled = var.enable_addons.kalm
}
diff --git a/modules/gke-cluster-standard/variables.tf b/modules/gke-cluster-standard/variables.tf
index c88bb773e9..221f6b8a55 100644
--- a/modules/gke-cluster-standard/variables.tf
+++ b/modules/gke-cluster-standard/variables.tf
@@ -77,6 +77,7 @@ variable "enable_addons" {
dns_cache = optional(bool, false)
gce_persistent_disk_csi_driver = optional(bool, false)
gcp_filestore_csi_driver = optional(bool, false)
+ gcs_fuse_csi_driver = optional(bool, false)
horizontal_pod_autoscaling = optional(bool, false)
http_load_balancing = optional(bool, false)
istio = optional(object({
diff --git a/modules/gke-nodepool/README.md b/modules/gke-nodepool/README.md
index 9e70b8e19d..e7d2191132 100644
--- a/modules/gke-nodepool/README.md
+++ b/modules/gke-nodepool/README.md
@@ -103,6 +103,39 @@ module "cluster-1-nodepool-1" {
}
# tftest modules=1 resources=2 inventory=config.yaml
```
+### GPU Node & node pool configuration
+
+```hcl
+module "cluster-1-nodepool-gpu-1" {
+ source = "./fabric/modules/gke-nodepool"
+ project_id = "myproject"
+ cluster_name = "cluster-1"
+ location = "europe-west4-a"
+ name = "nodepool-gpu-1"
+ labels = { environment = "dev" }
+ service_account = {
+ create = true
+ email = "nodepool-gpu-1" # optional
+ oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"]
+ }
+ node_config = {
+ machine_type = "a2-highgpu-1g"
+ disk_size_gb = 50
+ disk_type = "pd-ssd"
+ ephemeral_ssd_count = 1
+ gvnic = true
+ spot = true
+ guest_accelerator = {
+ type = "nvidia-tesla-a100"
+ count = 1
+ gpu_driver = {
+ version = "LATEST"
+ }
+ }
+ }
+}
+# tftest modules=1 resources=2 inventory=guest-accelerator.yaml
+```
## Variables
@@ -110,22 +143,22 @@ module "cluster-1-nodepool-1" {
|---|---|:---:|:---:|:---:|
| [cluster_name](variables.tf#L23) | Cluster name. | string
| ✓ | |
| [location](variables.tf#L41) | Cluster location. | string
| ✓ | |
-| [project_id](variables.tf#L150) | Cluster project id. | string
| ✓ | |
+| [project_id](variables.tf#L166) | Cluster project id. | string
| ✓ | |
| [cluster_id](variables.tf#L17) | Cluster id. Optional, but providing cluster_id is recommended to prevent cluster misconfiguration in some of the edge cases. | string
| | null
|
| [gke_version](variables.tf#L28) | Kubernetes nodes version. Ignored if auto_upgrade is set in management_config. | string
| | null
|
| [labels](variables.tf#L34) | Kubernetes labels applied to each node. | map(string)
| | {}
|
| [max_pods_per_node](variables.tf#L46) | Maximum number of pods per node. | number
| | null
|
| [name](variables.tf#L52) | Optional nodepool name. | string
| | null
|
-| [node_config](variables.tf#L58) | Node-level configuration. | object({…})
| | {…}
|
-| [node_count](variables.tf#L97) | Number of nodes per instance group. Initial value can only be changed by recreation, current is ignored when autoscaling is used. | object({…})
| | {…}
|
-| [node_locations](variables.tf#L109) | Node locations. | list(string)
| | null
|
-| [nodepool_config](variables.tf#L115) | Nodepool-level configuration. | object({…})
| | null
|
-| [pod_range](variables.tf#L137) | Pod secondary range configuration. | object({…})
| | null
|
-| [reservation_affinity](variables.tf#L155) | Configuration of the desired reservation which instances could take capacity from. | object({…})
| | null
|
-| [service_account](variables.tf#L165) | Nodepool service account. If this variable is set to null, the default GCE service account will be used. If set and email is null, a service account will be created. If scopes are null a default will be used. | object({…})
| | {}
|
-| [sole_tenant_nodegroup](variables.tf#L176) | Sole tenant node group. | string
| | null
|
-| [tags](variables.tf#L182) | Network tags applied to nodes. | list(string)
| | null
|
-| [taints](variables.tf#L188) | Kubernetes taints applied to all nodes. | map(object({…}))
| | {}
|
+| [node_config](variables.tf#L58) | Node-level configuration. | object({…})
| | {…}
|
+| [node_count](variables.tf#L113) | Number of nodes per instance group. Initial value can only be changed by recreation, current is ignored when autoscaling is used. | object({…})
| | {…}
|
+| [node_locations](variables.tf#L125) | Node locations. | list(string)
| | null
|
+| [nodepool_config](variables.tf#L131) | Nodepool-level configuration. | object({…})
| | null
|
+| [pod_range](variables.tf#L153) | Pod secondary range configuration. | object({…})
| | null
|
+| [reservation_affinity](variables.tf#L171) | Configuration of the desired reservation which instances could take capacity from. | object({…})
| | null
|
+| [service_account](variables.tf#L181) | Nodepool service account. If this variable is set to null, the default GCE service account will be used. If set and email is null, a service account will be created. If scopes are null a default will be used. | object({…})
| | {}
|
+| [sole_tenant_nodegroup](variables.tf#L192) | Sole tenant node group. | string
| | null
|
+| [tags](variables.tf#L198) | Network tags applied to nodes. | list(string)
| | null
|
+| [taints](variables.tf#L204) | Kubernetes taints applied to all nodes. | map(object({…}))
| | {}
|
## Outputs
diff --git a/modules/gke-nodepool/main.tf b/modules/gke-nodepool/main.tf
index a5dff043cd..f5a104bd54 100644
--- a/modules/gke-nodepool/main.tf
+++ b/modules/gke-nodepool/main.tf
@@ -1,5 +1,5 @@
/**
- * Copyright 2022 Google LLC
+ * Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -165,7 +165,28 @@ resource "google_container_node_pool" "nodepool" {
content {
count = var.node_config.guest_accelerator.count
type = var.node_config.guest_accelerator.type
- gpu_partition_size = var.node_config.guest_accelerator.gpu_partition_size
+ gpu_partition_size = var.node_config.guest_accelerator.gpu_driver == null ? null : var.node_config.guest_accelerator.gpu_driver.partition_size
+
+ dynamic "gpu_sharing_config" {
+ for_each = var.node_config.guest_accelerator.gpu_driver != null ? [""] : [] # emitted whenever gpu_driver is set, even if max_shared_clients_per_gpu is null
+ content {
+ gpu_sharing_strategy = var.node_config.guest_accelerator.gpu_driver.max_shared_clients_per_gpu != null ? "TIME_SHARING" : null # TIME_SHARING only when a client limit is given, otherwise null
+ max_shared_clients_per_gpu = var.node_config.guest_accelerator.gpu_driver.max_shared_clients_per_gpu # NOTE(review): may be null here; provider docs appear to list both arguments as required - confirm a null-valued block is intended
+ }
+ }
+
+ dynamic "gpu_driver_installation_config" {
+ for_each = var.node_config.guest_accelerator.gpu_driver != null ? [""] : []
+ content {
+ gpu_driver_version = var.node_config.guest_accelerator.gpu_driver.version
+ }
+ }
+ }
+ }
+ dynamic "local_nvme_ssd_block_config" {
+ for_each = coalesce(var.node_config.local_nvme_ssd_count, 0) > 0 ? [""] : []
+ content {
+ local_ssd_count = var.node_config.local_nvme_ssd_count
}
}
dynamic "gvnic" {
diff --git a/modules/gke-nodepool/variables.tf b/modules/gke-nodepool/variables.tf
index 4cedb26a94..17cfd88c5a 100644
--- a/modules/gke-nodepool/variables.tf
+++ b/modules/gke-nodepool/variables.tf
@@ -64,12 +64,17 @@ variable "node_config" {
ephemeral_ssd_count = optional(number)
gcfs = optional(bool, false)
guest_accelerator = optional(object({
- count = number
- type = string
- gpu_partition_size = optional(string)
+ count = number
+ type = string
+ gpu_driver = optional(object({
+ version = string
+ partition_size = optional(string)
+ max_shared_clients_per_gpu = optional(number)
+ }))
}))
- gvnic = optional(bool, false)
- image_type = optional(string)
+ local_nvme_ssd_count = optional(number)
+ gvnic = optional(bool, false)
+ image_type = optional(string)
kubelet_config = optional(object({
cpu_manager_policy = string
cpu_cfs_quota = optional(bool)
@@ -92,6 +97,17 @@ variable "node_config" {
default = {
disk_type = "pd-balanced"
}
+  validation {
+    condition = (
+      alltrue([
+        for k, v in var.node_config.guest_accelerator[*].gpu_driver : contains([
+          "GPU_DRIVER_VERSION_UNSPECIFIED", "INSTALLATION_DISABLED",
+          "DEFAULT", "LATEST"
+        ], v.version) if v != null # gpu_driver is optional: skip null so v.version is never read on null
+      ])
+    )
+    error_message = "Invalid GPU driver version."
+  }
}
variable "node_count" {
diff --git a/tests/modules/gke_nodepool/examples/guest-accelerator.yaml b/tests/modules/gke_nodepool/examples/guest-accelerator.yaml
new file mode 100644
index 0000000000..5f125ef6f5
--- /dev/null
+++ b/tests/modules/gke_nodepool/examples/guest-accelerator.yaml
@@ -0,0 +1,42 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+values:
+ module.cluster-1-nodepool-gpu-1.google_container_node_pool.nodepool:
+ cluster: cluster-1
+ location: europe-west4-a
+ name: nodepool-gpu-1
+ node_config:
+ - boot_disk_kms_key: null
+ disk_size_gb: 50
+ disk_type: pd-ssd
+ ephemeral_storage_config:
+ - local_ssd_count: 1
+ ephemeral_storage_local_ssd_config: []
+ guest_accelerator:
+ - count: 1
+ gpu_driver_installation_config:
+ - gpu_driver_version: LATEST
+ gpu_partition_size: null
+ gpu_sharing_config:
+ - gpu_sharing_strategy: null
+ max_shared_clients_per_gpu: null
+ type: nvidia-tesla-a100
+ gvnic: []
+ machine_type: a2-highgpu-1g
+ spot: true
+ project: myproject
+
+counts:
+ google_container_node_pool: 1