diff --git a/README.md b/README.md index 93798b47..a31c33dc 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,7 @@ the same as any other terraform: # assuming the directory containing main.tf is the current working directory # create/update the cluster -terraform init && terraform validate && terraform apply +terraform init && terraform validate && terraform apply -var-file="terraform.tfvars" # destroy the cluster terraform init && terraform validate && terraform apply -destroy diff --git a/a3-mega/examples/gke/main.tf b/a3-mega/examples/gke/main.tf index 9037ea4d..da098699 100644 --- a/a3-mega/examples/gke/main.tf +++ b/a3-mega/examples/gke/main.tf @@ -1,11 +1,13 @@ variable "node_pools" {} variable "project_id" {} variable "resource_prefix" {} +variable "region" {} -module "a3-gke" { +module "a3-mega-gke" { source = "github.com/GoogleCloudPlatform/ai-infra-cluster-provisioning//a3-mega/terraform/modules/cluster/gke" node_pools = var.node_pools project_id = var.project_id resource_prefix = var.resource_prefix + region = var.region } diff --git a/a3-mega/terraform/modules/cluster/gke/main.tf b/a3-mega/terraform/modules/cluster/gke/main.tf index 9527c65d..03d2abdb 100644 --- a/a3-mega/terraform/modules/cluster/gke/main.tf +++ b/a3-mega/terraform/modules/cluster/gke/main.tf @@ -309,7 +309,7 @@ module "kubectl-apply" { daemonsets = { device_plugin = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/cmd/nvidia_gpu/device-plugin.yaml" nvidia_driver = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml" - nccl_plugin = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-fastrak/nccl-fastrak-installer.yaml" # TODO dead link + nccl_plugin = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml" } enable = var.ksa != null ksa = var.ksa