From b91dc6a6a49d47f236c86194a2b521be514d1475 Mon Sep 17 00:00:00 2001 From: Christopher Pirillo Date: Wed, 6 Mar 2024 13:48:35 -0800 Subject: [PATCH 1/6] Add a3-mega support --- a3-mega/README.md | 67 ++++ a3-mega/examples/gke/README.md | 62 ++++ a3-mega/examples/gke/blueprint.yaml | 35 ++ a3-mega/examples/gke/main.tf | 11 + a3-mega/examples/mig-cos/README.md | 61 ++++ a3-mega/examples/mig-cos/blueprint.yaml | 35 ++ a3-mega/examples/mig-cos/main.tf | 13 + .../terraform/modules/cluster/gke/README.md | 72 ++++ .../cluster/gke/kubectl-apply/README.md | 52 +++ .../cluster/gke/kubectl-apply/aiinfra-ksa.tf | 92 +++++ .../cluster/gke/kubectl-apply/variables.tf | 73 ++++ .../cluster/gke/kubectl-apply/versions.tf | 34 ++ a3-mega/terraform/modules/cluster/gke/main.tf | 320 +++++++++++++++++ .../terraform/modules/cluster/gke/outputs.tf | 25 ++ .../modules/cluster/gke/variables.tf | 168 +++++++++ .../modules/cluster/mig-cos/README.md | 65 ++++ .../cluster/mig-cos/cloudinit/README.md | 51 +++ .../modules/cluster/mig-cos/cloudinit/main.tf | 209 +++++++++++ .../cluster/mig-cos/cloudinit/outputs.tf | 19 + .../aiinfra_network_storage.yaml.template | 52 +++ .../aiinfra_pull_image.yaml.template | 27 ++ .../aiinfra_start_container.yaml.template | 37 ++ .../aiinfra_startup_scripts.yaml.template | 76 ++++ .../templates/userdata.yaml.template | 18 + .../cluster/mig-cos/cloudinit/variables.tf | 97 ++++++ .../terraform/modules/cluster/mig-cos/main.tf | 101 ++++++ .../modules/cluster/mig-cos/variables.tf | 325 ++++++++++++++++++ .../terraform/modules/cluster/mig/README.md | 70 ++++ a3-mega/terraform/modules/cluster/mig/main.tf | 138 ++++++++ .../terraform/modules/cluster/mig/outputs.tf | 20 ++ .../modules/cluster/mig/variables.tf | 312 +++++++++++++++++ .../modules/common/dashboard/README.md | 55 +++ .../modules/common/dashboard/main.tf | 91 +++++ .../modules/common/dashboard/outputs.tf | 20 ++ .../modules/common/dashboard/variables.tf | 40 +++ .../common/instance_group_manager/README.md | 
54 +++ .../common/instance_group_manager/main.tf | 42 +++ .../common/instance_group_manager/outputs.tf | 25 ++ .../instance_group_manager/variables.tf | 102 ++++++ .../common/instance_template/README.md | 71 ++++ .../modules/common/instance_template/main.tf | 144 ++++++++ .../common/instance_template/outputs.tf | 32 ++ .../common/instance_template/variables.tf | 275 +++++++++++++++ .../modules/common/network/README.md | 67 ++++ .../terraform/modules/common/network/main.tf | 178 ++++++++++ .../modules/common/network/outputs.tf | 55 +++ .../modules/common/network/variables.tf | 46 +++ .../modules/common/network/versions.tf | 31 ++ .../modules/common/resource_policy/README.md | 52 +++ .../modules/common/resource_policy/main.tf | 48 +++ .../modules/common/resource_policy/outputs.tf | 31 ++ .../common/resource_policy/variables.tf | 54 +++ scripts/entrypoint_helpers.sh | 3 +- .../modules/cluster/gke/input/simple.tfvars | 13 + .../terraform/modules/cluster/gke/tests.sh | 24 ++ .../cluster/mig-cos/input/simple.tfvars | 13 + .../modules/cluster/mig-cos/tests.sh | 24 ++ .../modules/cluster/mig/input/simple.tfvars | 13 + .../terraform/modules/cluster/mig/tests.sh | 24 ++ test/continuous/run.sh | 7 +- .../cluster/gke/input/gke-compact-pp.tfvars | 8 + .../cluster/gke/input/gke-existing-rp.tfvars | 8 + .../modules/cluster/gke/input/gke-gpu.tfvars | 9 + .../cluster/gke/output/gke-compact-pp.json | 91 +++++ .../cluster/gke/output/gke-existing-rp.json | 78 +++++ .../modules/cluster/gke/output/gke-gpu.json | 68 ++++ .../terraform/modules/cluster/gke/tests.sh | 68 ++++ .../cluster/mig-cos/input/existing-rp.tfvars | 31 ++ .../cluster/mig-cos/input/multi.tfvars | 24 ++ .../cluster/mig-cos/input/simple.tfvars | 19 + .../cluster/mig-cos/output/existing-rp.json | 49 +++ .../cluster/mig-cos/output/modules.json | 19 + .../cluster/mig-cos/output/multimodules.json | 30 ++ .../modules/cluster/mig-cos/tests.sh | 68 ++++ .../cluster/mig/input/existing-rp.tfvars | 20 ++ 
.../modules/cluster/mig/input/multi.tfvars | 13 + .../modules/cluster/mig/input/simple.tfvars | 8 + .../cluster/mig/output/existing-rp.json | 55 +++ .../modules/cluster/mig/output/modules.json | 25 ++ .../cluster/mig/output/multimodules.json | 36 ++ .../terraform/modules/cluster/mig/tests.sh | 68 ++++ .../common/dashboard/input/disable.tfvars | 3 + .../common/dashboard/input/enable.tfvars | 3 + .../modules/common/dashboard/output/data.json | 19 + .../common/dashboard/output/modules.json | 11 + .../modules/common/dashboard/tests.sh | 57 +++ .../input/simple.tfvars | 4 + .../output/resources.json | 29 ++ .../common/instance_group_manager/tests.sh | 34 ++ .../instance_template/input/simple.tfvars | 25 ++ .../instance_template/output/resources.json | 71 ++++ .../modules/common/instance_template/tests.sh | 34 ++ .../network/input/existing_network.tfvars | 5 + .../common/network/input/new_network.tfvars | 2 + .../network/output/existing_network.json | 32 ++ .../common/network/output/new_network.json | 30 ++ .../terraform/modules/common/network/tests.sh | 53 +++ .../resource_policy/input/simple.tfvars | 2 + .../resource_policy/output/resources.json | 21 ++ .../modules/common/resource_policy/tests.sh | 34 ++ test/pr/run.sh | 8 + 101 files changed, 5641 insertions(+), 2 deletions(-) create mode 100644 a3-mega/README.md create mode 100644 a3-mega/examples/gke/README.md create mode 100644 a3-mega/examples/gke/blueprint.yaml create mode 100644 a3-mega/examples/gke/main.tf create mode 100644 a3-mega/examples/mig-cos/README.md create mode 100644 a3-mega/examples/mig-cos/blueprint.yaml create mode 100644 a3-mega/examples/mig-cos/main.tf create mode 100644 a3-mega/terraform/modules/cluster/gke/README.md create mode 100644 a3-mega/terraform/modules/cluster/gke/kubectl-apply/README.md create mode 100644 a3-mega/terraform/modules/cluster/gke/kubectl-apply/aiinfra-ksa.tf create mode 100644 a3-mega/terraform/modules/cluster/gke/kubectl-apply/variables.tf create mode 100644 
a3-mega/terraform/modules/cluster/gke/kubectl-apply/versions.tf create mode 100644 a3-mega/terraform/modules/cluster/gke/main.tf create mode 100644 a3-mega/terraform/modules/cluster/gke/outputs.tf create mode 100644 a3-mega/terraform/modules/cluster/gke/variables.tf create mode 100644 a3-mega/terraform/modules/cluster/mig-cos/README.md create mode 100644 a3-mega/terraform/modules/cluster/mig-cos/cloudinit/README.md create mode 100644 a3-mega/terraform/modules/cluster/mig-cos/cloudinit/main.tf create mode 100644 a3-mega/terraform/modules/cluster/mig-cos/cloudinit/outputs.tf create mode 100644 a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_network_storage.yaml.template create mode 100644 a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_pull_image.yaml.template create mode 100644 a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_start_container.yaml.template create mode 100644 a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_startup_scripts.yaml.template create mode 100644 a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/userdata.yaml.template create mode 100644 a3-mega/terraform/modules/cluster/mig-cos/cloudinit/variables.tf create mode 100644 a3-mega/terraform/modules/cluster/mig-cos/main.tf create mode 100644 a3-mega/terraform/modules/cluster/mig-cos/variables.tf create mode 100644 a3-mega/terraform/modules/cluster/mig/README.md create mode 100644 a3-mega/terraform/modules/cluster/mig/main.tf create mode 100644 a3-mega/terraform/modules/cluster/mig/outputs.tf create mode 100644 a3-mega/terraform/modules/cluster/mig/variables.tf create mode 100644 a3-mega/terraform/modules/common/dashboard/README.md create mode 100644 a3-mega/terraform/modules/common/dashboard/main.tf create mode 100644 a3-mega/terraform/modules/common/dashboard/outputs.tf create mode 100644 a3-mega/terraform/modules/common/dashboard/variables.tf create mode 100644 
a3-mega/terraform/modules/common/instance_group_manager/README.md create mode 100644 a3-mega/terraform/modules/common/instance_group_manager/main.tf create mode 100644 a3-mega/terraform/modules/common/instance_group_manager/outputs.tf create mode 100644 a3-mega/terraform/modules/common/instance_group_manager/variables.tf create mode 100644 a3-mega/terraform/modules/common/instance_template/README.md create mode 100644 a3-mega/terraform/modules/common/instance_template/main.tf create mode 100644 a3-mega/terraform/modules/common/instance_template/outputs.tf create mode 100644 a3-mega/terraform/modules/common/instance_template/variables.tf create mode 100644 a3-mega/terraform/modules/common/network/README.md create mode 100644 a3-mega/terraform/modules/common/network/main.tf create mode 100644 a3-mega/terraform/modules/common/network/outputs.tf create mode 100644 a3-mega/terraform/modules/common/network/variables.tf create mode 100644 a3-mega/terraform/modules/common/network/versions.tf create mode 100644 a3-mega/terraform/modules/common/resource_policy/README.md create mode 100644 a3-mega/terraform/modules/common/resource_policy/main.tf create mode 100644 a3-mega/terraform/modules/common/resource_policy/outputs.tf create mode 100644 a3-mega/terraform/modules/common/resource_policy/variables.tf create mode 100644 test/continuous/a3-mega/terraform/modules/cluster/gke/input/simple.tfvars create mode 100644 test/continuous/a3-mega/terraform/modules/cluster/gke/tests.sh create mode 100644 test/continuous/a3-mega/terraform/modules/cluster/mig-cos/input/simple.tfvars create mode 100644 test/continuous/a3-mega/terraform/modules/cluster/mig-cos/tests.sh create mode 100644 test/continuous/a3-mega/terraform/modules/cluster/mig/input/simple.tfvars create mode 100644 test/continuous/a3-mega/terraform/modules/cluster/mig/tests.sh create mode 100644 test/pr/a3-mega/terraform/modules/cluster/gke/input/gke-compact-pp.tfvars create mode 100644 
test/pr/a3-mega/terraform/modules/cluster/gke/input/gke-existing-rp.tfvars create mode 100644 test/pr/a3-mega/terraform/modules/cluster/gke/input/gke-gpu.tfvars create mode 100644 test/pr/a3-mega/terraform/modules/cluster/gke/output/gke-compact-pp.json create mode 100644 test/pr/a3-mega/terraform/modules/cluster/gke/output/gke-existing-rp.json create mode 100644 test/pr/a3-mega/terraform/modules/cluster/gke/output/gke-gpu.json create mode 100644 test/pr/a3-mega/terraform/modules/cluster/gke/tests.sh create mode 100644 test/pr/a3-mega/terraform/modules/cluster/mig-cos/input/existing-rp.tfvars create mode 100644 test/pr/a3-mega/terraform/modules/cluster/mig-cos/input/multi.tfvars create mode 100644 test/pr/a3-mega/terraform/modules/cluster/mig-cos/input/simple.tfvars create mode 100644 test/pr/a3-mega/terraform/modules/cluster/mig-cos/output/existing-rp.json create mode 100644 test/pr/a3-mega/terraform/modules/cluster/mig-cos/output/modules.json create mode 100644 test/pr/a3-mega/terraform/modules/cluster/mig-cos/output/multimodules.json create mode 100644 test/pr/a3-mega/terraform/modules/cluster/mig-cos/tests.sh create mode 100644 test/pr/a3-mega/terraform/modules/cluster/mig/input/existing-rp.tfvars create mode 100644 test/pr/a3-mega/terraform/modules/cluster/mig/input/multi.tfvars create mode 100644 test/pr/a3-mega/terraform/modules/cluster/mig/input/simple.tfvars create mode 100644 test/pr/a3-mega/terraform/modules/cluster/mig/output/existing-rp.json create mode 100644 test/pr/a3-mega/terraform/modules/cluster/mig/output/modules.json create mode 100644 test/pr/a3-mega/terraform/modules/cluster/mig/output/multimodules.json create mode 100644 test/pr/a3-mega/terraform/modules/cluster/mig/tests.sh create mode 100644 test/pr/a3-mega/terraform/modules/common/dashboard/input/disable.tfvars create mode 100644 test/pr/a3-mega/terraform/modules/common/dashboard/input/enable.tfvars create mode 100644 test/pr/a3-mega/terraform/modules/common/dashboard/output/data.json 
create mode 100644 test/pr/a3-mega/terraform/modules/common/dashboard/output/modules.json create mode 100644 test/pr/a3-mega/terraform/modules/common/dashboard/tests.sh create mode 100644 test/pr/a3-mega/terraform/modules/common/instance_group_manager/input/simple.tfvars create mode 100644 test/pr/a3-mega/terraform/modules/common/instance_group_manager/output/resources.json create mode 100644 test/pr/a3-mega/terraform/modules/common/instance_group_manager/tests.sh create mode 100644 test/pr/a3-mega/terraform/modules/common/instance_template/input/simple.tfvars create mode 100644 test/pr/a3-mega/terraform/modules/common/instance_template/output/resources.json create mode 100644 test/pr/a3-mega/terraform/modules/common/instance_template/tests.sh create mode 100644 test/pr/a3-mega/terraform/modules/common/network/input/existing_network.tfvars create mode 100644 test/pr/a3-mega/terraform/modules/common/network/input/new_network.tfvars create mode 100644 test/pr/a3-mega/terraform/modules/common/network/output/existing_network.json create mode 100644 test/pr/a3-mega/terraform/modules/common/network/output/new_network.json create mode 100644 test/pr/a3-mega/terraform/modules/common/network/tests.sh create mode 100644 test/pr/a3-mega/terraform/modules/common/resource_policy/input/simple.tfvars create mode 100644 test/pr/a3-mega/terraform/modules/common/resource_policy/output/resources.json create mode 100644 test/pr/a3-mega/terraform/modules/common/resource_policy/tests.sh diff --git a/a3-mega/README.md b/a3-mega/README.md new file mode 100644 index 000000000..02e22fe81 --- /dev/null +++ b/a3-mega/README.md @@ -0,0 +1,67 @@ +# Overview + +## Control Plane Options + +A3-Mega clusters may be created through either [GKE](https://cloud.google.com/kubernetes-engine) or a [MIG](https://cloud.google.com/compute/docs/instance-groups#managed_instance_groups) via the modules found [here](./terraform/modules/cluster). 
Due to the recency of A3-Mega's release, features are limited in each control plane, and those limitations are listed below. + +| Feature \ Module | `gke` | `mig-cos` | +| --- | --- | --- | +| [VM Image](https://cloud.google.com/compute/docs/images) | [COS-Cloud](https://cloud.google.com/container-optimized-os/docs) | [COS-Cloud](https://cloud.google.com/container-optimized-os/docs) | +| [Compact placement policy](https://cloud.google.com/compute/docs/instances/define-instance-placement) | Yes | Yes | +| [Kubernetes](https://kubernetes.io/) support | Yes | No | + +## Quickstart with `gke` + +An A3-Mega cluster of eight nodes (two node pools with four nodes each) booting with a COS-Cloud image can be created via GKE by running the following two commands: + +```bash +cat >./terraform.tfvars <./terraform.tfvars < +## Requirements + +No requirements. + +## Providers + +No providers. + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [a3-mega-gke](#module\_a3-mega-gke) | github.com/GoogleCloudPlatform/ai-infra-cluster-provisioning//a3-mega/terraform/modules/cluster/gke | n/a | + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [node\_pools](#input\_node\_pools) | n/a | `any` | n/a | yes | +| [project\_id](#input\_project\_id) | n/a | `any` | n/a | yes | +| [resource\_prefix](#input\_resource\_prefix) | n/a | `any` | n/a | yes | + +## Outputs + +No outputs. + diff --git a/a3-mega/examples/gke/blueprint.yaml b/a3-mega/examples/gke/blueprint.yaml new file mode 100644 index 000000000..86372a8d7 --- /dev/null +++ b/a3-mega/examples/gke/blueprint.yaml @@ -0,0 +1,35 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: a3-mega-gke + +vars: + deployment_name: a3-mega-gke + + node_pools: + - node_count: 4 + zone: us-east4-a + - node_count: 4 + zone: us-east4-a + project_id: my-project-id + region: us-east4 + resource_prefix: my-cluster-name + +deployment_groups: +- group: primary + modules: + - id: a3-mega-gke + source: "github.com/GoogleCloudPlatform/ai-infra-cluster-provisioning//a3-mega/terraform/modules/cluster/gke" diff --git a/a3-mega/examples/gke/main.tf b/a3-mega/examples/gke/main.tf new file mode 100644 index 000000000..9037ea4d3 --- /dev/null +++ b/a3-mega/examples/gke/main.tf @@ -0,0 +1,11 @@ +variable "node_pools" {} +variable "project_id" {} +variable "resource_prefix" {} + +module "a3-gke" { + source = "github.com/GoogleCloudPlatform/ai-infra-cluster-provisioning//a3-mega/terraform/modules/cluster/gke" + + node_pools = var.node_pools + project_id = var.project_id + resource_prefix = var.resource_prefix +} diff --git a/a3-mega/examples/mig-cos/README.md b/a3-mega/examples/mig-cos/README.md new file mode 100644 index 000000000..309c9f53a --- /dev/null +++ b/a3-mega/examples/mig-cos/README.md @@ -0,0 +1,61 @@ +# The cluster + +This configuration creates two Managed Instance Groups of four +[`a3-megagpu-8g`](https://cloud.google.com/blog/products/compute/introducing-a3-supercomputers-with-nvidia-h100-gpus) +VM instances each (eight instances in total). 
Each instance has: +- eight [NVidia H100 GPUs](https://www.nvidia.com/en-us/data-center/h100/), +- nine [NICs](https://cloud.google.com/vpc/docs/multiple-interfaces-concepts) + (one VPC for the host network and eight dedicated to the GPUs), +- a [COS-Cloud](https://cloud.google.com/container-optimized-os/docs) machine + image, +- TCPX, Nvidia GPU drivers, and NCCL plugin installed + +# The tfvars file + +The `terraform.tfvars` file is what configures the cluster. Detailed +descriptions of each variable can be found in +[this `README`](../../terraform/modules/cluster/mig-cos/README.md). +All optional variables may be omitted to use their default values. + +Required variables: +- `instance_groups` +- `project_id` +- `region` +- `resource_prefix` + +# How to create this cluster + +Refer to [this section](../../../README.md#how-to-provision-a-cluster). + + +## Requirements + +No requirements. + +## Providers + +No providers. + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [a3-mig-cos](#module\_a3-mega-mig-cos) | github.com/GoogleCloudPlatform/ai-infra-cluster-provisioning//a3-mega/terraform/modules/cluster/mig-cos | n/a | + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [instance\_groups](#input\_instance\_groups) | n/a | `any` | n/a | yes | +| [project\_id](#input\_project\_id) | n/a | `any` | n/a | yes | +| [region](#input\_region) | n/a | `any` | n/a | yes | +| [resource\_prefix](#input\_resource\_prefix) | n/a | `any` | n/a | yes | + +## Outputs + +No outputs. 
+ diff --git a/a3-mega/examples/mig-cos/blueprint.yaml b/a3-mega/examples/mig-cos/blueprint.yaml new file mode 100644 index 000000000..68fe83565 --- /dev/null +++ b/a3-mega/examples/mig-cos/blueprint.yaml @@ -0,0 +1,35 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: a3-mega-mig-cos + +vars: + deployment_name: a3-mega-mig-cos + + instance_groups: + - target_size: 4 + zone: us-east4-a + - target_size: 4 + zone: us-east4-a + project_id: my-project-id + region: us-east4 + resource_prefix: my-cluster-name + +deployment_groups: +- group: primary + modules: + - id: a3-mega-mig-cos + source: "github.com/GoogleCloudPlatform/ai-infra-cluster-provisioning//a3-mega/terraform/modules/cluster/mig-cos" diff --git a/a3-mega/examples/mig-cos/main.tf b/a3-mega/examples/mig-cos/main.tf new file mode 100644 index 000000000..a4a253b76 --- /dev/null +++ b/a3-mega/examples/mig-cos/main.tf @@ -0,0 +1,13 @@ +variable "instance_groups" {} +variable "project_id" {} +variable "region" {} +variable "resource_prefix" {} + +module "a3-mig-cos" { + source = "github.com/GoogleCloudPlatform/ai-infra-cluster-provisioning//a3-mega/terraform/modules/cluster/mig-cos" + + instance_groups = var.instance_groups + project_id = var.project_id + region = var.region + resource_prefix = var.resource_prefix +} diff --git a/a3-mega/terraform/modules/cluster/gke/README.md b/a3-mega/terraform/modules/cluster/gke/README.md new file mode 100644 index 
000000000..59617834c --- /dev/null +++ b/a3-mega/terraform/modules/cluster/gke/README.md @@ -0,0 +1,72 @@ + +Copyright 2022 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +No requirements. + +## Providers + +| Name | Version | +|------|---------| +| [google](#provider\_google) | n/a | +| [google-beta](#provider\_google-beta) | n/a | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [dashboard](#module\_dashboard) | ../../common/dashboard | n/a | +| [kubectl-apply](#module\_kubectl-apply) | ./kubectl-apply | n/a | +| [network](#module\_network) | ../../common/network | n/a | +| [resource\_policy](#module\_resource\_policy) | ../../common/resource_policy | n/a | + +## Resources + +| Name | Type | +|------|------| +| [google-beta_google_container_cluster.cluster](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_container_cluster) | resource | +| [google-beta_google_container_node_pool.node-pools](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_container_node_pool) | resource | +| [google_project_iam_member.node_service_account_logWriter](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | +| [google_project_iam_member.node_service_account_metricWriter](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | +| 
[google_project_iam_member.node_service_account_monitoringViewer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | +| [google_client_config.current](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | +| [google_compute_default_service_account.account](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | +| [google_container_engine_versions.gkeversion](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_engine_versions) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [disk\_size\_gb](#input\_disk\_size\_gb) | Size of the disk attached to each node, specified in GB. The smallest allowed disk size is 10GB. Defaults to 200GB.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#disk_size_gb), [gcloud](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create#--disk-size). | `number` | `200` | no | +| [disk\_type](#input\_disk\_type) | Type of the disk attached to each node. The default disk type is 'pd-ssd'

Possible values: `["pd-ssd", "local-ssd", "pd-balanced", "pd-standard"]`

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#disk_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create#--disk-type). | `string` | `"pd-ssd"` | no | +| [enable\_gke\_dashboard](#input\_enable\_gke\_dashboard) | Flag to enable GPU usage dashboards for the GKE cluster. | `bool` | `true` | no | +| [gke\_version](#input\_gke\_version) | The GKE version to be used as the minimum version of the master. The default value for that is latest master version.
More details can be found [here](https://cloud.google.com/kubernetes-engine/versioning#specifying_cluster_version)

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#name), [gcloud](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create#--name). | `string` | `null` | no | +| [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. 'PERIODIC' is the only supported value for host\_maintenance\_interval. This enables using stable fleet VM. | `string` | `"PERIODIC"` | no | +| [ksa](#input\_ksa) | The configuration for setting up Kubernetes Service Account (KSA) after GKE
cluster is created. Disable by setting to null.

- `name`: The KSA name to be used for Pods
- `namespace`: The KSA namespace to be used for Pods

Related Docs: [Workload Identity](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity) |
object({
name = string
namespace = string
})
|
{
"name": "aiinfra-gke-sa",
"namespace": "default"
}
| no | +| [network\_existing](#input\_network\_existing) | Existing network to attach to nic0. Setting to null will create a new network for it. |
object({
network_name = string
subnetwork_name = string
})
| `null` | no | +| [node\_pools](#input\_node\_pools) | The list of node pools for the GKE cluster.
- `zone`: The zone in which the node pool's nodes should be located. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#node_locations)
- `node_count`: The number of nodes per node pool. This field can be used to update the number of nodes per node pool. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#node_count)
- `machine_type`: (Optional) The machine type for the node pool. Only supported machine types are 'a3-highgpu-8g' and 'a2-highgpu-1g'. [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#machine_type)
- `compact_placement_policy`:(Optional) The object for superblock level compact placement policy for the instances. Currently only 1 resource policy is supported. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#policy_name)
- `new_policy`: (Optional) Flag for creating a new resource policy.
- `existing_policy_name`: (Optional) The existing resource policy. |
list(object({
zone = string,
node_count = number,
machine_type = optional(string, "a3-highgpu-8g"),
compact_placement_policy = optional(object({
new_policy = optional(bool, false)
existing_policy_name = optional(string)
specific_reservation = optional(string)
}))
}))
| `[]` | no | +| [node\_service\_account](#input\_node\_service\_account) | The service account to be used by the Node VMs. If not specified, the "default" service account is used.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#nested_node_config), [gcloud](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create#--service-account). | `string` | `null` | no | +| [project\_id](#input\_project\_id) | GCP Project ID to which the cluster will be deployed. | `string` | n/a | yes | +| [region](#input\_region) | The region in which the cluster master will be created. The cluster will be a regional cluster with multiple masters spread across zones in the region, and with default node locations in those zones as well. | `string` | n/a | yes | +| [resource\_prefix](#input\_resource\_prefix) | Arbitrary string with which all names of newly created resources will be prefixed. | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [id](#output\_id) | Google Kubernetes cluster id | +| [name](#output\_name) | Google Kubernetes cluster name | + \ No newline at end of file diff --git a/a3-mega/terraform/modules/cluster/gke/kubectl-apply/README.md b/a3-mega/terraform/modules/cluster/gke/kubectl-apply/README.md new file mode 100644 index 000000000..14741a120 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/gke/kubectl-apply/README.md @@ -0,0 +1,52 @@ + +## License + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 0.13 | +| [http](#requirement\_http) | >= 3.3 | +| [kubectl](#requirement\_kubectl) | >= 1.7.0 | +| [kubernetes](#requirement\_kubernetes) | ~> 2.10 | + +## Providers + +| Name | Version | +|------|---------| +| [google](#provider\_google) | n/a | +| [http](#provider\_http) | >= 3.3 | +| [kubectl](#provider\_kubectl) | >= 1.7.0 | +| [kubernetes](#provider\_kubernetes) | ~> 2.10 | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [google_service_account_iam_binding.default-workload-identity](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/service_account_iam_binding) | resource | +| [kubectl_manifest.installer_daemonsets](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | +| [kubernetes_service_account.ksa](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account) | resource | +| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | +| [google_container_cluster.gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source | +| [http_http.installer_daemonsets](https://registry.terraform.io/providers/hashicorp/http/latest/docs/data-sources/http) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [cluster\_id](#input\_cluster\_id) | An identifier for the resource with format projects//locations//clusters/. | `string` | n/a | yes | +| [daemonsets](#input\_daemonsets) | Daemonsets to install with kubectl apply -f | `map(string)` | n/a | yes | +| [enable](#input\_enable) | This module cannot have for\_each, count, or depends\_on attributes because
it contains provider blocks. Conditionally enable this module by setting
this variable. | `bool` | n/a | yes | +| [gcp\_sa](#input\_gcp\_sa) | Google Cloud Platform service account email to which the
Kubernetes Service Account (KSA) will be bound. | `string` | n/a | yes | +| [ksa](#input\_ksa) | The configuration for setting up Kubernetes Service Account (KSA) after GKE
cluster is created.

- `name`: The KSA name to be used for Pods
- `namespace`: The KSA namespace to be used for Pods

Related Docs: [Workload Identity](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity) |
object({
name = string
namespace = string
})
| n/a | yes | +| [project\_id](#input\_project\_id) | Name of the project to use for instantiating clusters. | `string` | n/a | yes | + +## Outputs + +No outputs. + \ No newline at end of file diff --git a/a3-mega/terraform/modules/cluster/gke/kubectl-apply/aiinfra-ksa.tf b/a3-mega/terraform/modules/cluster/gke/kubectl-apply/aiinfra-ksa.tf new file mode 100644 index 000000000..572dbfec4 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/gke/kubectl-apply/aiinfra-ksa.tf @@ -0,0 +1,92 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + split_cluster_id = split("/", var.cluster_id) + kube_host = var.enable ? ( + "https://${data.google_container_cluster.gke_cluster[0].endpoint}" + ) : "" + kube_cert = var.enable ? base64decode( + data.google_container_cluster.gke_cluster[0].master_auth.0.cluster_ca_certificate + ) : "" +} + +data "google_container_cluster" "gke_cluster" { + count = var.enable ? 
1 : 0 + + project = var.project_id + name = local.split_cluster_id[5] + location = local.split_cluster_id[3] +} + +data "google_client_config" "default" {} + +provider "kubernetes" { + host = local.kube_host + cluster_ca_certificate = local.kube_cert + token = data.google_client_config.default.access_token +} + +provider "kubectl" { + host = local.kube_host + cluster_ca_certificate = local.kube_cert + token = data.google_client_config.default.access_token + load_config_file = false +} + +// Creating and Annotating KSA with google service account +resource "kubernetes_service_account" "ksa" { + count = var.enable ? 1 : 0 + + automount_service_account_token = false + metadata { + name = var.ksa.name + namespace = var.ksa.namespace + annotations = { + "iam.gke.io/gcp-service-account" = var.gcp_sa + } + } + + depends_on = [data.google_container_cluster.gke_cluster] +} + +// Binding KSA to google service account. +resource "google_service_account_iam_binding" "default-workload-identity" { + count = var.enable ? 1 : 0 + + service_account_id = "projects/${var.project_id}/serviceAccounts/${var.gcp_sa}" + role = "roles/iam.workloadIdentityUser" + members = [ + "serviceAccount:${var.project_id}.svc.id.goog[${var.ksa.namespace}/${var.ksa.name}]", + ] + + depends_on = [resource.kubernetes_service_account.ksa] +} + +data "http" "installer_daemonsets" { + for_each = var.enable ? var.daemonsets : {} + + url = each.value +} + +resource "kubectl_manifest" "installer_daemonsets" { + for_each = var.enable ? 
var.daemonsets : {} + + yaml_body = data.http.installer_daemonsets[each.key].response_body + wait_for_rollout = false + + depends_on = [resource.google_service_account_iam_binding.default-workload-identity] +} diff --git a/a3-mega/terraform/modules/cluster/gke/kubectl-apply/variables.tf b/a3-mega/terraform/modules/cluster/gke/kubectl-apply/variables.tf new file mode 100644 index 000000000..21ee693c2 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/gke/kubectl-apply/variables.tf @@ -0,0 +1,73 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +variable "cluster_id" { + description = "An identifier for the resource with format projects//locations//clusters/." + type = string + nullable = false +} + +variable "daemonsets" { + description = "Daemonsets to install with kubectl apply -f " + type = map(string) + nullable = false + + validation { + condition = length(var.daemonsets) != 0 + error_message = "must specify at least one daemonset" + } +} + +variable "enable" { + description = <<-EOT + This module cannot have for_each, count, or depends_on attributes because + it contains provider blocks. Conditionally enable this moduel by setting + this variable. + EOT + type = bool + nullable = false +} + +variable "ksa" { + description = <<-EOT + The configuration for setting up Kubernetes Service Account (KSA) after GKE + cluster is created. 
+ + - `name`: The KSA name to be used for Pods + - `namespace`: The KSA namespace to be used for Pods + + Related Docs: [Workload Identity](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity) + EOT + type = object({ + name = string + namespace = string + }) +} + +variable "gcp_sa" { + description = <<-EOT + Google Cloud Platform service account email to which the + Kubernetes Service Account (KSA) will be bound. + EOT + type = string + nullable = false +} + +variable "project_id" { + description = "Name of the project to use for instantiating clusters." + type = string + nullable = false +} diff --git a/a3-mega/terraform/modules/cluster/gke/kubectl-apply/versions.tf b/a3-mega/terraform/modules/cluster/gke/kubectl-apply/versions.tf new file mode 100644 index 000000000..ddfb38fef --- /dev/null +++ b/a3-mega/terraform/modules/cluster/gke/kubectl-apply/versions.tf @@ -0,0 +1,34 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +terraform { + required_version = ">= 0.13" + + required_providers { + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.10" + } + kubectl = { + source = "gavinbunney/kubectl" + version = ">= 1.7.0" + } + http = { + source = "hashicorp/http" + version = ">= 3.3" + } + } +} diff --git a/a3-mega/terraform/modules/cluster/gke/main.tf b/a3-mega/terraform/modules/cluster/gke/main.tf new file mode 100644 index 000000000..8bf307abe --- /dev/null +++ b/a3-mega/terraform/modules/cluster/gke/main.tf @@ -0,0 +1,320 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + gke_master_version = var.gke_version != null ? var.gke_version : data.google_container_engine_versions.gkeversion.latest_master_version + node_service_account = var.node_service_account == null ? 
data.google_compute_default_service_account.account.email : var.node_service_account + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/dataaccessauditlogging", + ] +} + +data "google_compute_default_service_account" "account" { + project = var.project_id +} + +data "google_client_config" "current" {} + +data "google_container_engine_versions" "gkeversion" { + location = var.region + project = var.project_id +} + +module "network" { + source = "../../common/network" + + nic0_existing = var.network_existing + project_id = var.project_id + region = var.region + resource_prefix = var.resource_prefix +} + +module "dashboard" { + source = "../../common/dashboard" + count = var.enable_gke_dashboard ? 1 : 0 + + enable_gce_gke_gpu_utilization_widgets = true + enable_nvidia_dcgm_widgets = true + enable_nvidia_nvml_widgets = true + project_id = var.project_id + resource_prefix = var.resource_prefix +} + +module "resource_policy" { + source = "../../common/resource_policy" + for_each = { + for idx, node_pool in var.node_pools : "np-${idx}" => node_pool + if node_pool.compact_placement_policy != null + } + project_id = var.project_id + new_resource_policy_name = each.value.compact_placement_policy.new_policy ? "${var.resource_prefix}-${each.key}" : null + existing_resource_policy_name = each.value.compact_placement_policy.existing_policy_name + region = var.region +} + +# Definition of the private GKE cluster. +resource "google_container_cluster" "cluster" { + provider = google-beta + + project = var.project_id + name = var.resource_prefix + location = var.region + + # We need to explicitly manage the node pool to enable features such as + # auto-upgrade and auto-scaling, but we can't create a cluster with no node + # pool defined. So we create the smallest possible default node pool and + # immediately delete it. This is a best-practice suggested in the Terraform + # documentation for the container_cluster resource. 
+ remove_default_node_pool = true + initial_node_count = 1 + min_master_version = local.gke_master_version + deletion_protection = false + + network = module.network.network_self_links[0] + subnetwork = module.network.subnetwork_self_links[0] + + master_authorized_networks_config { + } + + master_auth { + client_certificate_config { + issue_client_certificate = false + } + } + + # Enable shielded nodes to meet go/gke-cluster-pattern#req1.1.5 + enable_shielded_nodes = true + + cluster_autoscaling { + autoscaling_profile = "OPTIMIZE_UTILIZATION" + # The name of this attribute is very misleading, it controls node + # autoprovisioning (NAP), not autoscaling. + enabled = false + } + + network_policy { + # Enabling NetworkPolicy for clusters with DatapathProvider=ADVANCED_DATAPATH + # is not allowed. Dataplane V2 will take care of network policy enforcement + # instead. + enabled = false + # GKE Dataplane V2 support. This must be set to PROVIDER_UNSPECIFIED in + # order to let the datapath_provider take effect. + # https://github.com/terraform-google-modules/terraform-google-kubernetes-engine/issues/656#issuecomment-720398658 + provider = "PROVIDER_UNSPECIFIED" + } + + # This change will also enable the metadata server on nodes. + # go/gke-cluster-pattern#req4.1.1#req1.1.5 (parts of, vTPM is another section) + workload_identity_config { + workload_pool = "${var.project_id}.svc.id.goog" + } + + authenticator_groups_config { + # Contact safer-gcp to get your group whitelisted for access. + # Beta feaure: don't depend on it for breakglass access. 
+ security_group = "gke-security-groups@google.com" + } + + datapath_provider = "ADVANCED_DATAPATH" + networking_mode = "VPC_NATIVE" + ip_allocation_policy { + cluster_ipv4_cidr_block = "/14" + services_ipv4_cidr_block = "/20" + } + + + release_channel { + channel = "UNSPECIFIED" + } + + addons_config { + gce_persistent_disk_csi_driver_config { + enabled = true + } + gcs_fuse_csi_driver_config { + enabled = true + } + } + + # enable multi-NIC network + enable_multi_networking = true + + lifecycle { + # Ignore all changes to the default node pool. It's being removed + # after creation anyway. + ignore_changes = [ + node_config + ] + } + + logging_service = "logging.googleapis.com/kubernetes" + monitoring_service = "monitoring.googleapis.com/kubernetes" + + timeouts { + create = "120m" + update = "120m" + } +} + +# We define explicit node pools, so that it can be modified without +# having to destroy the entire cluster. +resource "google_container_node_pool" "node-pools" { + provider = google-beta + count = length(var.node_pools) + + project = var.project_id + name = "np-${count.index}" + cluster = google_container_cluster.cluster.id + node_locations = [var.node_pools[count.index].zone] + node_count = var.node_pools[count.index].node_count + + upgrade_settings { + max_surge = 0 + max_unavailable = 1 + } + + management { + auto_repair = true + # disabling auto_upgrade to stop automatic upgrade during customer workload execution. 
+ auto_upgrade = false + } + + node_config { + service_account = local.node_service_account + machine_type = var.node_pools[count.index].machine_type + image_type = "COS_CONTAINERD" + disk_size_gb = var.disk_size_gb + disk_type = var.disk_type + + ephemeral_storage_local_ssd_config { + local_ssd_count = 16 + } + + shielded_instance_config { + enable_secure_boot = true + enable_integrity_monitoring = true + } + + gvnic { + enabled = true + } + + # Implied by Workload Identity + workload_metadata_config { + mode = "GKE_METADATA" + } + + # Implied by workload identity. + metadata = { + "disable-legacy-endpoints" = "true" + } + + labels = { + "cloud.google.com/gke-kdump-enabled" = "true" + } + + dynamic "host_maintenance_policy" { + for_each = var.host_maintenance_interval != null ? [1] : [] + content { + maintenance_interval = var.host_maintenance_interval + } + } + + dynamic "reservation_affinity" { + for_each = try( + var.node_pools[count.index].compact_placement_policy.specific_reservation, + null + ) != null ? [ + var.node_pools[count.index].compact_placement_policy.specific_reservation + ] : [] + content { + consume_reservation_type = "SPECIFIC_RESERVATION" + key = "compute.googleapis.com/reservation-name" + values = [reservation_affinity.value] + } + } + + oauth_scopes = local.oauth_scopes + } + + network_config { + dynamic "additional_node_network_configs" { + for_each = toset(range(1, length(module.network.network_names))) + iterator = id + content { + network = module.network.network_names[id.value] + subnetwork = module.network.subnetwork_names[id.value] + } + } + } + + dynamic "placement_policy" { + for_each = var.node_pools[count.index].compact_placement_policy != null ? 
[1] : [] + content { + type = "COMPACT" + policy_name = module.resource_policy["np-${count.index}"].resource_name + } + } + + lifecycle { + ignore_changes = [ + node_config[0].labels, + node_config[0].taint, + ] + } + timeouts { + create = "10m" + update = "10m" + } +} + + +# For container logs to show up under Cloud Logging and GKE metrics to show up +# on Cloud Monitoring console, some project level roles are needed for the +# node_service_account +resource "google_project_iam_member" "node_service_account_logWriter" { + project = var.project_id + role = "roles/logging.logWriter" + member = "serviceAccount:${local.node_service_account}" +} + +resource "google_project_iam_member" "node_service_account_metricWriter" { + project = var.project_id + role = "roles/monitoring.metricWriter" + member = "serviceAccount:${local.node_service_account}" +} + +resource "google_project_iam_member" "node_service_account_monitoringViewer" { + project = var.project_id + role = "roles/monitoring.viewer" + member = "serviceAccount:${local.node_service_account}" +} + +module "kubectl-apply" { + source = "./kubectl-apply" + + cluster_id = resource.google_container_cluster.cluster.id + daemonsets = { + device_plugin = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/cmd/nvidia_gpu/device-plugin.yaml" + nvidia_driver = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml" + nccl_plugin = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-fastrak/nccl-fastrak-installer.yaml" # TODO dead link + } + enable = var.ksa != null + ksa = var.ksa + gcp_sa = local.node_service_account + project_id = var.project_id +} diff --git a/a3-mega/terraform/modules/cluster/gke/outputs.tf b/a3-mega/terraform/modules/cluster/gke/outputs.tf new file mode 100644 index 000000000..9dd8cfed6 --- /dev/null +++ 
b/a3-mega/terraform/modules/cluster/gke/outputs.tf @@ -0,0 +1,25 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +output "name" { + value = resource.google_container_cluster.cluster.name + description = "Google Kubernetes cluster name" +} + +output "id" { + value = resource.google_container_cluster.cluster.id + description = "Google Kubernetes cluster id" +} diff --git a/a3-mega/terraform/modules/cluster/gke/variables.tf b/a3-mega/terraform/modules/cluster/gke/variables.tf new file mode 100644 index 000000000..68d2d70e2 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/gke/variables.tf @@ -0,0 +1,168 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +variable "disk_size_gb" { + description = <<-EOT + Size of the disk attached to each node, specified in GB. The smallest allowed disk size is 10GB. Defaults to 200GB. 
+ + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#disk_size_gb), [gcloud](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create#--disk-size). + EOT + type = number + default = 200 + nullable = false +} + +variable "disk_type" { + description = <<-EOT + Type of the disk attached to each node. The default disk type is 'pd-standard' + + Possible values: `["pd-ssd", "local-ssd", "pd-balanced", "pd-standard"]` + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#disk_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create#--disk-type). + EOT + type = string + default = "pd-ssd" + nullable = false +} + +variable "enable_gke_dashboard" { + description = <<-EOT + Flag to enable GPU usage dashboards for the GKE cluster. + EOT + type = bool + default = true + nullable = false +} + +variable "gke_version" { + description = <<-EOT + The GKE version to be used as the minimum version of the master. The default value for that is latest master version. + More details can be found [here](https://cloud.google.com/kubernetes-engine/versioning#specifying_cluster_version) + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#name), [gcloud](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create#--name). + EOT + type = string + default = null +} + +variable "host_maintenance_interval" { + description = "Specifies the frequency of planned maintenance events. 'PERIODIC' is the only supported value for host_maintenance_interval." + type = string + default = "PERIODIC" + validation { + condition = var.host_maintenance_interval != null ? contains( + ["PERIODIC"], + var.host_maintenance_interval, + ) : true + error_message = "'PERIODIC' is the only supported value for host_maintenance_interval." 
+ } +} + +variable "ksa" { + description = <<-EOT + The configuration for setting up Kubernetes Service Account (KSA) after GKE + cluster is created. Disable by setting to null. + + - `name`: The KSA name to be used for Pods + - `namespace`: The KSA namespace to be used for Pods + + Related Docs: [Workload Identity](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity) + EOT + type = object({ + name = string + namespace = string + }) + default = { + name = "aiinfra-gke-sa" + namespace = "default" + } +} + +variable "network_existing" { + description = "Existing network to attach to nic0. Setting to null will create a new network for it." + type = object({ + network_name = string + subnetwork_name = string + }) + default = null +} + +variable "node_pools" { + description = <<-EOT + The list of node pools for the GKE cluster. + - `zone`: The zone in which the node pool's nodes should be located. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#node_locations) + - `node_count`: The number of nodes per node pool. This field can be used to update the number of nodes per node pool. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#node_count) + - `machine_type`: (Optional) The machine type for the node pool. Only supported machine types are 'a3-highgpu-8g' and 'a2-highgpu-1g'. [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#machine_type) + - `compact_placement_policy`:(Optional) The object for superblock level compact placement policy for the instances. Currently only 1 resource policy is supported. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#policy_name) + - `new_policy`: (Optional) Flag for creating a new resource policy. 
+ - `existing_policy_name`: (Optional) The existing resource policy. + EOT + type = list(object({ + zone = string, + node_count = number, + machine_type = optional(string, "a3-megagpu-8g"), + compact_placement_policy = optional(object({ + new_policy = optional(bool, false) + existing_policy_name = optional(string) + specific_reservation = optional(string) + })) + })) + default = [] + nullable = false + + validation { + condition = length(var.node_pools) != 0 + error_message = "must be non-empty list" + } + + validation { + condition = alltrue([ + for rp in var.node_pools[*].compact_placement_policy + : rp != null ? ( + rp.new_policy != (rp.existing_policy_name != null || rp.specific_reservation != null) + ) : true + ]) + error_message = "must specify exactly one of `new_compact` or `existing_name`" + } +} + +variable "node_service_account" { + description = <<-EOT + The service account to be used by the Node VMs. If not specified, the "default" service account is used. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#nested_node_config), [gcloud](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create#--service-account). + EOT + type = string + default = null +} + +variable "project_id" { + description = "GCP Project ID to which the cluster will be deployed." + type = string + nullable = false +} + +variable "region" { + description = "The region in which the cluster master will be created. The cluster will be a regional cluster with multiple masters spread across zones in the region, and with default node locations in those zones as well." + type = string + nullable = false +} + +variable "resource_prefix" { + description = "Arbitrary string with which all names of newly created resources will be prefixed." 
+ type = string + nullable = false +} diff --git a/a3-mega/terraform/modules/cluster/mig-cos/README.md b/a3-mega/terraform/modules/cluster/mig-cos/README.md new file mode 100644 index 000000000..e65e9bd39 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/mig-cos/README.md @@ -0,0 +1,65 @@ + +Copyright 2022 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +No requirements. + +## Providers + +No providers. + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [cloudinit](#module\_cloudinit) | ./cloudinit | n/a | +| [compute\_instance\_group\_manager](#module\_compute\_instance\_group\_manager) | ../../common/instance_group_manager | n/a | +| [compute\_instance\_template](#module\_compute\_instance\_template) | ../../common/instance_template | n/a | +| [filestore](#module\_filestore) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/file-system/filestore// | v1.17.0 | +| [network](#module\_network) | ../../common/network | n/a | + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [container](#input\_container) | Container image to start on boot on each instance. All `local_mount`s found in `filestore_new` and `gcsfuse_existing` will be visible within the container.

Attributes:
- `image`: docker image which will get pulled and started at boot on each instance (Related docs: [docker](https://docs.docker.com/engine/reference/commandline/build/#tag)).
- `cmd`: arguments to the entrypoint of the docker image (Related docs: [docker](https://docs.docker.com/engine/reference/builder/#cmd)). Defaults to `[]`.
- `run_at_boot`: automatically start a container on each instance when they are created (will still pull image at boot when set to `false`). Defaults to `true`.
- `run_options`: the additional options to pass during docker run.
- `custom`: any other `docker run` options (Related docs: [docker](https://docs.docker.com/engine/reference/commandline/run/#options)). `docker run` flags already added (`container.run_options.custom` will be appended to this list): `--detach --hostname $(hostname) --ipc host --name aiinfra --network host --privileged --restart always`. Defaults to `[]`.
- `enable_cloud_logging`: the flag to enable GCP cloud logging (`--log-driver=gcplogs`) for the containers (Related docs: [docker](https://cloud.google.com/community/tutorials/docker-gcplogs-driver)). Defaults to `false`.
- `env`: environment variables for the docker container (Related docs: [docker](https://docs.docker.com/engine/reference/commandline/run/#env)). Defaults to `{}`. |
object({
image = string
cmd = string
run_at_boot = bool
run_options = object({
custom = list(string)
enable_cloud_logging = bool
env = map(string)
})
})
| `null` | no | +| [disk\_size\_gb](#input\_disk\_size\_gb) | The size of the image in gigabytes for the boot disk of each instance.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#disk_size_gb), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--boot-disk-size). | `number` | `128` | no | +| [disk\_type](#input\_disk\_type) | The GCE disk type for the boot disk of each instance.

Possible values: `["pd-ssd", "local-ssd", "pd-balanced", "pd-standard"]`

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#disk_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--boot-disk-type). | `string` | `"pd-ssd"` | no | +| [enable\_install\_gpu](#input\_enable\_install\_gpu) | Setting this to false will disable a built-in startup script which:
- installs GPU drivers
- configures docker auth
- installs iptables rules
- installs NCCL and GPUDirectTCPX plugin

Any installation replacements should be in the startup\_script variable | `bool` | `true` | no | +| [filestore\_new](#input\_filestore\_new) | Configurations to mount newly created network storage. Each object describes NFS file-servers to be hosted in Filestore.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#inputs).

------------
`filestore_new.filestore_tier`

The service tier of the instance.

Possible values: `["BASIC_HDD", "BASIC_SSD", "HIGH_SCALE_SSD", "ENTERPRISE"]`.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_filestore_tier), [gcloud](https://cloud.google.com/sdk/gcloud/reference/filestore/instances/create#--tier).

------------
`filestore_new.local_mount`

Mountpoint for this filestore instance.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_local_mount).

------------
`filestore_new.size_gb`

Storage size of the filestore instance in GB.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_size_gb), [gcloud](https://cloud.google.com/sdk/gcloud/reference/filestore/instances/create#--file-share).

------------
`filestore_new.zone`

Location for filestore instance.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_zone). |
list(object({
filestore_tier = string
local_mount = string
size_gb = number
zone = string
}))
| `[]` | no | +| [gcsfuse\_existing](#input\_gcsfuse\_existing) | Configurations to mount existing network storage. Each object describes Cloud Storage Buckets to be mounted with Cloud Storage FUSE.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/pre-existing-network-storage#inputs).

------------
`gcsfuse_existing.local_mount`

The mount point where the contents of the device may be accessed after mounting.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/pre-existing-network-storage#input_local_mount).

------------
`gcsfuse_existing.remote_mount`

Bucket name without “gs://”.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/pre-existing-network-storage#input_remote_mount). |
list(object({
local_mount = string
remote_mount = string
}))
| `[]` | no | +| [instance\_groups](#input\_instance\_groups) | Required Fields:
- `target_size`: The number of running instances for this managed instance group. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#target_size), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--size).
- `zone`: The zone that instances in this group should be created in. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#zone), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--zone).
- `machine_type`: (Optional) The name of a Google Compute Engine machine type. There are [many possible values](https://cloud.google.com/compute/docs/machine-resource). Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#machine_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--machine-type).
- `existing_resource_policy_name`: (Optional) The existing resource policy. |
list(object({
zone = string
target_size = number
machine_type = optional(string, "a3-highgpu-8g")
existing_resource_policy_name = optional(string, null)
}))
| n/a | yes | +| [labels](#input\_labels) | The resource labels (a map of key/value pairs) to be applied to the GPU cluster.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#labels), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--labels). | `map(string)` | `{}` | no | +| [machine\_image](#input\_machine\_image) | The image with which this disk will initialize. This image must be in the project `cos-cloud`.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#source_image).

------------
`machine_image.family`

The family of images from which the latest non-deprecated image will be selected. Conflicts with `machine_image.name`.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#name), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image-family).

------------
`machine_image.name`

The name of a specific image. Conflicts with `machine_image.family`.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#name), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image).

------------
`machine_image.project`

The project\_id to which this image belongs.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#project), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image-project). |
object({
family = string
name = string
project = string
})
|
{
"family": "cos-stable",
"name": null,
"project": "cos-cloud"
}
| no | +| [maintenance\_interval](#input\_maintenance\_interval) | Specifies the frequency of planned maintenance events. 'PERIODIC' is the only supported value for maintenance\_interval.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#maintenance_interval). | `string` | `null` | no | +| [metadata](#input\_metadata) | GCE metadata to attach to each instance.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#metadata), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--metadata). | `map(string)` | `{}` | no | +| [network\_existing](#input\_network\_existing) | Existing network to attach to nic0. Setting to null will create a new network for it. |
object({
network_name = string
subnetwork_name = string
})
| `null` | no | +| [project\_id](#input\_project\_id) | GCP Project ID to which the cluster will be deployed. | `string` | n/a | yes | +| [region](#input\_region) | The region in which all instances will reside. | `string` | n/a | yes | +| [resource\_prefix](#input\_resource\_prefix) | Arbitrary string with which all names of newly created resources will be prefixed. | `string` | n/a | yes | +| [service\_account](#input\_service\_account) | Service account to attach to the instance.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#service_account).

------------
`service_account.email`

The service account e-mail address. If not given, the default Google Compute Engine service account is used.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#email), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--service-account).

------------
`service_account.scopes`

A list of service scopes. Both OAuth2 URLs and gcloud short names are supported. To allow full access to all Cloud APIs, use the `"cloud-platform"` scope. See a complete list of scopes [here](https://cloud.google.com/sdk/gcloud/reference/alpha/compute/instances/set-scopes#--scopes).

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#scopes), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--scopes). |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [startup\_script](#input\_startup\_script) | Shell script -- the actual script (not the filename). | `string` | `null` | no | +| [use\_compact\_placement\_policy](#input\_use\_compact\_placement\_policy) | The flag to create and use a superblock level compact placement policy for the instances. Currently GCE supports using only 1 placement policy. | `bool` | `false` | no | +| [wait\_for\_instances](#input\_wait\_for\_instances) | Whether to wait for all instances to be created/updated before returning. Note that if this is set to true and the operation does not succeed, Terraform will continue trying until it times out.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_region_instance_group_manager#wait_for_instances). | `bool` | `true` | no | + +## Outputs + +No outputs. + \ No newline at end of file diff --git a/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/README.md b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/README.md new file mode 100644 index 000000000..bbfe8b5df --- /dev/null +++ b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/README.md @@ -0,0 +1,51 @@ + +Copyright 2022 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +No requirements. + +## Providers + +| Name | Version | +|------|---------| +| [cloudinit](#provider\_cloudinit) | n/a | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [cloudinit_config.config](https://registry.terraform.io/providers/hashicorp/cloudinit/latest/docs/data-sources/config) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [container](#input\_container) | n/a |
object({
image = string
cmd = string
run_at_boot = bool
run_options = object({
custom = list(string)
enable_cloud_logging = bool
env = map(string)
})
})
| n/a | yes | +| [enable\_install\_gpu](#input\_enable\_install\_gpu) | n/a | `bool` | n/a | yes | +| [filestores](#input\_filestores) | n/a |
list(object({
local_mount = string
remote_mount = string
}))
| n/a | yes | +| [gcsfuses](#input\_gcsfuses) | n/a |
list(object({
local_mount = string
remote_mount = string
}))
| n/a | yes | +| [startup\_script](#input\_startup\_script) | n/a | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [user-data](#output\_user-data) | n/a | + \ No newline at end of file diff --git a/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/main.tf b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/main.tf new file mode 100644 index 000000000..67b5e6a75 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/main.tf @@ -0,0 +1,209 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +locals { + _filestore_host_mount = "/tmp/cloud/filestore_mnt" + _gcsfuse_host_mount = "/tmp/cloud/gcsfuse_mnt" + + _startup_scripts_template_variables = { + install_gpu = var.enable_install_gpu + script = var.startup_script != null ? 
replace(var.startup_script, "\n", "\n ") : "" + } + + _network_storage_template_variables = { + filestore_mount_commands = join( + " && ", + concat( + ["true"], // dummy to make join return non-null + [ + for f in var.filestores + : "mount -t nfs -o async,hard,rw ${f.remote_mount} ${local._filestore_host_mount}${f.local_mount}" + ], + ["true"], // dummy to make join return non-null + ) + ) + gcsfuse_host_mount = local._gcsfuse_host_mount + gcsfuse_mount_commands = join( + " && ", + concat( + ["true"], // dummy to make join return non-null + [ + for g in var.gcsfuses + : "docker exec gcsfuse gcsfuse --implicit-dirs ${g.remote_mount} ${local._gcsfuse_host_mount}${g.local_mount}" + ], + ["true"], // dummy to make join return non-null + ) + ) + host_mountpoints = join( + " ", + concat( + [ + for m in var.filestores[*].local_mount + : "${local._filestore_host_mount}${m}" + ], + [ + for m in var.gcsfuses[*].local_mount + : "${local._gcsfuse_host_mount}${m}" + ], + ["."], // dummy to make join return non-null and have mkdir succeed + ), + ) + } + + _container = { + cmd = try(var.container.cmd != null ? var.container.cmd : "", "") + image = try(var.container.image != null ? var.container.image : "", "") + run_at_boot = try( + var.container.run_at_boot != null ? var.container.run_at_boot : true, + false, + ) + run_options = { + custom = try( + var.container.run_options.custom != null ? ( + var.container.run_options.custom + ) : [], + [], + ) + enable_cloud_logging = try( + var.container.run_options.enable_cloud_logging != null ? ( + var.container.run_options.enable_cloud_logging + ) : false, + false, + ) + env = try( + var.container.run_options.env != null ? 
( + var.container.run_options.env + ) : {}, + {} + ) + } + } + + _container_template_variables = { + docker_cmd = local._container.cmd + docker_image = local._container.image + docker_run_options = join( + " ", + concat( + [ + for name, value in local._container.run_options.env + : "--env ${name}=${value}" + ], + local._container.run_options.enable_cloud_logging ? [ + "--log-driver=gcplogs" + ] : [], + local._container.run_options.custom, + [""], // dummy to make join return non-null + ), + ) + docker_volume_flags = join( + " ", + concat( + [ + for m in var.filestores[*].local_mount + : "--volume ${local._filestore_host_mount}${m}:${m}:rw" + ], + [ + for m in var.gcsfuses[*].local_mount + : "--volume ${local._gcsfuse_host_mount}${m}:${m}:rw,rslave" + ], + [""], // dummy to make join return non-null + ), + ) + docker_device_flags = join( + " ", + [ + "--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64", + "--volume /var/lib/nvidia/bin:/usr/local/nvidia/bin", + "--device /dev/nvidia-uvm:/dev/nvidia-uvm", + "--device /dev/nvidiactl:/dev/nvidiactl", + "$${device_flags}", + ], + ) + requirements = join( + " ", + concat( + [ + "aiinfra-network-storage.service", + "aiinfra-pull-image.service", + "aiinfra-startup-scripts.service", + ], + ) + ) + } + + _userdata_template_variables = merge( + { + startup_scripts = { + file = templatefile( + "${path.module}/templates/aiinfra_startup_scripts.yaml.template", + local._startup_scripts_template_variables, + ) + service = "aiinfra-startup-scripts" + } + network_storage = { file = "", service = null, } + pull_image = { file = "", service = null, } + start_container = { file = "", service = null, } + }, + var.container != null ? 
{ + network_storage = { + file = templatefile( + "${path.module}/templates/aiinfra_network_storage.yaml.template", + local._network_storage_template_variables, + ) + service = "aiinfra-network-storage" + } + pull_image = { + file = templatefile( + "${path.module}/templates/aiinfra_pull_image.yaml.template", + local._container_template_variables, + ) + service = "aiinfra-pull-image" + } + start_container = { + file = templatefile( + "${path.module}/templates/aiinfra_start_container.yaml.template", + local._container_template_variables, + ) + service = local._container.run_at_boot ? "aiinfra-start-container" : null + } + } : {}, + ) + userdata_template_variables = { + aiinfra_network_storage = local._userdata_template_variables.network_storage.file + aiinfra_startup_scripts = local._userdata_template_variables.startup_scripts.file + aiinfra_pull_image = local._userdata_template_variables.pull_image.file + aiinfra_start_container = local._userdata_template_variables.start_container.file + aiinfra_services = join( + " ", + [for k, v in local._userdata_template_variables : v.service if v.service != null], + ) + } +} + +data "cloudinit_config" "config" { + gzip = false + base64_encode = false + + part { + content_type = "text/cloud-config" + content = templatefile( + "${path.module}/templates/userdata.yaml.template", + local.userdata_template_variables, + ) + filename = "userdata.yaml" + } +} diff --git a/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/outputs.tf b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/outputs.tf new file mode 100644 index 000000000..6644961d5 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/outputs.tf @@ -0,0 +1,19 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +output "user-data" { + value = data.cloudinit_config.config.rendered +} diff --git a/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_network_storage.yaml.template b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_network_storage.yaml.template new file mode 100644 index 000000000..1aeb5c81f --- /dev/null +++ b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_network_storage.yaml.template @@ -0,0 +1,52 @@ +- path: /etc/systemd/system/aiinfra/gcsfuse/Dockerfile + permissions: 0644 + owner: root + content: | + FROM debian:bullseye-slim + RUN apt-get update -qq \ + && apt-get install -y curl gnupg \ + && echo "deb http://packages.cloud.google.com/apt gcsfuse-xenial main" >/etc/apt/sources.list.d/gcsfuse.list \ + && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor >/etc/apt/trusted.gpg.d/apt-key.gpg \ + && apt-get update -qq \ + && apt-get install -y gcsfuse \ + && rm -rf /var/lib/{apt,dpkg,cache,log}/ + CMD ["sleep", "infinity"] +- path: /etc/systemd/system/aiinfra/network_storage.sh + permissions: 0755 + owner: root + content: | + #!/bin/sh + [ "${host_mountpoints}" != '.' 
] || exit 0 + mkdir -p ${host_mountpoints} + mount_filestores () { + [ "${filestore_mount_commands}" != 'true && true' ] || return 0 + ${filestore_mount_commands} + } + mount_gcsfuses () { + [ "${gcsfuse_mount_commands}" != 'true && true' ] || return 0 + docker build --tag gcsfuse /etc/systemd/system/aiinfra/gcsfuse \ + && docker run --detach --rm \ + --privileged \ + --volume ${gcsfuse_host_mount}:${gcsfuse_host_mount}:rw,rshared \ + --name gcsfuse \ + gcsfuse \ + && ${gcsfuse_mount_commands} + } + mount_filestores && mount_gcsfuses +- path: /etc/systemd/system/aiinfra-network-storage.service + permissions: 0644 + owner: root + content: | + [Install] + WantedBy=aiinfra.target + [Unit] + After=gcr-online.target docker.socket + Description=Set up network storage + Requires=gcr-online.target docker.socket + [Service] + ExecStart=/etc/systemd/system/aiinfra/network_storage.sh + RemainAfterExit=true + StandardOutput=journal+console + StandardError=journal+console + Type=oneshot + User=root diff --git a/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_pull_image.yaml.template b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_pull_image.yaml.template new file mode 100644 index 000000000..d33e6ce31 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_pull_image.yaml.template @@ -0,0 +1,27 @@ +- path: /etc/systemd/system/aiinfra/pull_image.sh + permissions: 0755 + owner: root + content: | + #!/bin/sh + mount -t tmpfs tmpfs /root + docker-credential-gcr configure-docker + docker_image='${docker_image}' + docker-credential-gcr configure-docker --registries "$${docker_image%%/*}" + docker pull "$${docker_image}" +- path: /etc/systemd/system/aiinfra-pull-image.service + permissions: 0644 + owner: root + content: | + [Install] + WantedBy=aiinfra.target + [Unit] + After=gcr-online.target docker.socket + Description=Pull a docker image + Requires=gcr-online.target docker.socket + [Service] + 
ExecStart=/etc/systemd/system/aiinfra/pull_image.sh + RemainAfterExit=true + StandardError=journal+console + StandardOutput=journal+console + Type=oneshot + User=root diff --git a/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_start_container.yaml.template b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_start_container.yaml.template new file mode 100644 index 000000000..9b66ad440 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_start_container.yaml.template @@ -0,0 +1,37 @@ +- path: /etc/systemd/system/aiinfra/start_container.sh + permissions: 0755 + owner: root + content: | + #!/bin/sh + device_flags=$(find /dev -type c -regex '\/dev\/nvidia[0-9]*' -printf '--device %p:%p ') + if ! docker container ls -a | grep -q aiinfra; then + docker run \ + --detach \ + --hostname $(hostname) \ + --ipc host \ + --name aiinfra \ + --network host \ + --privileged \ + --restart always \ + ${docker_device_flags} \ + ${docker_volume_flags} \ + ${docker_run_options} \ + ${docker_image} ${docker_cmd} + fi +- path: /etc/systemd/system/aiinfra-start-container.service + permissions: 0644 + owner: root + content: | + [Install] + WantedBy=aiinfra.target + [Unit] + After=${requirements} + Description=Run a docker container + Requires=${requirements} + [Service] + ExecStart=/etc/systemd/system/aiinfra/start_container.sh + RemainAfterExit=true + StandardError=journal+console + StandardOutput=journal+console + Type=oneshot + User=root diff --git a/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_startup_scripts.yaml.template b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_startup_scripts.yaml.template new file mode 100644 index 000000000..0897b7df0 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_startup_scripts.yaml.template @@ -0,0 +1,76 @@ +- path: /etc/systemd/system/aiinfra/startup_scripts.sh + permissions: 0755 + 
owner: root + content: | + #!/bin/sh -ex + if ${install_gpu}; then + /etc/systemd/system/aiinfra/startup_script_install_gpu.sh + fi + /etc/systemd/system/aiinfra/startup_script_custom.sh + +- path: /etc/systemd/system/aiinfra/startup_script_install_gpu.sh + permissions: 0755 + owner: root + content: | + #!/bin/sh -ex + + echo 'Installing GPU drivers' + cos-extensions install gpu -- --version=latest + mount --bind /var/lib/nvidia /var/lib/nvidia + mount -o remount,exec /var/lib/nvidia + /var/lib/nvidia/bin/nvidia-smi -pm 1 + + echo 'Configuring docker auth...' + mount -t tmpfs tmpfs /root + docker-credential-gcr configure-docker + docker-credential-gcr configure-docker --registries us-docker.pkg.dev + + echo 'Configuring the Receive Data Path Manager...' + device_flags=$(find /dev -type c -regex '\/dev\/nvidia[0-9]*' -printf '--device %p:%p ') + docker run --pull=always --rm \ + --name receive-datapath-manager \ + --detach \ + --privileged \ + --cap-add=NET_ADMIN --network=host \ + --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 \ + $${device_flags} \ + --device /dev/nvidia-uvm:/dev/nvidia-uvm \ + --device /dev/nvidiactl:/dev/nvidiactl \ + --env LD_LIBRARY_PATH=/usr/local/nvidia/lib64 \ + --volume /run/tcpx:/run/tcpx \ + --entrypoint /tcpgpudmarxd/build/app/tcpgpudmarxd \ + us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.7 \ + --gpu_nic_preset a3vm --gpu_shmem_type fd --uds_path "/run/tcpx" + + echo 'Installing iptable rules...' + /sbin/iptables -I INPUT -p tcp -m tcp -j ACCEPT + + echo 'Configuring NCCL and GPUDirectTCPX plugin...' 
+ docker run --rm \ + --volume /var/lib:/var/lib \ + us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev:v3.1.6_2023_10_06 \ + install --install-nccl + mount --bind /var/lib/tcpx /var/lib/tcpx + mount -o remount,exec /var/lib/tcpx + +- path: /etc/systemd/system/aiinfra/startup_script_custom.sh + permissions: 0755 + owner: root + content: | + ${script} + +- path: /etc/systemd/system/aiinfra-startup-scripts.service + permissions: 0644 + owner: root + content: | + [Install] + WantedBy=aiinfra.target + [Unit] + Description=Startup scripts + [Service] + ExecStart=/etc/systemd/system/aiinfra/startup_scripts.sh + RemainAfterExit=true + StandardOutput=journal+console + StandardError=journal+console + Type=oneshot + User=root diff --git a/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/userdata.yaml.template b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/userdata.yaml.template new file mode 100644 index 000000000..78b4c8502 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/templates/userdata.yaml.template @@ -0,0 +1,18 @@ +# cloud-config + +write_files: +- path: /etc/systemd/system/aiinfra.target + permissions: 0644 + content: | + [Unit] + Description=aiinfra target + After=cloud-init.target +${aiinfra_network_storage} +${aiinfra_startup_scripts} +${aiinfra_pull_image} +${aiinfra_start_container} + +runcmd: +- systemctl daemon-reload +- systemctl enable ${aiinfra_services} +- systemctl start ${aiinfra_services} diff --git a/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/variables.tf b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/variables.tf new file mode 100644 index 000000000..27ac6c785 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/mig-cos/cloudinit/variables.tf @@ -0,0 +1,97 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +variable "container" { + type = object({ + image = string + cmd = string + run_at_boot = bool + run_options = object({ + custom = list(string) + enable_cloud_logging = bool + env = map(string) + }) + }) + + validation { + condition = var.container != null ? alltrue( + [for empty in [null, ""] : var.container.image != empty] + ) : true + error_message = "must have non-empty image" + } +} + +variable "enable_install_gpu" { + type = bool + + validation { + condition = var.enable_install_gpu != null + error_message = "must not be null" + } +} + +variable "filestores" { + type = list(object({ + local_mount = string + remote_mount = string + })) + + validation { + condition = var.filestores != null + error_message = "must not be null" + } + + validation { + condition = alltrue([ + for f in var.filestores + : alltrue([ + for empty in [null, ""] + : f.local_mount != empty && f.remote_mount != empty + ]) + ]) + error_message = "local_mount and remote_mount must not be null" + } +} + +variable "gcsfuses" { + type = list(object({ + local_mount = string + remote_mount = string + })) + + validation { + condition = var.gcsfuses != null + error_message = "must not be null" + } + + validation { + condition = try( + alltrue([ + for g in var.gcsfuses + : alltrue([ + for empty in [null, ""] + : g.local_mount != empty && g.remote_mount != empty + ]) + ]), + true + ) + error_message = "local_mount and remote_mount must not be null" + } +} + +variable "startup_script" { + type = string +} diff --git a/a3-mega/terraform/modules/cluster/mig-cos/main.tf 
b/a3-mega/terraform/modules/cluster/mig-cos/main.tf new file mode 100644 index 000000000..701581b38 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/mig-cos/main.tf @@ -0,0 +1,101 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +locals { + metadata = merge( + { + user-data = module.cloudinit.user-data + google-logging-use-fluentbit = "true" + google-logging-enabled = "true" + }, + var.metadata != null ? var.metadata : {}, + ) +} + +module "network" { + source = "../../common/network" + + nic0_existing = var.network_existing + project_id = var.project_id + region = var.region + resource_prefix = var.resource_prefix +} + +module "filestore" { + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/file-system/filestore//?ref=v1.17.0" + count = length(var.filestore_new) + + deployment_name = var.resource_prefix + filestore_share_name = "nfsshare_${count.index}" + filestore_tier = var.filestore_new[count.index].filestore_tier + local_mount = var.filestore_new[count.index].local_mount + network_id = module.network.network_ids[0] + project_id = var.project_id + region = var.region + size_gb = var.filestore_new[count.index].size_gb + zone = var.filestore_new[count.index].zone + labels = merge(var.labels, { ghpc_role = "file-system" }) +} + +module "cloudinit" { + source = "./cloudinit" + + container = var.container + enable_install_gpu = var.enable_install_gpu + filestores = [ + for n in 
module.filestore[*].network_storage + : { + local_mount = n.local_mount + remote_mount = "${n.server_ip}:${n.remote_mount}" + } + ] + gcsfuses = var.gcsfuse_existing != null ? var.gcsfuse_existing : [] + startup_script = var.startup_script +} + +module "compute_instance_template" { + source = "../../common/instance_template" + count = length(var.instance_groups) + + disk_size_gb = var.disk_size_gb + disk_type = var.disk_type + machine_image = var.machine_image + machine_type = var.instance_groups[count.index].machine_type + maintenance_interval = var.maintenance_interval + metadata = local.metadata + project_id = var.project_id + region = var.region + resource_prefix = var.resource_prefix + service_account = var.service_account + use_compact_placement_policy = var.use_compact_placement_policy + existing_resource_policy_name = var.instance_groups[count.index].existing_resource_policy_name + startup_script = null + subnetwork_self_links = module.network.subnetwork_self_links + network_self_links = module.network.network_self_links + labels = merge(var.labels, { ghpc_role = "compute" }) +} + +module "compute_instance_group_manager" { + source = "../../common/instance_group_manager" + count = length(var.instance_groups) + + project_id = var.project_id + resource_prefix = "${var.resource_prefix}-${count.index}" + zone = var.instance_groups[count.index].zone + instance_template_id = module.compute_instance_template[count.index].id + target_size = var.instance_groups[count.index].target_size + wait_for_instances = var.wait_for_instances +} diff --git a/a3-mega/terraform/modules/cluster/mig-cos/variables.tf b/a3-mega/terraform/modules/cluster/mig-cos/variables.tf new file mode 100644 index 000000000..8af2c7622 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/mig-cos/variables.tf @@ -0,0 +1,325 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +variable "instance_groups" { + description = <<-EOT + Required Fields: + - `target_size`: The number of running instances for this managed instance group. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#target_size), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--size). + - `zone`: The zone that instances in this group should be created in. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#zone), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--zone). + - `machine_type`: (Optional)The name of a Google Compute Engine machine type. There are [many possible values](https://cloud.google.com/compute/docs/machine-resource). Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#machine_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--machine-type). + - `existing_resource_policy_name`: (Optional) The existing resource policy. 
+ EOT + type = list(object({ + zone = string + target_size = number + machine_type = optional(string, "a3-highgpu-8g") + existing_resource_policy_name = optional(string, null) + })) + nullable = false + + validation { + condition = length(var.instance_groups) != 0 + error_message = "must have at least one instance group" + } + + validation { + condition = alltrue([ + for g in var.instance_groups : g.zone != null && g.target_size != null + ]) + error_message = "zone and target_size must not be null" + } +} + +variable "project_id" { + description = "GCP Project ID to which the cluster will be deployed." + type = string + nullable = false +} + +variable "region" { + description = "The region in which all instances will reside." + type = string + nullable = false +} + +variable "resource_prefix" { + description = "Arbitrary string with which all names of newly created resources will be prefixed." + type = string + nullable = false +} + +variable "container" { + description = <<-EOT + Container image to start on boot on each instance. All `local_mount`s found in `filestore_new` and `gcsfuse_existing` will be visible within the container. + + Attributes: + - `image`: docker image which will get pulled and started at boot on each instance (Related docs: [docker](https://docs.docker.com/engine/reference/commandline/build/#tag)). + - `cmd`: arguments to the entrypoint of the docker image (Related docs: [docker](https://docs.docker.com/engine/reference/builder/#cmd)). Defaults to `[]`. + - `run_at_boot`: automatically start a container on each instance when they are created (will still pull image at boot when set to `false`). Defaults to `true`. + - `run_options`: the additional options to pass during docker run. + - `custom`: any other `docker run` options (Related docs: [docker](https://docs.docker.com/engine/reference/commandline/run/#options)). 
`docker run` flags already added (`container.run_options.custom` will be appended to this list): `--detach --hostname $(hostname) --ipc host --name aiinfra --network host --privileged --restart always`. Defaults to `[]`. + - `enable_cloud_logging`: the flag to enable GCP cloud logging (`--log-driver=gcplogs`) for the containers (Related docs: [docker](https://cloud.google.com/community/tutorials/docker-gcplogs-driver)). Defaults to `false`. + - `env`: environment variables for the docker container (Related docs: [docker](https://docs.docker.com/engine/reference/commandline/run/#env)). Defaults to `{}`. + + EOT + type = object({ + image = string + cmd = string + run_at_boot = bool + run_options = object({ + custom = list(string) + enable_cloud_logging = bool + env = map(string) + }) + }) + default = null +} + +variable "enable_install_gpu" { + description = <<-EOT + Setting this to false will disable a built-in startup script which: + - installs GPU drivers + - configures docker auth + - installs iptable rules + - installs NCCL and GPUDirectTCPX plugin + + Any installation replacements should be in the startup_script variable + EOT + type = bool + default = true + nullable = false +} + +variable "disk_size_gb" { + description = <<-EOT + The size of the image in gigabytes for the boot disk of each instance. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#disk_size_gb), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--boot-disk-size). + EOT + type = number + default = 128 +} + +variable "disk_type" { + description = <<-EOT + The GCE disk type for the boot disk of each instance. 
+ + Possible values: `["pd-ssd", "local-ssd", "pd-balanced", "pd-standard"]` + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#disk_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--boot-disk-type). + EOT + type = string + default = "pd-ssd" +} + +variable "filestore_new" { + description = <<-EOT + Configurations to mount newly created network storage. Each object describes NFS file-servers to be hosted in Filestore. + + Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#inputs). + + ------------ + `filestore_new.filestore_tier` + + The service tier of the instance. + + Possible values: `["BASIC_HDD", "BASIC_SSD", "HIGH_SCALE_SSD", "ENTERPRISE"]`. + + Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_filestore_tier), [gcloud](https://cloud.google.com/sdk/gcloud/reference/filestore/instances/create#--tier). + + ------------ + `filestore_new.local_mount` + + Mountpoint for this filestore instance. + + Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_local_mount). + + ------------ + `filestore_new.size_gb` + + Storage size of the filestore instance in GB. + + Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_local_mount), [gcloud](https://cloud.google.com/sdk/gcloud/reference/filestore/instances/create#--file-share). + + ------------ + `filestore_new.zone` + + Location for filestore instance. + + Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_zone). 
+ EOT + type = list(object({ + filestore_tier = string + local_mount = string + size_gb = number + zone = string + })) + default = [] +} + +variable "gcsfuse_existing" { + description = <<-EOT + Configurations to mount existing network storage. Each object describes Cloud Storage Buckets to be mounted with Cloud Storage FUSE. + + Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/pre-existing-network-storage#inputs). + + ------------ + `gcsfuse_existing.local_mount` + + The mount point where the contents of the device may be accessed after mounting. + + Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/pre-existing-network-storage#input_local_mount). + + ------------ + `gcsfuse_existing.remote_mount` + + Bucket name without “gs://”. + + Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/pre-existing-network-storage#input_remote_mount). + EOT + type = list(object({ + local_mount = string + remote_mount = string + })) + default = [] +} + +variable "labels" { + description = <<-EOT + The resource labels (a map of key/value pairs) to be applied to the GPU cluster. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#labels), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--labels). + EOT + type = map(string) + default = {} +} + +variable "machine_image" { + description = <<-EOT + The image with which this disk will initialize. This image must be in the project `cos-cloud`. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#source_image). + + ------------ + `machine_image.family` + + The family of images from which the latest non-deprecated image will be selected. 
Conflicts with `machine_image.name`. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#name), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image-family). + + ------------ + `machine_image.name` + + The name of a specific image. Conflicts with `machine_image.family`. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#name), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image). + + ------------ + `machine_image.project` + + The project_id to which this image belongs. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#project), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image-project). + EOT + type = object({ + family = string + name = string + project = string + }) + default = { + family = "cos-stable" + name = null + project = "cos-cloud" + } +} + +variable "maintenance_interval" { + description = <<-EOT + Specifies the frequency of planned maintenance events. 'PERIODIC' is the only supported value for maintenance_interval. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#maintenance_interval). + EOT + type = string + default = null +} + +variable "metadata" { + description = <<-EOT + GCE metadata to attach to each instance. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#metadata), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--metadata). + EOT + type = map(string) + default = {} +} + +variable "network_existing" { + description = "Existing network to attach to nic0. 
Setting to null will create a new network for it." + type = object({ + network_name = string + subnetwork_name = string + }) + default = null +} + +variable "service_account" { + description = <<-EOT + Service account to attach to the instance. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#service_account). + + ------------ + `service_account.email` + + The service account e-mail address. If not given, the default Google Compute Engine service account is used. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#email), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--service-account). + + ------------ + `service_account.scopes` + + A list of service scopes. Both OAuth2 URLs and gcloud short names are supported. To allow full access to all Cloud APIs, use the `"cloud-platform"` scope. See a complete list of scopes [here](https://cloud.google.com/sdk/gcloud/reference/alpha/compute/instances/set-scopes#--scopes). + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#scopes), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--scopes). + EOT + type = object({ + email = string, + scopes = set(string) + }) + default = null +} + +variable "startup_script" { + description = "Shell script -- the actual script (not the filename)." + type = string + default = null +} + +variable "use_compact_placement_policy" { + description = "The flag to create and use a superblock level compact placement policy for the instances. Currently GCE supports using only 1 placement policy." + type = bool + default = false +} + +variable "wait_for_instances" { + description = <<-EOT + Whether to wait for all instances to be created/updated before returning. 
Note that if this is set to true and the operation does not succeed, Terraform will continue trying until it times out. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_region_instance_group_manager#wait_for_instances). + EOT + type = bool + default = true +} diff --git a/a3-mega/terraform/modules/cluster/mig/README.md b/a3-mega/terraform/modules/cluster/mig/README.md new file mode 100644 index 000000000..9940c196f --- /dev/null +++ b/a3-mega/terraform/modules/cluster/mig/README.md @@ -0,0 +1,70 @@ + +Copyright 2022 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +No requirements. + +## Providers + +No providers. 
+ +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [compute\_instance\_group\_manager](#module\_compute\_instance\_group\_manager) | ../../common/instance_group_manager | n/a | +| [compute\_instance\_template](#module\_compute\_instance\_template) | ../../common/instance_template | n/a | +| [dashboard](#module\_dashboard) | ../../common/dashboard | n/a | +| [filestore](#module\_filestore) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/file-system/filestore// | v1.17.0 | +| [gcsfuse](#module\_gcsfuse) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/file-system/pre-existing-network-storage// | v1.17.0 | +| [network](#module\_network) | ../../common/network | n/a | +| [startup](#module\_startup) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script/ | v1.17.0 | + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [disk\_size\_gb](#input\_disk\_size\_gb) | The size of the image in gigabytes for the boot disk of each instance.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#disk_size_gb), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--boot-disk-size). | `number` | `128` | no | +| [disk\_type](#input\_disk\_type) | The GCE disk type for the boot disk of each instance.

Possible values: `["pd-ssd", "local-ssd", "pd-balanced", "pd-standard"]`

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#disk_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--boot-disk-type). | `string` | `"pd-ssd"` | no | +| [enable\_ops\_agent](#input\_enable\_ops\_agent) | Install [Google Cloud Ops Agent](https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent). | `bool` | `true` | no | +| [enable\_ray](#input\_enable\_ray) | Install [Ray](https://docs.ray.io/en/latest/cluster/getting-started.html). | `bool` | `false` | no | +| [filestore\_new](#input\_filestore\_new) | Configurations to mount newly created network storage. Each object describes NFS file-servers to be hosted in Filestore.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#inputs).

------------
`filestore_new.filestore_tier`

The service tier of the instance.

Possible values: `["BASIC_HDD", "BASIC_SSD", "HIGH_SCALE_SSD", "ENTERPRISE"]`.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_filestore_tier), [gcloud](https://cloud.google.com/sdk/gcloud/reference/filestore/instances/create#--tier).

------------
`filestore_new.local_mount`

Mountpoint for this filestore instance.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_local_mount).

------------
`filestore_new.size_gb`

Storage size of the filestore instance in GB.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_size_gb), [gcloud](https://cloud.google.com/sdk/gcloud/reference/filestore/instances/create#--file-share).
------------
`filestore_new.zone`

Location for filestore instance.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_zone). |
list(object({
filestore_tier = string
local_mount = string
size_gb = number
zone = string
}))
| `[]` | no | +| [gcsfuse\_existing](#input\_gcsfuse\_existing) | Configurations to mount existing network storage. Each object describes Cloud Storage Buckets to be mounted with Cloud Storage FUSE.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/pre-existing-network-storage#inputs).

------------
`gcsfuse_existing.local_mount`

The mount point where the contents of the device may be accessed after mounting.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/pre-existing-network-storage#input_local_mount).

------------
`gcsfuse_existing.remote_mount`

Bucket name without “gs://”.

Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/pre-existing-network-storage#input_remote_mount). |
list(object({
local_mount = string
remote_mount = string
}))
| `[]` | no | +| [instance\_groups](#input\_instance\_groups) | Required Fields:
- `target_size`: The number of running instances for this managed instance group. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#target_size), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--size).
- `zone`: The zone that instances in this group should be created in. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#zone), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--zone).
- `machine_type`: (Optional)The name of a Google Compute Engine machine type. There are [many possible values](https://cloud.google.com/compute/docs/machine-resource). Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#machine_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--machine-type).
- `existing_resource_policy_name`: (Optional) The existing resource policy. |
list(object({
zone = string
target_size = number
machine_type = optional(string, "a3-megagpu-8g")
existing_resource_policy_name = optional(string, null)
}))
| n/a | yes | +| [labels](#input\_labels) | The resource labels (a map of key/value pairs) to be applied to the GPU cluster.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#labels), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--labels). | `map(string)` | `{}` | no | +| [machine\_image](#input\_machine\_image) | The image with which this disk will initialize.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#source_image).

------------
`machine_image.family`

The family of images from which the latest non-deprecated image will be selected. Conflicts with `machine_image.name`.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#name), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image-family).

------------
`machine_image.name`

The name of a specific image. Conflicts with `machine_image.family`.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#name), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image).

------------
`machine_image.project`

The project\_id to which this image belongs.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#project), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image-project). |
object({
family = string
name = string
project = string
})
|
{
"family": "pytorch-latest-gpu-debian-11-py310",
"name": null,
"project": "deeplearning-platform-release"
}
| no | +| [metadata](#input\_metadata) | GCE metadata to attach to each instance.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#metadata), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--metadata). | `map(string)` | `{}` | no | +| [network\_existing](#input\_network\_existing) | Existing network to attach to nic0. Setting to null will create a new network for it. |
object({
network_name = string
subnetwork_name = string
})
| `null` | no | +| [project\_id](#input\_project\_id) | GCP Project ID to which the cluster will be deployed. | `string` | n/a | yes | +| [region](#input\_region) | The region in which all instances will reside. | `string` | n/a | yes | +| [resource\_prefix](#input\_resource\_prefix) | Arbitrary string with which all names of newly created resources will be prefixed. | `string` | n/a | yes | +| [service\_account](#input\_service\_account) | Service account to attach to the instance.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#service_account).

------------
`service_account.email`

The service account e-mail address. If not given, the default Google Compute Engine service account is used.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#email), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--service-account).

------------
`service_account.scopes`

A list of service scopes. Both OAuth2 URLs and gcloud short names are supported. To allow full access to all Cloud APIs, use the `"cloud-platform"` scope. See a complete list of scopes [here](https://cloud.google.com/sdk/gcloud/reference/alpha/compute/instances/set-scopes#--scopes).

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#scopes), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--scopes). |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [startup\_script](#input\_startup\_script) | Shell script -- the actual script (not the filename). | `string` | `null` | no | +| [startup\_script\_file](#input\_startup\_script\_file) | The full path in the VM to the shell script to be executed at VM startup. | `string` | `null` | no | +| [startup\_script\_gcs\_bucket\_path](#input\_startup\_script\_gcs\_bucket\_path) | The storage bucket full path to be used for storing the startup script.
Example: `gs://bucketName/dirName`

If the value is not provided, then a default storage bucket will be created for the script execution.
`storage.buckets.create` IAM permission is needed for creating the default storage bucket. | `string` | `null` | no | +| [use\_compact\_placement\_policy](#input\_use\_compact\_placement\_policy) | The flag to create and use a superblock level compact placement policy for the instances. Currently GCE supports using only 1 placement policy. | `bool` | `false` | no | +| [wait\_for\_instances](#input\_wait\_for\_instances) | Whether to wait for all instances to be created/updated before returning. Note that if this is set to true and the operation does not succeed, Terraform will continue trying until it times out.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_region_instance_group_manager#wait_for_instances). | `bool` | `true` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [instructions](#output\_instructions) | Instructions for accessing the dashboard | + \ No newline at end of file diff --git a/a3-mega/terraform/modules/cluster/mig/main.tf b/a3-mega/terraform/modules/cluster/mig/main.tf new file mode 100644 index 000000000..096f3a602 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/mig/main.tf @@ -0,0 +1,138 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +locals { + startup_runners = concat( + var.enable_ops_agent ? [{ + type = "shell" + destination = "/tmp/enable_ops_agent.sh" + source = "${path.module}/../../../../../scripts/enable_ops_agent.sh" + }] : [], + var.enable_ray ? [{ + type = "shell" + destination = "/tmp/enable_ray.sh" + source = "${path.module}/../../../../../scripts/enable_ray.sh" + args = "1.12.1 26379 8" + }] : [], + var.startup_script != null && var.startup_script != "" ? [{ + type = "shell" + destination = "/tmp/startup_script.sh" + content = var.startup_script + }] : [], + var.startup_script_file != null && var.startup_script_file != "" ? 
[{ + type = "shell" + destination = "/tmp/startup_script_file.sh" + source = var.startup_script_file + }] : [], + ) +} + +module "dashboard" { + source = "../../common/dashboard" + count = var.enable_ops_agent ? 1 : 0 + + enable_gce_gke_gpu_utilization_widgets = false + enable_nvidia_dcgm_widgets = true + enable_nvidia_nvml_widgets = true + project_id = var.project_id + resource_prefix = var.resource_prefix +} + +module "network" { + source = "../../common/network" + + nic0_existing = var.network_existing + project_id = var.project_id + region = var.region + resource_prefix = var.resource_prefix +} + +module "gcsfuse" { + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/file-system/pre-existing-network-storage//?ref=v1.17.0" + count = length(var.gcsfuse_existing) + + fs_type = "gcsfuse" + local_mount = var.gcsfuse_existing[count.index].local_mount + mount_options = "defaults,_netdev,implicit_dirs,allow_other" + remote_mount = var.gcsfuse_existing[count.index].remote_mount +} + +module "filestore" { + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/file-system/filestore//?ref=v1.17.0" + count = length(var.filestore_new) + + deployment_name = var.resource_prefix + filestore_share_name = "nfsshare_${count.index}" + filestore_tier = var.filestore_new[count.index].filestore_tier + local_mount = var.filestore_new[count.index].local_mount + network_id = module.network.network_ids[0] + project_id = var.project_id + region = var.region + size_gb = var.filestore_new[count.index].size_gb + zone = var.filestore_new[count.index].zone + labels = merge(var.labels, { ghpc_role = "file-system" }) +} + +module "startup" { + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script/?ref=v1.17.0" + + deployment_name = var.resource_prefix + labels = merge(var.labels, { ghpc_role = "scripts" }) + project_id = var.project_id + region = var.region + gcs_bucket_path = var.startup_script_gcs_bucket_path + runners = concat( + 
module.gcsfuse[*].client_install_runner, + module.gcsfuse[*].mount_runner, + module.filestore[*].install_nfs_client_runner, + module.filestore[*].mount_runner, + local.startup_runners, + ) +} + +module "compute_instance_template" { + source = "../../common/instance_template" + count = length(var.instance_groups) + + disk_size_gb = var.disk_size_gb + disk_type = var.disk_type + machine_image = var.machine_image + machine_type = var.instance_groups[count.index].machine_type + maintenance_interval = null + metadata = var.metadata + project_id = var.project_id + region = var.region + resource_prefix = var.resource_prefix + service_account = var.service_account + use_compact_placement_policy = var.use_compact_placement_policy + existing_resource_policy_name = var.instance_groups[count.index].existing_resource_policy_name + startup_script = module.startup.startup_script + subnetwork_self_links = module.network.subnetwork_self_links + network_self_links = module.network.network_self_links + labels = merge(var.labels, { ghpc_role = "compute" }) +} + +module "compute_instance_group_manager" { + source = "../../common/instance_group_manager" + count = length(var.instance_groups) + + project_id = var.project_id + resource_prefix = "${var.resource_prefix}-${count.index}" + zone = var.instance_groups[count.index].zone + instance_template_id = module.compute_instance_template[count.index].id + target_size = var.instance_groups[count.index].target_size + wait_for_instances = var.wait_for_instances +} diff --git a/a3-mega/terraform/modules/cluster/mig/outputs.tf b/a3-mega/terraform/modules/cluster/mig/outputs.tf new file mode 100644 index 000000000..817c039c5 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/mig/outputs.tf @@ -0,0 +1,20 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +output "instructions" { + description = "Instructions for accessing the dashboard" + value = try(module.dashboard[0].instructions, "Dashboard not created") +} diff --git a/a3-mega/terraform/modules/cluster/mig/variables.tf b/a3-mega/terraform/modules/cluster/mig/variables.tf new file mode 100644 index 000000000..016720175 --- /dev/null +++ b/a3-mega/terraform/modules/cluster/mig/variables.tf @@ -0,0 +1,312 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +variable "instance_groups" { + description = <<-EOT + Required Fields: + - `target_size`: The number of running instances for this managed instance group. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#target_size), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--size). + - `zone`: The zone that instances in this group should be created in. 
Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#zone), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--zone). + - `machine_type`: (Optional)The name of a Google Compute Engine machine type. There are [many possible values](https://cloud.google.com/compute/docs/machine-resource). Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#machine_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--machine-type). + - `existing_resource_policy_name`: (Optional) The existing resource policy. + EOT + type = list(object({ + zone = string + target_size = number + machine_type = optional(string, "a3-megagpu-8g") + existing_resource_policy_name = optional(string, null) + })) + nullable = false + + validation { + condition = length(var.instance_groups) != 0 + error_message = "must have at least one instance group" + } + + validation { + condition = alltrue([ + for g in var.instance_groups : g.zone != null && g.target_size != null + ]) + error_message = "zone and target_size must not be null" + } +} + +variable "project_id" { + description = "GCP Project ID to which the cluster will be deployed." + type = string +} + +variable "region" { + description = "The region in which all instances will reside." + type = string + nullable = false +} + +variable "resource_prefix" { + description = "Arbitrary string with which all names of newly created resources will be prefixed." + type = string +} + +variable "disk_size_gb" { + description = <<-EOT + The size of the image in gigabytes for the boot disk of each instance. 
+ + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#disk_size_gb), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--boot-disk-size). + EOT + type = number + default = 128 +} + +variable "disk_type" { + description = <<-EOT + The GCE disk type for the boot disk of each instance. + + Possible values: `["pd-ssd", "local-ssd", "pd-balanced", "pd-standard"]` + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#disk_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--boot-disk-type). + EOT + type = string + default = "pd-ssd" +} + +variable "filestore_new" { + description = <<-EOT + Configurations to mount newly created network storage. Each object describes NFS file-servers to be hosted in Filestore. + + Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#inputs). + + ------------ + `filestore_new.filestore_tier` + + The service tier of the instance. + + Possible values: `["BASIC_HDD", "BASIC_SSD", "HIGH_SCALE_SSD", "ENTERPRISE"]`. + + Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_filestore_tier), [gcloud](https://cloud.google.com/sdk/gcloud/reference/filestore/instances/create#--tier). + + ------------ + `filestore_new.local_mount` + + Mountpoint for this filestore instance. + + Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_local_mount). + + ------------ + `filestore_new.size_gb` + + Storage size of the filestore instance in GB. 
+ + Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_size_gb), [gcloud](https://cloud.google.com/sdk/gcloud/reference/filestore/instances/create#--file-share). + + ------------ + `filestore_new.zone` + + Location for filestore instance. + + Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/filestore#input_zone), [gcloud](https://cloud.google.com/sdk/gcloud/reference/filestore/instances/create#--zone). + EOT + type = list(object({ + filestore_tier = string + local_mount = string + size_gb = number + zone = string + })) + default = [] +} + +variable "gcsfuse_existing" { + description = <<-EOT + Configurations to mount existing network storage. Each object describes Cloud Storage Buckets to be mounted with Cloud Storage FUSE. + + Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/pre-existing-network-storage#inputs). + + ------------ + `gcsfuse_existing.local_mount` + + The mount point where the contents of the device may be accessed after mounting. + + Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/pre-existing-network-storage#input_local_mount). + + ------------ + `gcsfuse_existing.remote_mount` + + Bucket name without “gs://”. + + Related docs: [hpc-toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/file-system/pre-existing-network-storage#input_remote_mount). + EOT + type = list(object({ + local_mount = string + remote_mount = string + })) + default = [] +} + +variable "enable_ops_agent" { + description = <<-EOT + Install [Google Cloud Ops Agent](https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent). + EOT + type = bool + default = true + + validation { + condition = var.enable_ops_agent != null + error_message = "must not be null" + } +} + +variable "enable_ray" { + description = "Install [Ray](https://docs.ray.io/en/latest/cluster/getting-started.html)." 
+ type = bool + default = false + + validation { + condition = var.enable_ray != null + error_message = "must not be null" + } +} + +variable "labels" { + description = <<-EOT + The resource labels (a map of key/value pairs) to be applied to the GPU cluster. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#labels), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--labels). + EOT + type = map(string) + default = {} +} + +variable "machine_image" { + description = <<-EOT + The image with which this disk will initialize. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#source_image). + + ------------ + `machine_image.family` + + The family of images from which the latest non-deprecated image will be selected. Conflicts with `machine_image.name`. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#family), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image-family). + + ------------ + `machine_image.name` + + The name of a specific image. Conflicts with `machine_image.family`. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#name), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image). + + ------------ + `machine_image.project` + + The project_id to which this image belongs. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#project), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image-project). 
+ EOT + type = object({ + family = string + name = string + project = string + }) + default = { + project = "deeplearning-platform-release" + family = "pytorch-latest-gpu-debian-11-py310" + name = null + } +} + +variable "metadata" { + description = <<-EOT + GCE metadata to attach to each instance. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#metadata), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--metadata). + EOT + type = map(string) + default = {} +} + +variable "network_existing" { + description = "Existing network to attach to nic0. Setting to null will create a new network for it." + type = object({ + network_name = string + subnetwork_name = string + }) + default = null +} + +variable "service_account" { + description = <<-EOT + Service account to attach to the instance. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#service_account). + + ------------ + `service_account.email` + + The service account e-mail address. If not given, the default Google Compute Engine service account is used. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#email), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--service-account). + + ------------ + `service_account.scopes` + + A list of service scopes. Both OAuth2 URLs and gcloud short names are supported. To allow full access to all Cloud APIs, use the `"cloud-platform"` scope. See a complete list of scopes [here](https://cloud.google.com/sdk/gcloud/reference/alpha/compute/instances/set-scopes#--scopes). 
+ + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#scopes), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--scopes). + EOT + type = object({ + email = string, + scopes = set(string) + }) + default = null +} + +variable "startup_script" { + description = "Shell script -- the actual script (not the filename)." + type = string + default = null +} + +variable "startup_script_file" { + description = "The full path in the VM to the shell script to be executed at VM startup." + type = string + default = null +} + +variable "startup_script_gcs_bucket_path" { + description = <<-EOT + The storage bucket full path to be used for storing the startup script. + Example: `gs://bucketName/dirName` + + If the value is not provided, then a default storage bucket will be created for the script execution. + `storage.buckets.create` IAM permission is needed for creating the default storage bucket. + EOT + type = string + default = null +} + +variable "use_compact_placement_policy" { + description = "The flag to create and use a superblock level compact placement policy for the instances. Currently GCE supports using only 1 placement policy." + type = bool + default = false +} + +variable "wait_for_instances" { + description = <<-EOT + Whether to wait for all instances to be created/updated before returning. Note that if this is set to true and the operation does not succeed, Terraform will continue trying until it times out. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_region_instance_group_manager#wait_for_instances). 
+ EOT + type = bool + default = true +} diff --git a/a3-mega/terraform/modules/common/dashboard/README.md b/a3-mega/terraform/modules/common/dashboard/README.md new file mode 100644 index 000000000..9077d2a7b --- /dev/null +++ b/a3-mega/terraform/modules/common/dashboard/README.md @@ -0,0 +1,55 @@ + +Copyright 2022 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +No requirements. + +## Providers + +| Name | Version | +|------|---------| +| [http](#provider\_http) | n/a | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [dashboard](#module\_dashboard) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/monitoring/dashboard/ | v1.17.0 | + +## Resources + +| Name | Type | +|------|------| +| [http_http.gce-gke-gpu-utilization](https://registry.terraform.io/providers/hashicorp/http/latest/docs/data-sources/http) | data source | +| [http_http.nvidia-dcgm](https://registry.terraform.io/providers/hashicorp/http/latest/docs/data-sources/http) | data source | +| [http_http.nvidia-nvml](https://registry.terraform.io/providers/hashicorp/http/latest/docs/data-sources/http) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [enable\_gce\_gke\_gpu\_utilization\_widgets](#input\_enable\_gce\_gke\_gpu\_utilization\_widgets) | Add GKE GPU utilization widgets to the dashboard. 
| `bool` | n/a | yes | +| [enable\_nvidia\_dcgm\_widgets](#input\_enable\_nvidia\_dcgm\_widgets) | Add Nvidia DCGM widgets to the dashboard. | `bool` | n/a | yes | +| [enable\_nvidia\_nvml\_widgets](#input\_enable\_nvidia\_nvml\_widgets) | Add Nvidia NVML widgets to the dashboard. | `bool` | n/a | yes | +| [project\_id](#input\_project\_id) | GCP Project ID to which the cluster will be deployed. | `string` | n/a | yes | +| [resource\_prefix](#input\_resource\_prefix) | Arbitrary string with which all names of newly created resources will be prefixed. | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [instructions](#output\_instructions) | Instructions for accessing the dashboard | + \ No newline at end of file diff --git a/a3-mega/terraform/modules/common/dashboard/main.tf b/a3-mega/terraform/modules/common/dashboard/main.tf new file mode 100644 index 000000000..ad2618536 --- /dev/null +++ b/a3-mega/terraform/modules/common/dashboard/main.tf @@ -0,0 +1,91 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +locals { + gce_gke_gpu_utilization_data = try( + jsondecode(tostring(data.http.gce-gke-gpu-utilization[0].response_body)), + null, + ) + nvidia_dcgm_data = try( + jsondecode(tostring(data.http.nvidia-dcgm[0].response_body)), + null, + ) + nvidia_nvml_data = try( + jsondecode(tostring(data.http.nvidia-nvml[0].response_body)), + null, + ) + + widgets = concat( + try( + [ + for tile in local.gce_gke_gpu_utilization_data.mosaicLayout.tiles + : jsonencode(tile.widget) + ], + [], + ), + try( + [ + for tile in local.nvidia_dcgm_data.mosaicLayout.tiles + : jsonencode(tile.widget) + ], + [], + ), + try( + [ + for tile in local.nvidia_nvml_data.mosaicLayout.tiles + : jsonencode(tile.widget) + ], + [], + ), + ) +} + +data "http" "gce-gke-gpu-utilization" { + url = "https://cloud-monitoring-dashboards.googleusercontent.com/samples/nvidia-gpu/gce-gke-gpu-utilization.json" + count = var.enable_gce_gke_gpu_utilization_widgets ? 1 : 0 + + request_headers = { + Accept = "application/json" + } +} + +data "http" "nvidia-dcgm" { + url = "https://cloud-monitoring-dashboards.googleusercontent.com/samples/nvidia-gpu/nvidia-dcgm.json" + count = var.enable_nvidia_dcgm_widgets ? 1 : 0 + + request_headers = { + Accept = "application/json" + } +} + +data "http" "nvidia-nvml" { + url = "https://cloud-monitoring-dashboards.googleusercontent.com/samples/nvidia-gpu/nvidia-nvml.json" + count = var.enable_nvidia_nvml_widgets ? 
1 : 0 + + request_headers = { + Accept = "application/json" + } +} + +module "dashboard" { + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/monitoring/dashboard/?ref=v1.17.0" + + base_dashboard = "Empty" + deployment_name = var.resource_prefix + project_id = var.project_id + title = "AI Accelerator Experience Dashboard" + widgets = local.widgets +} diff --git a/a3-mega/terraform/modules/common/dashboard/outputs.tf b/a3-mega/terraform/modules/common/dashboard/outputs.tf new file mode 100644 index 000000000..ccdfb76ea --- /dev/null +++ b/a3-mega/terraform/modules/common/dashboard/outputs.tf @@ -0,0 +1,20 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +output "instructions" { + description = "Instructions for accessing the dashboard" + value = module.dashboard.instructions +} diff --git a/a3-mega/terraform/modules/common/dashboard/variables.tf b/a3-mega/terraform/modules/common/dashboard/variables.tf new file mode 100644 index 000000000..473a46bfd --- /dev/null +++ b/a3-mega/terraform/modules/common/dashboard/variables.tf @@ -0,0 +1,40 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "enable_gce_gke_gpu_utilization_widgets" { + description = "Add GKE GPU utilization widgets to the dashboard." + type = bool +} + +variable "enable_nvidia_dcgm_widgets" { + description = "Add Nvidia DCGM widgets to the dashboard." + type = bool +} + +variable "enable_nvidia_nvml_widgets" { + description = "Add Nvidia NVML widgets to the dashboard." + type = bool +} + +variable "project_id" { + description = "GCP Project ID to which the cluster will be deployed." + type = string +} + +variable "resource_prefix" { + description = "Arbitrary string with which all names of newly created resources will be prefixed." + type = string +} diff --git a/a3-mega/terraform/modules/common/instance_group_manager/README.md b/a3-mega/terraform/modules/common/instance_group_manager/README.md new file mode 100644 index 000000000..2b5a1a75d --- /dev/null +++ b/a3-mega/terraform/modules/common/instance_group_manager/README.md @@ -0,0 +1,54 @@ + +Copyright 2022 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +No requirements. 
+ +## Providers + +| Name | Version | +|------|---------| +| [google-beta](#provider\_google-beta) | n/a | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [google-beta_google_compute_instance_group_manager.mig](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_compute_instance_group_manager) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [enable\_auto\_config\_apply](#input\_enable\_auto\_config\_apply) | Whenever you update a MIG's instance\_template, Compute Engine automatically applies your updated configuration to new VMs that are added to the group.
This flag enables automatic application of an updated configuration to existing VMs.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#nested_update_policy), [doc](https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups) | `bool` | `true` | no | +| [instance\_template\_id](#input\_instance\_template\_id) | The full URL to an instance template from which all new instances of this version will be created.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_region_instance_group_manager#instance_template). | `string` | n/a | yes | +| [project\_id](#input\_project\_id) | The ID of the project in which the resource belongs.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#project). | `string` | n/a | yes | +| [resource\_prefix](#input\_resource\_prefix) | Arbitrary string with which all names of newly created resources will be prefixed. | `string` | n/a | yes | +| [target\_size](#input\_target\_size) | The number of running instances for this managed instance group.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#target_size), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--size). | `number` | n/a | yes | +| [wait\_for\_instances](#input\_wait\_for\_instances) | Whether to wait for all instances to be created/updated before returning. Note that if this is set to true and the operation does not succeed, Terraform will continue trying until it times out.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_region_instance_group_manager#wait_for_instances). | `bool` | `true` | no | +| [zone](#input\_zone) | The zone that instances in this group should be created in.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#zone), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--zone). | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [id](#output\_id) | `id` output of the google\_compute\_instance\_group\_manager resource created. | +| [self\_link](#output\_self\_link) | `self_link` output of the google\_compute\_instance\_group\_manager resource created | + \ No newline at end of file diff --git a/a3-mega/terraform/modules/common/instance_group_manager/main.tf b/a3-mega/terraform/modules/common/instance_group_manager/main.tf new file mode 100644 index 000000000..4a3772f28 --- /dev/null +++ b/a3-mega/terraform/modules/common/instance_group_manager/main.tf @@ -0,0 +1,42 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ +resource "google_compute_instance_group_manager" "mig" { + provider = google-beta + + base_instance_name = var.resource_prefix + name = var.resource_prefix + project = var.project_id + target_size = var.target_size + wait_for_instances = var.wait_for_instances + zone = var.zone + + update_policy { + max_unavailable_fixed = 1 + minimal_action = "RESTART" + replacement_method = "RECREATE" # Instance name will be preserved + type = var.enable_auto_config_apply ? 
"PROACTIVE" : "OPPORTUNISTIC" + } + + version { + instance_template = var.instance_template_id + name = "default" + } + + timeouts { + create = "30m" + update = "30m" + } +} diff --git a/a3-mega/terraform/modules/common/instance_group_manager/outputs.tf b/a3-mega/terraform/modules/common/instance_group_manager/outputs.tf new file mode 100644 index 000000000..c5868e0b1 --- /dev/null +++ b/a3-mega/terraform/modules/common/instance_group_manager/outputs.tf @@ -0,0 +1,25 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +output "id" { + description = "`id` output of the google_compute_instance_group_manager resource created." + value = resource.google_compute_instance_group_manager.mig.id +} + +output "self_link" { + description = "`self_link` output of the google_compute_instance_group_manager resource created" + value = resource.google_compute_instance_group_manager.mig.self_link +} diff --git a/a3-mega/terraform/modules/common/instance_group_manager/variables.tf b/a3-mega/terraform/modules/common/instance_group_manager/variables.tf new file mode 100644 index 000000000..a6e4ff2a3 --- /dev/null +++ b/a3-mega/terraform/modules/common/instance_group_manager/variables.tf @@ -0,0 +1,102 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +variable "enable_auto_config_apply" { + description = <<-EOT + Whenever you update a MIG's instance_template, Compute Engine automatically applies your updated configuration to new VMs that are added to the group. + This flag enables automatic application of an updated configuration to existing VMs. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#nested_update_policy), [doc](https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups) + EOT + type = bool + default = true + + validation { + condition = var.enable_auto_config_apply != null + error_message = "must not be null" + } +} + +variable "instance_template_id" { + description = <<-EOT + The full URL to an instance template from which all new instances of this version will be created. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_region_instance_group_manager#instance_template). + EOT + type = string +} + +variable "project_id" { + description = <<-EOT + The ID of the project in which the resource belongs. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#project). 
+ EOT + type = string + + validation { + condition = var.project_id != null + error_message = "must not be null" + } +} + +variable "resource_prefix" { + description = "Arbitrary string with which all names of newly created resources will be prefixed." + type = string + + validation { + condition = var.resource_prefix != null + error_message = "must not be null" + } +} + +variable "target_size" { + description = <<-EOT + The number of running instances for this managed instance group. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#target_size), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--size). + EOT + type = number + + validation { + condition = var.target_size != null + error_message = "must not be null" + } +} + +variable "wait_for_instances" { + description = <<-EOT + Whether to wait for all instances to be created/updated before returning. Note that if this is set to true and the operation does not succeed, Terraform will continue trying until it times out. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_region_instance_group_manager#wait_for_instances). + EOT + type = bool + default = true + + validation { + condition = var.wait_for_instances != null + error_message = "must not be null" + } +} + +variable "zone" { + description = <<-EOT + The zone that instances in this group should be created in. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager#zone), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-groups/managed/create#--zone). 
+ EOT + type = string +} diff --git a/a3-mega/terraform/modules/common/instance_template/README.md b/a3-mega/terraform/modules/common/instance_template/README.md new file mode 100644 index 000000000..e827b8039 --- /dev/null +++ b/a3-mega/terraform/modules/common/instance_template/README.md @@ -0,0 +1,71 @@ + +Copyright 2022 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +No requirements. + +## Providers + +| Name | Version | +|------|---------| +| [google](#provider\_google) | n/a | +| [google-beta](#provider\_google-beta) | n/a | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [resource\_policy](#module\_resource\_policy) | ../resource_policy | n/a | + +## Resources + +| Name | Type | +|------|------| +| [google-beta_google_compute_instance_template.template](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_compute_instance_template) | resource | +| [google_compute_default_service_account.account](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | +| [google_compute_image.image](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [disk\_size\_gb](#input\_disk\_size\_gb) | The size of the image in gigabytes for the boot disk of 
each instance.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#disk_size_gb), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--boot-disk-size). | `number` | n/a | yes | +| [disk\_type](#input\_disk\_type) | The GCE disk type for the boot disk of each instance.

Possible values: `["pd-ssd", "local-ssd", "pd-balanced", "pd-standard"]`

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#disk_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--boot-disk-type). | `string` | n/a | yes | +| [existing\_resource\_policy\_name](#input\_existing\_resource\_policy\_name) | The name of the existing resource policy.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_resource_policy#name). | `string` | `null` | no | +| [labels](#input\_labels) | A set of key/value label pairs to assign to instances created from this template.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#labels), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--labels). | `map(string)` | n/a | yes | +| [machine\_image](#input\_machine\_image) | The image with which this disk will initialize.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#source_image).

------------
`machine_image.family`

The family of images from which the latest non-deprecated image will be selected. Conflicts with `machine_image.name`.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#family), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image-family).

------------
`machine_image.name`

The name of a specific image. Conflicts with `machine_image.family`.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#name), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image).

------------
`machine_image.project`

The project\_id to which this image belongs.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#project), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image-project). |
object({
family = string
name = string
project = string
})
| n/a | yes | +| [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. There are [many possible values](https://cloud.google.com/compute/docs/machine-resource).

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#machine_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--machine-type). | `string` | n/a | yes | +| [maintenance\_interval](#input\_maintenance\_interval) | Specifies the frequency of planned maintenance events. 'PERIODIC' is the only supported value for maintenance\_interval.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#maintenance_interval). | `string` | n/a | yes | +| [metadata](#input\_metadata) | Metadata key/value pairs to make available from within instances created from this template.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#metadata), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--metadata). | `map(string)` | n/a | yes | +| [network\_self\_links](#input\_network\_self\_links) | The network self-links for all the VPCs. | `list(string)` | n/a | yes | +| [project\_id](#input\_project\_id) | The ID of the project in which the resource belongs.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#project). | `string` | n/a | yes | +| [region](#input\_region) | An instance template is a global resource that is not bound to a zone or a region. However, you can still specify some regional resources in an instance template, which restricts the template to the region where that resource resides. For example, a custom subnetwork resource is tied to a specific region.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#region). | `string` | n/a | yes | +| [resource\_prefix](#input\_resource\_prefix) | Arbitrary string with which all names of newly created resources will be prefixed. | `string` | n/a | yes | +| [service\_account](#input\_service\_account) | Service account to attach to the instance.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#service_account).

------------
`service_account.email`

The service account e-mail address. If not given, the default Google Compute Engine service account is used.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#email), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--service-account).

------------
`service_account.scopes`

A list of service scopes. Both OAuth2 URLs and gcloud short names are supported. To allow full access to all Cloud APIs, use the `"cloud-platform"` scope. See a complete list of scopes [here](https://cloud.google.com/sdk/gcloud/reference/alpha/compute/instances/set-scopes#--scopes).

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#scopes), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--scopes). |
object({
email = string,
scopes = set(string)
})
| n/a | yes | +| [startup\_script](#input\_startup\_script) | Script to run at boot on each instance. This is here for convenience and will just be appended to `metadata` under the key `"startup-script"`. | `string` | n/a | yes | +| [subnetwork\_self\_links](#input\_subnetwork\_self\_links) | The subnet self-links for all the VPCs. | `list(string)` | n/a | yes | +| [use\_compact\_placement\_policy](#input\_use\_compact\_placement\_policy) | The flag to create and use a superblock level compact placement policy for the instances. Currently GCE supports using only 1 placement policy.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#resource_policies). | `bool` | `false` | no | +| [use\_static\_naming](#input\_use\_static\_naming) | Flag to determine whether to use static naming for the instance\_template name. If static naming is used, the instance\_template cannot be updated in place; it needs to be destroyed and then recreated.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#name_prefix). | `bool` | `false` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [id](#output\_id) | `id` output of the google\_compute\_instance\_template resource created. | +| [name](#output\_name) | `name` output of the google\_compute\_instance\_template resource created. | +| [self\_link](#output\_self\_link) | `self_link` output of the google\_compute\_instance\_template resource created | +| [service\_account](#output\_service\_account) | n/a | + \ No newline at end of file diff --git a/a3-mega/terraform/modules/common/instance_template/main.tf b/a3-mega/terraform/modules/common/instance_template/main.tf new file mode 100644 index 000000000..578b90f9e --- /dev/null +++ b/a3-mega/terraform/modules/common/instance_template/main.tf @@ -0,0 +1,144 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +locals { + machine_image = { + family = var.machine_image.family != "" ? var.machine_image.family : null + name = var.machine_image.name != "" ? var.machine_image.name : null + project = var.machine_image.project + } + + + _image_or_family = coalesce( + local.machine_image.family, + local.machine_image.name, + ) + nic_type = anytrue([ + for pattern in ["debian-11", "ubuntu", "gvnic", "cos"] + : length(regexall(pattern, local._image_or_family)) > 0 + ]) ? 
"GVNIC" : "VIRTIO_NET" + + + _machine_image_is_dlvm = contains( + [ + "deeplearning-platform-release", + "ml-images", + ], + local.machine_image.project + ) + metadata = merge( + { + VmDnsSetting = "ZonalPreferred" + install-nvidia-driver = "True" + enable-oslogin = "TRUE" + }, + local._machine_image_is_dlvm ? { + proxy-mode = "project_editors" + } : {}, + var.startup_script != null ? { + startup-script = var.startup_script + } : {}, + var.metadata != null ? var.metadata : {}, + ) + + + service_account = var.service_account != null ? var.service_account : { + email = data.google_compute_default_service_account.account.email + scopes = ["cloud-platform"] + } +} + +data "google_compute_default_service_account" "account" { + project = var.project_id +} + +data "google_compute_image" "image" { + name = var.machine_image.name + family = var.machine_image.family + project = var.machine_image.project +} + +module "resource_policy" { + source = "../resource_policy" + count = var.use_compact_placement_policy ? 1 : 0 + + project_id = var.project_id + region = var.region + new_resource_policy_name = var.existing_resource_policy_name == null ? var.resource_prefix : null + existing_resource_policy_name = var.existing_resource_policy_name == null ? null : var.existing_resource_policy_name +} + +resource "google_compute_instance_template" "template" { + provider = google-beta + + labels = var.labels + machine_type = var.machine_type + metadata = local.metadata + name = var.use_static_naming ? var.resource_prefix : null + name_prefix = var.use_static_naming ? null : var.resource_prefix + project = var.project_id + region = var.region + resource_policies = var.use_compact_placement_policy ? 
[ + module.resource_policy[0].resource_self_link + ] : [] + + disk { + auto_delete = true + boot = true + disk_size_gb = var.disk_size_gb + disk_type = var.disk_type + source_image = data.google_compute_image.image.self_link + } + + dynamic "network_interface" { + for_each = toset(range(length(var.subnetwork_self_links))) + content { + network = var.network_self_links[network_interface.value] + nic_type = local.nic_type + subnetwork = var.subnetwork_self_links[network_interface.value] + subnetwork_project = var.project_id + + dynamic "access_config" { + for_each = network_interface.value == 0 ? [1] : [] + content { + nat_ip = null + network_tier = null + public_ptr_domain_name = null + } + } + } + } + + scheduling { + automatic_restart = true + maintenance_interval = var.maintenance_interval + on_host_maintenance = "TERMINATE" + preemptible = false + provisioning_model = null + } + + service_account { + email = local.service_account.email + scopes = local.service_account.scopes + } + + lifecycle { + create_before_destroy = true + ignore_changes = [ + metadata["ssh-keys"], + ] + } +} diff --git a/a3-mega/terraform/modules/common/instance_template/outputs.tf b/a3-mega/terraform/modules/common/instance_template/outputs.tf new file mode 100644 index 000000000..1bf59f379 --- /dev/null +++ b/a3-mega/terraform/modules/common/instance_template/outputs.tf @@ -0,0 +1,32 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +output "id" { + description = "`id` output of the google_compute_instance_template resource created." + value = resource.google_compute_instance_template.template.id +} + +output "name" { + description = "`name` output of the google_compute_instance_template resource created." + value = var.use_static_naming ? var.resource_prefix : resource.google_compute_instance_template.template.name +} + +output "self_link" { + description = "`self_link` output of the google_compute_instance_template resource created" + value = resource.google_compute_instance_template.template.self_link +} + +output "service_account" { value = local.service_account } diff --git a/a3-mega/terraform/modules/common/instance_template/variables.tf b/a3-mega/terraform/modules/common/instance_template/variables.tf new file mode 100644 index 000000000..a4ef627dc --- /dev/null +++ b/a3-mega/terraform/modules/common/instance_template/variables.tf @@ -0,0 +1,275 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +variable "disk_size_gb" { + description = <<-EOT + The size of the image in gigabytes for the boot disk of each instance. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#disk_size_gb), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--boot-disk-size). 
+ EOT + type = number +} + +variable "disk_type" { + description = <<-EOT + The GCE disk type for the boot disk of each instance. + + Possible values: `["pd-ssd", "local-ssd", "pd-balanced", "pd-standard"]` + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#disk_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--boot-disk-type). + EOT + type = string +} + +variable "machine_image" { + description = <<-EOT + The image with which this disk will initialize. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#source_image). + + ------------ + `machine_image.family` + + The family of images from which the latest non-deprecated image will be selected. Conflicts with `machine_image.name`. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#name), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image-family). + + ------------ + `machine_image.name` + + The name of a specific image. Conflicts with `machine_image.family`. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#name), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image). + + ------------ + `machine_image.project` + + The project_id to which this image belongs. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image#project), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--image-project). 
+ EOT + type = object({ + family = string + name = string + project = string + }) + + validation { + condition = ( + var.machine_image != null + // project is non-empty + && alltrue([ + for empty in [null, ""] + : var.machine_image.project != empty + ]) + // at least one is non-empty + && anytrue([ + for value in [var.machine_image.name, var.machine_image.family] + : alltrue([for empty in [null, ""] : value != empty]) + ]) + // at least one is empty + && anytrue([ + for value in [var.machine_image.name, var.machine_image.family] + : anytrue([for empty in [null, ""] : value == empty]) + ]) + ) + error_message = "project must be non-empty exactly one of family or name must be non-empty" + } +} + +variable "machine_type" { + description = <<-EOT + The name of a Google Compute Engine machine type. There are [many possible values](https://cloud.google.com/compute/docs/machine-resource). + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#machine_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--machine-type). + EOT + type = string + + validation { + condition = var.machine_type != null + error_message = "must not be null" + } +} + +variable "maintenance_interval" { + description = <<-EOT + Specifies the frequency of planned maintenance events. 'PERIODIC' is th only supported value for maintenance_interval. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#maintenance_interval). + EOT + type = string + validation { + condition = var.maintenance_interval != null ? contains( + ["PERIODIC"], + var.maintenance_interval, + ) : true + error_message = "'PERIODIC' is th only supported value for maintenance_interval." 
+ } +} + +variable "use_compact_placement_policy" { + description = <<-EOT + The flag to create and use a superblock level compact placement policy for the instances. Currently GCE supports using only 1 placement policy. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#resource_policies). + EOT + type = bool + default = false +} + +variable "existing_resource_policy_name" { + description = <<-EOT + The name of the existing resource policy. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_resource_policy#name). + EOT + type = string + default = null +} + +variable "metadata" { + description = <<-EOT + Metadata key/value pairs to make available from within instances created from this template. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#metadata), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--metadata). + EOT + type = map(string) +} + +variable "labels" { + description = <<-EOT + A set of key/value label pairs to assign to instances created from this template. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#labels), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--labels). + EOT + type = map(string) +} + +variable "project_id" { + description = <<-EOT + The ID of the project in which the resource belongs. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#project). 
+ EOT + type = string + + validation { + condition = var.project_id != null + error_message = "must not be null" + } +} + +variable "region" { + description = <<-EOT + An instance template is a global resource that is not bound to a zone or a region. However, you can still specify some regional resources in an instance template, which restricts the template to the region where that resource resides. For example, a custom subnetwork resource is tied to a specific region. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#region). + EOT + type = string + + validation { + condition = var.region != null + error_message = "must not be null" + } +} + +variable "resource_prefix" { + description = "Arbitrary string with which all names of newly created resources will be prefixed." + type = string + + validation { + condition = var.resource_prefix != null + error_message = "must not be null" + } +} + +variable "use_static_naming" { + description = <<-EOT + Flag to determine whether to use static naming for instance_template name. If used static naming, then instance_template cannot be updated. it needs to be destroyed and then recreated. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#name_prefix). + EOT + type = bool + default = false + + validation { + condition = var.use_static_naming != null + error_message = "must not be null" + } +} + +variable "service_account" { + description = <<-EOT + Service account to attach to the instance. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#service_account). + + ------------ + `service_account.email` + + The service account e-mail address. If not given, the default Google Compute Engine service account is used. 
+ + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#email), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--service-account). + + ------------ + `service_account.scopes` + + A list of service scopes. Both OAuth2 URLs and gcloud short names are supported. To allow full access to all Cloud APIs, use the `"cloud-platform"` scope. See a complete list of scopes [here](https://cloud.google.com/sdk/gcloud/reference/alpha/compute/instances/set-scopes#--scopes). + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_template#scopes), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/instance-templates/create#--scopes). + EOT + type = object({ + email = string, + scopes = set(string) + }) +} + +variable "startup_script" { + description = <<-EOT + Script to run at boot on each instance. This is here for convenience and will just be appended to `metadata` under the key `"startup-script"`. + EOT + type = string +} + +variable "subnetwork_self_links" { + description = "The subnet self-links for all the VPCs." + type = list(string) + + validation { + condition = var.subnetwork_self_links != null + error_message = "must not be null" + } + + validation { + condition = length(var.subnetwork_self_links) != 0 + error_message = "Must have one or more subnetwork self-link" + } +} + +variable "network_self_links" { + description = "The network self-links for all the VPCs." 
+ type = list(string) + + validation { + condition = var.network_self_links != null + error_message = "must not be null" + } + + validation { + condition = length(var.network_self_links) != 0 + error_message = "Must have one or more network self-link" + } +} diff --git a/a3-mega/terraform/modules/common/network/README.md b/a3-mega/terraform/modules/common/network/README.md new file mode 100644 index 000000000..6618f48f1 --- /dev/null +++ b/a3-mega/terraform/modules/common/network/README.md @@ -0,0 +1,67 @@ + +Copyright 2022 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 0.14.0 | +| [google](#requirement\_google) | >= 3.83 | +| [google-beta](#requirement\_google-beta) | >= 4.12 | + +## Providers + +| Name | Version | +|------|---------| +| [google](#provider\_google) | >= 3.83 | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [google_compute_firewall.external-ingress](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_firewall) | resource | +| [google_compute_firewall.iap-ssh](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_firewall) | resource | +| [google_compute_firewall.internal-ingress](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_firewall) | resource | +| [google_compute_firewall.internal-ingress-gpus](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_firewall) | resource | +| [google_compute_network.gpus](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_network) | resource | +| [google_compute_network.nic0](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_network) | resource | +| [google_compute_subnetwork.gpus](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_subnetwork) | resource | +| [google_compute_subnetwork.nic0](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_subnetwork) | resource | +| [google_compute_network.nic0](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_network) | data source | +| [google_compute_subnetwork.nic0](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_subnetwork) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [nic0\_existing](#input\_nic0\_existing) | Existing network to attach to nic0. Setting to null will create a new network for it. |
object({
network_name = string
subnetwork_name = string
})
| n/a | yes | +| [project\_id](#input\_project\_id) | The ID of the project in which the resource belongs.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_subnetwork#project). | `string` | n/a | yes | +| [region](#input\_region) | The region in which the subnetwork(s) has been / will be created.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_subnetwork#region), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/networks/subnets/create#--region). | `string` | n/a | yes | +| [resource\_prefix](#input\_resource\_prefix) | Arbitrary string with which all names of newly created resources will be prefixed. | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [network\_ids](#output\_network\_ids) | Network ids of all the VPCs | +| [network\_names](#output\_network\_names) | Network names of all the VPCs | +| [network\_self\_links](#output\_network\_self\_links) | Network self-links of all the VPCs | +| [subnetwork\_names](#output\_subnetwork\_names) | Subnet names of all the VPCs | +| [subnetwork\_self\_links](#output\_subnetwork\_self\_links) | Subnet self-links of all the VPCs | + \ No newline at end of file diff --git a/a3-mega/terraform/modules/common/network/main.tf b/a3-mega/terraform/modules/common/network/main.tf new file mode 100644 index 000000000..ecbab58c2 --- /dev/null +++ b/a3-mega/terraform/modules/common/network/main.tf @@ -0,0 +1,178 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +locals { + nic0 = { + network = { + id = one(concat( + data.google_compute_network.nic0[*].id, + resource.google_compute_network.nic0[*].id, + )) + name = one(concat( + data.google_compute_network.nic0[*].name, + resource.google_compute_network.nic0[*].name, + )) + self_link = one(concat( + data.google_compute_network.nic0[*].self_link, + resource.google_compute_network.nic0[*].self_link, + )) + } + subnetwork = { + name = one(concat( + data.google_compute_subnetwork.nic0[*].name, + resource.google_compute_subnetwork.nic0[*].name, + )) + self_link = one(concat( + data.google_compute_subnetwork.nic0[*].self_link, + resource.google_compute_subnetwork.nic0[*].self_link, + )) + } + } +} + +// CPU NIC + +data "google_compute_network" "nic0" { + count = var.nic0_existing != null ? 1 : 0 + + name = var.nic0_existing.network_name + project = var.project_id +} + +data "google_compute_subnetwork" "nic0" { + count = var.nic0_existing != null ? 1 : 0 + + name = var.nic0_existing.subnetwork_name + project = var.project_id + region = var.region +} + +resource "google_compute_network" "nic0" { + count = var.nic0_existing != null ? 0 : 1 + + auto_create_subnetworks = false + mtu = 8896 + name = var.resource_prefix + project = var.project_id +} + +resource "google_compute_subnetwork" "nic0" { + count = var.nic0_existing != null ? 0 : 1 + + ip_cidr_range = "10.0.0.0/19" + name = var.resource_prefix + network = google_compute_network.nic0[0].self_link + project = var.project_id + region = var.region +} + +resource "google_compute_firewall" "internal-ingress" { + count = var.nic0_existing != null ? 
0 : 1 + + description = "internal ingress traffic (icmp/tcp/udp) to machine on nic0" + direction = "INGRESS" + name = "${var.resource_prefix}-internal-ingress" + network = google_compute_network.nic0[0].self_link + project = var.project_id + source_ranges = ["10.0.0.0/8"] + + allow { + protocol = "icmp" + } + allow { + protocol = "tcp" + ports = ["0-65535"] + } + allow { + protocol = "udp" + ports = ["0-65535"] + } +} + +resource "google_compute_firewall" "external-ingress" { + count = var.nic0_existing != null ? 0 : 1 + + description = "external ingress traffic (icmp) to machine on nic0" + direction = "INGRESS" + name = "${var.resource_prefix}-external-ingress" + network = google_compute_network.nic0[0].self_link + project = var.project_id + source_ranges = ["0.0.0.0/0"] + + allow { + protocol = "icmp" + } +} + +resource "google_compute_firewall" "iap-ssh" { + count = var.nic0_existing != null ? 0 : 1 + + description = "identity-aware proxy ssh traffic to machine on nic0" + direction = "INGRESS" + name = "${var.resource_prefix}-iap-ssh" + network = google_compute_network.nic0[0].self_link + project = var.project_id + source_ranges = ["35.235.240.0/20"] + + allow { + protocol = "tcp" + ports = ["22"] + } +} + +// GPU NICs + +resource "google_compute_network" "gpus" { + count = 8 + + auto_create_subnetworks = false + mtu = 8244 + name = "${var.resource_prefix}-gpu-${count.index}" + project = var.project_id +} + +resource "google_compute_subnetwork" "gpus" { + count = 8 + + ip_cidr_range = "10.${count.index + 1}.0.0/19" + name = "${var.resource_prefix}-gpu-${count.index}" + network = google_compute_network.gpus[count.index].self_link + project = var.project_id + region = var.region +} + +resource "google_compute_firewall" "internal-ingress-gpus" { + count = 8 + + description = "allow internal ingress traffic to gpus on nic${count.index + 1}" + direction = "INGRESS" + name = "${var.resource_prefix}-internal-ingress-gpu-${count.index}" + network = 
resource.google_compute_network.gpus[count.index].self_link + project = var.project_id + source_ranges = ["10.0.0.0/8"] + + allow { + protocol = "icmp" + } + allow { + protocol = "tcp" + ports = ["0-65535"] + } + allow { + protocol = "udp" + ports = ["0-65535"] + } +} diff --git a/a3-mega/terraform/modules/common/network/outputs.tf b/a3-mega/terraform/modules/common/network/outputs.tf new file mode 100644 index 000000000..b0416b13a --- /dev/null +++ b/a3-mega/terraform/modules/common/network/outputs.tf @@ -0,0 +1,55 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +output "network_ids" { + description = "Network ids of all the VPCs" + value = concat( + [local.nic0.network.id], + resource.google_compute_network.gpus[*].id, + ) +} + +output "network_self_links" { + description = "Network self-links of all the VPCs" + value = concat( + [local.nic0.network.self_link], + resource.google_compute_network.gpus[*].self_link, + ) +} + +output "network_names" { + description = "Network names of all the VPCs" + value = concat( + [local.nic0.network.name], + resource.google_compute_network.gpus[*].name, + ) +} + +output "subnetwork_self_links" { + description = "Subnet self-links of all the VPCs" + value = concat( + [local.nic0.subnetwork.self_link], + resource.google_compute_subnetwork.gpus[*].self_link, + ) +} + +output "subnetwork_names" { + description = "Subnet names of all the VPCs" + value = concat( + [local.nic0.subnetwork.name], + resource.google_compute_subnetwork.gpus[*].name, + ) +} diff --git a/a3-mega/terraform/modules/common/network/variables.tf b/a3-mega/terraform/modules/common/network/variables.tf new file mode 100644 index 000000000..fbf5526ee --- /dev/null +++ b/a3-mega/terraform/modules/common/network/variables.tf @@ -0,0 +1,46 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +variable "nic0_existing" { + description = "Existing network to attach to nic0. Setting to null will create a new network for it." 
+ type = object({ + network_name = string + subnetwork_name = string + }) +} + +variable "project_id" { + description = <<-EOT + The ID of the project in which the resource belongs. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_subnetwork#project). + EOT + type = string +} + +variable "region" { + description = <<-EOT + The region in which the subnetwork(s) has been / will be created. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_subnetwork#region), [gcloud](https://cloud.google.com/sdk/gcloud/reference/compute/networks/subnets/create#--region). + EOT + type = string +} + +variable "resource_prefix" { + description = "Arbitrary string with which all names of newly created resources will be prefixed." + type = string +} diff --git a/a3-mega/terraform/modules/common/network/versions.tf b/a3-mega/terraform/modules/common/network/versions.tf new file mode 100644 index 000000000..71e0d54ee --- /dev/null +++ b/a3-mega/terraform/modules/common/network/versions.tf @@ -0,0 +1,31 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = ">= 3.83" + } + + google-beta = { + source = "hashicorp/google-beta" + version = ">= 4.12" + } + } + + required_version = ">= 0.14.0" +} diff --git a/a3-mega/terraform/modules/common/resource_policy/README.md b/a3-mega/terraform/modules/common/resource_policy/README.md new file mode 100644 index 000000000..f4af86c15 --- /dev/null +++ b/a3-mega/terraform/modules/common/resource_policy/README.md @@ -0,0 +1,52 @@ + +Copyright 2022 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +No requirements. + +## Providers + +| Name | Version | +|------|---------| +| [google-beta](#provider\_google-beta) | n/a | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [google-beta_google_compute_resource_policy.new_placement_policy](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_compute_resource_policy) | resource | +| [google-beta_google_compute_resource_policy.existing_placement_policy](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/data-sources/google_compute_resource_policy) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [existing\_resource\_policy\_name](#input\_existing\_resource\_policy\_name) | The name of the existing resource policy.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_resource_policy#name). | `string` | `null` | no | +| [new\_resource\_policy\_name](#input\_new\_resource\_policy\_name) | The name of the new resource policy to be created.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_resource_policy#name). | `string` | n/a | yes | +| [project\_id](#input\_project\_id) | The ID of the project in which the resource belongs.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_resource_policy#project). | `string` | n/a | yes | +| [region](#input\_region) | The region in which the resource policy(s) has been / will be created.

Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_resource_policy#region). | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [resource\_name](#output\_resource\_name) | The self\_link of the resource policy created. | +| [resource\_self\_link](#output\_resource\_self\_link) | The self\_link of the resource policy created. | + \ No newline at end of file diff --git a/a3-mega/terraform/modules/common/resource_policy/main.tf b/a3-mega/terraform/modules/common/resource_policy/main.tf new file mode 100644 index 000000000..4d51f904f --- /dev/null +++ b/a3-mega/terraform/modules/common/resource_policy/main.tf @@ -0,0 +1,48 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +resource "google_compute_resource_policy" "new_placement_policy" { + provider = google-beta + count = var.existing_resource_policy_name != null ? 0 : 1 + name = var.new_resource_policy_name + project = var.project_id + region = var.region + group_placement_policy { + collocation = "COLLOCATED" + max_distance = 2 + } + + lifecycle { + precondition { + condition = var.existing_resource_policy_name == null || var.new_resource_policy_name == null + error_message = "Both existing_resource_policy_name and new_placement_policy cannot be specified together." 
+ } + } +} + +data "google_compute_resource_policy" "existing_placement_policy" { + provider = google-beta + count = var.existing_resource_policy_name == null ? 0 : 1 + name = var.existing_resource_policy_name + project = var.project_id + region = var.region + lifecycle { + precondition { + condition = var.existing_resource_policy_name == null || var.new_resource_policy_name == null + error_message = "Both existing_resource_policy_name and new_placement_policy cannot be specified together." + } + } +} \ No newline at end of file diff --git a/a3-mega/terraform/modules/common/resource_policy/outputs.tf b/a3-mega/terraform/modules/common/resource_policy/outputs.tf new file mode 100644 index 000000000..9f5b383d2 --- /dev/null +++ b/a3-mega/terraform/modules/common/resource_policy/outputs.tf @@ -0,0 +1,31 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +output "resource_self_link" { + description = "The self_link of the resource policy created." + value = one(concat( + resource.google_compute_resource_policy.new_placement_policy[*].self_link, + data.google_compute_resource_policy.existing_placement_policy[*].self_link, + )) +} + +output "resource_name" { + description = "The self_link of the resource policy created." 
+ value = one(concat( + resource.google_compute_resource_policy.new_placement_policy[*].name, + data.google_compute_resource_policy.existing_placement_policy[*].name, + )) +} \ No newline at end of file diff --git a/a3-mega/terraform/modules/common/resource_policy/variables.tf b/a3-mega/terraform/modules/common/resource_policy/variables.tf new file mode 100644 index 000000000..1350869c4 --- /dev/null +++ b/a3-mega/terraform/modules/common/resource_policy/variables.tf @@ -0,0 +1,54 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +variable "project_id" { + description = <<-EOT + The ID of the project in which the resource belongs. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_resource_policy#project). + EOT + type = string + nullable = false +} + +variable "new_resource_policy_name" { + description = <<-EOT + The name of the new resource policy to be created. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_resource_policy#name). + EOT + type = string +} + +variable "existing_resource_policy_name" { + description = <<-EOT + The name of the existing resource policy. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_resource_policy#name). 
+ EOT + type = string + default = null +} + +variable "region" { + description = <<-EOT + The region in which the resource policy(s) has been / will be created. + + Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_resource_policy#region). + EOT + type = string + nullable = false +} diff --git a/scripts/entrypoint_helpers.sh b/scripts/entrypoint_helpers.sh index c9f6f5b5b..4388f149b 100644 --- a/scripts/entrypoint_helpers.sh +++ b/scripts/entrypoint_helpers.sh @@ -54,6 +54,7 @@ Parameters: - a2 [docs](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a2-vms) - a3 [blog post](https://cloud.google.com/blog/products/compute/introducing-a3-supercomputers-with-nvidia-h100-gpus) (docs not available yet) + - a3-mega (docs not available yet) var_file Terraform variables file. Defaults to: '${PWD}/input/terraform.tfvars' EOT @@ -163,7 +164,7 @@ entrypoint_helpers::expect_contains () { entrypoint_helpers::validate_args () { local valid=true - declare -ar expected_machine_types=('a2' 'a3') + declare -ar expected_machine_types=('a2' 'a3' 'a3-mega') entrypoint_helpers::expect_contains expected_machine_types arg_machine_type \ || valid=false diff --git a/test/continuous/a3-mega/terraform/modules/cluster/gke/input/simple.tfvars b/test/continuous/a3-mega/terraform/modules/cluster/gke/input/simple.tfvars new file mode 100644 index 000000000..9c8db5236 --- /dev/null +++ b/test/continuous/a3-mega/terraform/modules/cluster/gke/input/simple.tfvars @@ -0,0 +1,13 @@ +node_pools = [ + { + zone = "us-central1-a" + machine_type = "a3-megagpu-8g" + node_count = 0 + }, + { + zone = "us-central1-a" + machine_type = "a3-megagpu-8g" + node_count = 0 + }, +] +region = "us-central1" diff --git a/test/continuous/a3-mega/terraform/modules/cluster/gke/tests.sh b/test/continuous/a3-mega/terraform/modules/cluster/gke/tests.sh new file mode 100644 index 000000000..4deec8182 --- /dev/null +++ 
b/test/continuous/a3-mega/terraform/modules/cluster/gke/tests.sh @@ -0,0 +1,24 @@ +. ./test/helpers.sh + +a3-mega::terraform::gke::src_dir () { + echo "${PWD}/a3-mega/terraform/modules/cluster/gke" +} + +a3-mega::terraform::gke::input_dir () { + echo "${PWD}/test/continuous/a3-mega/terraform/modules/cluster/gke/input" +} + +test::a3-mega::terraform::gke () { + helpers::terraform_init "$(a3-mega::terraform::gke::src_dir)" +} + +test::a3-mega::terraform::gke::simple () { + local -r tfvars=$(mktemp) + local success=true + helpers::append_tfvars "$(a3-mega::terraform::gke::input_dir)/simple.tfvars" gke >"${tfvars}" + + ./scripts/entrypoint.sh create a3-mega gke "${tfvars}" || success=false + ./scripts/entrypoint.sh destroy a3-mega gke "${tfvars}" || success=false + + [ "${success}" = true ] +} diff --git a/test/continuous/a3-mega/terraform/modules/cluster/mig-cos/input/simple.tfvars b/test/continuous/a3-mega/terraform/modules/cluster/mig-cos/input/simple.tfvars new file mode 100644 index 000000000..02c8252cc --- /dev/null +++ b/test/continuous/a3-mega/terraform/modules/cluster/mig-cos/input/simple.tfvars @@ -0,0 +1,13 @@ +instance_groups = [ + { + target_size = 0 + zone = "us-central1-a" + machine_type = "a3-megagpu-8g" + }, + { + target_size = 0 + zone = "us-central1-a" + machine_type = "a3-megagpu-8g" + }, +] +region = "us-central1" diff --git a/test/continuous/a3-mega/terraform/modules/cluster/mig-cos/tests.sh b/test/continuous/a3-mega/terraform/modules/cluster/mig-cos/tests.sh new file mode 100644 index 000000000..d58e28f19 --- /dev/null +++ b/test/continuous/a3-mega/terraform/modules/cluster/mig-cos/tests.sh @@ -0,0 +1,24 @@ +. 
./test/helpers.sh + +a3-mega::terraform::mig-cos::src_dir () { + echo "${PWD}/a3-mega/terraform/modules/cluster/mig-cos" +} + +a3-mega::terraform::mig-cos::input_dir () { + echo "${PWD}/test/continuous/a3-mega/terraform/modules/cluster/mig-cos/input" +} + +test::a3-mega::terraform::mig-cos () { + helpers::terraform_init "$(a3-mega::terraform::mig-cos::src_dir)" +} + +test::a3-mega::terraform::mig-cos::simple () { + local -r tfvars=$(mktemp) + local success=true + helpers::append_tfvars "$(a3-mega::terraform::mig-cos::input_dir)/simple.tfvars" mig-cos >"${tfvars}" + + ./scripts/entrypoint.sh create a3-mega mig-cos "${tfvars}" || success=false + ./scripts/entrypoint.sh destroy a3-mega mig-cos "${tfvars}" || success=false + + [ "${success}" = true ] +} diff --git a/test/continuous/a3-mega/terraform/modules/cluster/mig/input/simple.tfvars b/test/continuous/a3-mega/terraform/modules/cluster/mig/input/simple.tfvars new file mode 100644 index 000000000..02c8252cc --- /dev/null +++ b/test/continuous/a3-mega/terraform/modules/cluster/mig/input/simple.tfvars @@ -0,0 +1,13 @@ +instance_groups = [ + { + target_size = 0 + zone = "us-central1-a" + machine_type = "a3-megagpu-8g" + }, + { + target_size = 0 + zone = "us-central1-a" + machine_type = "a3-megagpu-8g" + }, +] +region = "us-central1" diff --git a/test/continuous/a3-mega/terraform/modules/cluster/mig/tests.sh b/test/continuous/a3-mega/terraform/modules/cluster/mig/tests.sh new file mode 100644 index 000000000..af408ee79 --- /dev/null +++ b/test/continuous/a3-mega/terraform/modules/cluster/mig/tests.sh @@ -0,0 +1,24 @@ +. 
./test/helpers.sh + +a3-mega::terraform::mig::src_dir () { + echo "${PWD}/a3-mega/terraform/modules/cluster/mig" +} + +a3-mega::terraform::mig::input_dir () { + echo "${PWD}/test/continuous/a3-mega/terraform/modules/cluster/mig/input" +} + +test::a3-mega::terraform::mig () { + helpers::terraform_init "$(a3-mega::terraform::mig::src_dir)" +} + +test::a3-mega::terraform::mig::simple () { + local -r tfvars=$(mktemp) + local success=true + helpers::append_tfvars "$(a3-mega::terraform::mig::input_dir)/simple.tfvars" mig >"${tfvars}" + + ./scripts/entrypoint.sh create a3-mega mig "${tfvars}" || success=false + ./scripts/entrypoint.sh destroy a3-mega mig "${tfvars}" || success=false + + [ "${success}" = true ] +} diff --git a/test/continuous/run.sh b/test/continuous/run.sh index 3c765e53f..ce7fc23d2 100755 --- a/test/continuous/run.sh +++ b/test/continuous/run.sh @@ -3,10 +3,15 @@ . ./test/runner.sh . ./test/continuous/a2/terraform/modules/cluster/mig/tests.sh -#. ./test/continuous/a3/terraform/modules/cluster/gke/tests.sh +# . ./test/continuous/a3/terraform/modules/cluster/gke/tests.sh # . ./test/continuous/a3/terraform/modules/cluster/gke-beta/tests.sh . ./test/continuous/a3/terraform/modules/cluster/mig/tests.sh . ./test/continuous/a3/terraform/modules/cluster/mig-cos/tests.sh +# Removing until a3-mega is released +# . ./test/continuous/a3-mega/terraform/modules/cluster/mig/tests.sh +# . ./test/continuous/a3-mega/terraform/modules/cluster/mig-cos/tests.sh +# . ./test/continuous/a3-mega/terraform/modules/cluster/gke/tests.sh +# . ./test/continuous/a3-mega/terraform/modules/cluster/gke-beta/tests.sh # Removing until slurm actually works on `a3-highgpu-8g` #. 
./test/continuous/a3/terraform/modules/cluster/slurm/tests.sh diff --git a/test/pr/a3-mega/terraform/modules/cluster/gke/input/gke-compact-pp.tfvars b/test/pr/a3-mega/terraform/modules/cluster/gke/input/gke-compact-pp.tfvars new file mode 100644 index 000000000..55b64abe0 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/gke/input/gke-compact-pp.tfvars @@ -0,0 +1,8 @@ +region = "us-east4" +node_pools = [{ + zone = "us-east4-a" + node_count = 17 + compact_placement_policy = { + new_policy = true + } +}] diff --git a/test/pr/a3-mega/terraform/modules/cluster/gke/input/gke-existing-rp.tfvars b/test/pr/a3-mega/terraform/modules/cluster/gke/input/gke-existing-rp.tfvars new file mode 100644 index 000000000..a5802f813 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/gke/input/gke-existing-rp.tfvars @@ -0,0 +1,8 @@ +region = "us-east4" +node_pools = [{ + zone = "us-east4-a" + node_count = 17 + compact_placement_policy = { + existing_policy_name = "test-rp" + } +}] diff --git a/test/pr/a3-mega/terraform/modules/cluster/gke/input/gke-gpu.tfvars b/test/pr/a3-mega/terraform/modules/cluster/gke/input/gke-gpu.tfvars new file mode 100644 index 000000000..3d17c2d59 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/gke/input/gke-gpu.tfvars @@ -0,0 +1,9 @@ +region = "us-east4" +network_existing = { + network_name = "default" + subnetwork_name = "default" +} +node_pools = [{ + zone = "us-east4-a" + node_count = 1 +}] diff --git a/test/pr/a3-mega/terraform/modules/cluster/gke/output/gke-compact-pp.json b/test/pr/a3-mega/terraform/modules/cluster/gke/output/gke-compact-pp.json new file mode 100644 index 000000000..a6c795875 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/gke/output/gke-compact-pp.json @@ -0,0 +1,91 @@ +{ + "planned_values": { + "root_module": { + "child_modules": [ + { + "address": "module.kubectl-apply" + }, + { + "address": "module.dashboard" + }, + { + "resources": [ + { + "values": { + "group_placement_policy": [ + 
{ + "availability_domain_count": null, + "collocation": "COLLOCATED", + "max_distance": 2, + "vm_count": null + } + ] + } + } + ], + "address": "module.resource_policy[\"np-0\"]" + } + ], + "resources": [ + { + "address": "google_container_cluster.cluster", + "values": { + "addons_config": [ + { + "gcs_fuse_csi_driver_config": [ + { + "enabled": true + } + ] + } + ], + "datapath_provider": "ADVANCED_DATAPATH", + "enable_multi_networking": true, + "networking_mode": "VPC_NATIVE", + "release_channel": [ + { + "channel": "UNSPECIFIED" + } + ], + "location": "us-east4" + } + }, + { + "address": "google_container_node_pool.node-pools[0]", + "values": { + "node_count": 17, + "node_locations": [ + "us-east4-a" + ], + "network_config": [ + { + "additional_node_network_configs": [], + "additional_pod_network_configs": [], + "create_pod_range": null + } + ], + "node_config": [ + { + "disk_size_gb": 200, + "disk_type": "pd-ssd", + "gvnic": [ + { + "enabled": true + } + ], + "image_type": "COS_CONTAINERD", + "machine_type": "a3-megagpu-8g" + } + ], + "placement_policy": [ + { + "tpu_topology": null, + "type": "COMPACT" + } + ] + } + } + ] + } + } +} diff --git a/test/pr/a3-mega/terraform/modules/cluster/gke/output/gke-existing-rp.json b/test/pr/a3-mega/terraform/modules/cluster/gke/output/gke-existing-rp.json new file mode 100644 index 000000000..eefde9f78 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/gke/output/gke-existing-rp.json @@ -0,0 +1,78 @@ +{ + "planned_values": { + "root_module": { + "child_modules": [ + { + "address": "module.kubectl-apply" + }, + { + "address": "module.dashboard" + }, + { + "address": "module.network" + } + ], + "resources": [ + { + "address": "google_container_cluster.cluster", + "values": { + "addons_config": [ + { + "gcs_fuse_csi_driver_config": [ + { + "enabled": true + } + ] + } + ], + "datapath_provider": "ADVANCED_DATAPATH", + "enable_multi_networking": true, + "networking_mode": "VPC_NATIVE", + "release_channel": [ + { + 
"channel": "UNSPECIFIED" + } + ], + "location": "us-east4" + } + }, + { + "address": "google_container_node_pool.node-pools[0]", + "values": { + "node_count": 17, + "node_locations": [ + "us-east4-a" + ], + "network_config": [ + { + "additional_node_network_configs": [], + "additional_pod_network_configs": [], + "create_pod_range": null + } + ], + "node_config": [ + { + "disk_size_gb": 200, + "disk_type": "pd-ssd", + "gvnic": [ + { + "enabled": true + } + ], + "image_type": "COS_CONTAINERD", + "machine_type": "a3-megagpu-8g" + } + ], + "placement_policy": [ + { + "policy_name": "test-rp", + "tpu_topology": null, + "type": "COMPACT" + } + ] + } + } + ] + } + } +} diff --git a/test/pr/a3-mega/terraform/modules/cluster/gke/output/gke-gpu.json b/test/pr/a3-mega/terraform/modules/cluster/gke/output/gke-gpu.json new file mode 100644 index 000000000..001c80c7b --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/gke/output/gke-gpu.json @@ -0,0 +1,68 @@ +{ + "planned_values": { + "root_module": { + "child_modules": [ + { + "address": "module.kubectl-apply" + }, + { + "address": "module.dashboard" + } + ], + "resources": [ + { + "address": "google_container_cluster.cluster", + "values": { + "addons_config": [ + { + "gcs_fuse_csi_driver_config": [ + { + "enabled": true + } + ] + } + ], + "datapath_provider": "ADVANCED_DATAPATH", + "enable_multi_networking": true, + "networking_mode": "VPC_NATIVE", + "release_channel": [ + { + "channel": "UNSPECIFIED" + } + ], + "location": "us-east4" + } + }, + { + "address": "google_container_node_pool.node-pools[0]", + "values": { + "node_count": 1, + "node_locations": [ + "us-east4-a" + ], + "network_config": [ + { + "additional_node_network_configs": [], + "additional_pod_network_configs": [], + "create_pod_range": null + } + ], + "node_config": [ + { + "disk_size_gb": 200, + "disk_type": "pd-ssd", + "gvnic": [ + { + "enabled": true + } + ], + "image_type": "COS_CONTAINERD", + "machine_type": "a3-megagpu-8g" + } + ] + } + } + ] 
+ } + } +} diff --git a/test/pr/a3-mega/terraform/modules/cluster/gke/tests.sh b/test/pr/a3-mega/terraform/modules/cluster/gke/tests.sh new file mode 100644 index 000000000..60ae7c087 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/gke/tests.sh @@ -0,0 +1,68 @@ +. ./test/helpers.sh + +a3-mega::terraform::gke::src_dir () { + echo "${PWD}/a3-mega/terraform/modules/cluster/gke" +} + +a3-mega::terraform::gke::input_dir () { + echo "${PWD}/test/pr/a3-mega/terraform/modules/cluster/gke/input" +} + +a3-mega::terraform::gke::output_dir () { + echo "${PWD}/test/pr/a3-mega/terraform/modules/cluster/gke/output" +} + +test::a3-mega::terraform::gke () { + EXPECT_SUCCEED helpers::terraform_init "$(a3-mega::terraform::gke::src_dir)" +} + +test::a3-mega::terraform::gke::gpu_create_modules () { + local -r tfvars=$(mktemp) + helpers::append_tfvars "$(a3-mega::terraform::gke::input_dir)/gke-gpu.tfvars" gke >"${tfvars}" + + local -r tfplan=$(mktemp) + EXPECT_SUCCEED helpers::terraform_plan \ + "$(a3-mega::terraform::gke::src_dir)" \ + "${tfvars}" \ + "${tfplan}" + + local -r tfshow=$(mktemp) + helpers::terraform_show "$(a3-mega::terraform::gke::src_dir)" "${tfplan}" >"${tfshow}" + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::gke::output_dir)/gke-gpu.json" \ + "${tfshow}" +} + +test::a3-mega::terraform::gke::compact_pp_create_modules () { + local -r tfvars=$(mktemp) + helpers::append_tfvars "$(a3-mega::terraform::gke::input_dir)/gke-compact-pp.tfvars" gke >"${tfvars}" + + local -r tfplan=$(mktemp) + EXPECT_SUCCEED helpers::terraform_plan \ + "$(a3-mega::terraform::gke::src_dir)" \ + "${tfvars}" \ + "${tfplan}" + + local -r tfshow=$(mktemp) + helpers::terraform_show "$(a3-mega::terraform::gke::src_dir)" "${tfplan}" >"${tfshow}" + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::gke::output_dir)/gke-compact-pp.json" \ + "${tfshow}" +} + +test::a3-mega::terraform::gke::existing_rp_create_modules () { + local -r tfvars=$(mktemp) + helpers::append_tfvars "$(a3-mega::terraform::gke::input_dir)/gke-existing-rp.tfvars" gke >"${tfvars}" + + 
local -r tfplan=$(mktemp) + EXPECT_SUCCEED helpers::terraform_plan \ + "$(a3-mega::terraform::gke::src_dir)" \ + "${tfvars}" \ + "${tfplan}" + + local -r tfshow=$(mktemp) + helpers::terraform_show "$(a3-mega::terraform::gke::src_dir)" "${tfplan}" >"${tfshow}" + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::gke::output_dir)/gke-existing-rp.json" \ + "${tfshow}" +} diff --git a/test/pr/a3-mega/terraform/modules/cluster/mig-cos/input/existing-rp.tfvars b/test/pr/a3-mega/terraform/modules/cluster/mig-cos/input/existing-rp.tfvars new file mode 100644 index 000000000..212e2b6e9 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/mig-cos/input/existing-rp.tfvars @@ -0,0 +1,31 @@ +instance_groups = [ + { + target_size = 4 + zone = "us-east4-a" + }, + { + target_size = 1 + zone = "us-east4-a" + machine_type = "a3-megagpu-8g" + existing_resource_policy_name = "test-rp" + }, + { + target_size = 1 + zone = "us-east4-a" + machine_type = "a3-megagpu-8g" + existing_resource_policy_name = "test-rp" + }, +] +region = "us-east4" + +use_compact_placement_policy = true +container = { + image = "debian" + cmd = "sleep infinity" + run_at_boot = true + run_options = { + custom = ["--shm-size=250g"] + enable_cloud_logging = true + env = { some_key = "some_value" } + } +} diff --git a/test/pr/a3-mega/terraform/modules/cluster/mig-cos/input/multi.tfvars b/test/pr/a3-mega/terraform/modules/cluster/mig-cos/input/multi.tfvars new file mode 100644 index 000000000..2f6d47dc8 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/mig-cos/input/multi.tfvars @@ -0,0 +1,24 @@ +instance_groups = [ + { + target_size = 4 + zone = "us-east4-a" + }, + { + target_size = 1 + zone = "us-east4-a" + machine_type = "a3-megagpu-8g" + }, +] +region = "us-east4" + +use_compact_placement_policy = true +container = { + image = "debian" + cmd = "sleep infinity" + run_at_boot = true + run_options = { + custom = ["--shm-size=250g"] + enable_cloud_logging = true + env = { some_key = "some_value" } 
+ } +} diff --git a/test/pr/a3-mega/terraform/modules/cluster/mig-cos/input/simple.tfvars b/test/pr/a3-mega/terraform/modules/cluster/mig-cos/input/simple.tfvars new file mode 100644 index 000000000..1e18301f8 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/mig-cos/input/simple.tfvars @@ -0,0 +1,19 @@ +instance_groups = [ + { + target_size = 4 + zone = "us-east4-a" + }, +] +region = "us-east4" + +use_compact_placement_policy = true +container = { + image = "debian" + cmd = "sleep infinity" + run_at_boot = true + run_options = { + custom = ["--shm-size=250g"] + enable_cloud_logging = true + env = { some_key = "some_value" } + } +} diff --git a/test/pr/a3-mega/terraform/modules/cluster/mig-cos/output/existing-rp.json b/test/pr/a3-mega/terraform/modules/cluster/mig-cos/output/existing-rp.json new file mode 100644 index 000000000..2d13c3b7e --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/mig-cos/output/existing-rp.json @@ -0,0 +1,49 @@ +{ + "planned_values": { + "root_module": { + "child_modules": [ + { + "address": "module.compute_instance_template[0]", + "child_modules": [ + { + "address": "module.compute_instance_template[0].module.resource_policy[0]" + } + ] + }, + { + "resources": [ + { + "values": { + "resource_policies": [ + "resourcePolicies/test-rp" + ] + } + } + ], + "address": "module.compute_instance_template[1]" + }, + { + "resources": [ + { + "values": { + "resource_policies": [ + "resourcePolicies/test-rp" + ] + } + } + ], + "address": "module.compute_instance_template[2]" + }, + { + "address": "module.compute_instance_group_manager[0]" + }, + { + "address": "module.compute_instance_group_manager[1]" + }, + { + "address": "module.compute_instance_group_manager[2]" + } + ] + } + } +} diff --git a/test/pr/a3-mega/terraform/modules/cluster/mig-cos/output/modules.json b/test/pr/a3-mega/terraform/modules/cluster/mig-cos/output/modules.json new file mode 100644 index 000000000..6944f931e --- /dev/null +++ 
b/test/pr/a3-mega/terraform/modules/cluster/mig-cos/output/modules.json @@ -0,0 +1,19 @@ +{ + "planned_values": { + "root_module": { + "child_modules": [ + { + "address": "module.compute_instance_template[0]", + "child_modules": [ + { + "address": "module.compute_instance_template[0].module.resource_policy[0]" + } + ] + }, + { + "address": "module.compute_instance_group_manager[0]" + } + ] + } + } +} diff --git a/test/pr/a3-mega/terraform/modules/cluster/mig-cos/output/multimodules.json b/test/pr/a3-mega/terraform/modules/cluster/mig-cos/output/multimodules.json new file mode 100644 index 000000000..3eb2d8562 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/mig-cos/output/multimodules.json @@ -0,0 +1,30 @@ +{ + "planned_values": { + "root_module": { + "child_modules": [ + { + "address": "module.compute_instance_template[0]", + "child_modules": [ + { + "address": "module.compute_instance_template[0].module.resource_policy[0]" + } + ] + }, + { + "address": "module.compute_instance_template[1]", + "child_modules": [ + { + "address": "module.compute_instance_template[1].module.resource_policy[0]" + } + ] + }, + { + "address": "module.compute_instance_group_manager[0]" + }, + { + "address": "module.compute_instance_group_manager[1]" + } + ] + } + } +} diff --git a/test/pr/a3-mega/terraform/modules/cluster/mig-cos/tests.sh b/test/pr/a3-mega/terraform/modules/cluster/mig-cos/tests.sh new file mode 100644 index 000000000..5519f1b3b --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/mig-cos/tests.sh @@ -0,0 +1,68 @@ +. 
./test/helpers.sh + +a3-mega::terraform::mig-cos::src_dir () { + echo "${PWD}/a3-mega/terraform/modules/cluster/mig-cos" +} + +a3-mega::terraform::mig-cos::input_dir () { + echo "${PWD}/test/pr/a3-mega/terraform/modules/cluster/mig-cos/input" +} + +a3-mega::terraform::mig-cos::output_dir () { + echo "${PWD}/test/pr/a3-mega/terraform/modules/cluster/mig-cos/output" +} + +test::a3-mega::terraform::mig-cos () { + EXPECT_SUCCEED helpers::terraform_init "$(a3-mega::terraform::mig-cos::src_dir)" +} + +test::a3-mega::terraform::mig-cos::simple_create_modules () { + local -r tfvars=$(mktemp) + helpers::append_tfvars "$(a3-mega::terraform::mig-cos::input_dir)/simple.tfvars" mig-cos >"${tfvars}" + + local -r tfplan=$(mktemp) + EXPECT_SUCCEED helpers::terraform_plan \ + "$(a3-mega::terraform::mig-cos::src_dir)" \ + "${tfvars}" \ + "${tfplan}" + + local -r tfshow=$(mktemp) + helpers::terraform_show "$(a3-mega::terraform::mig-cos::src_dir)" "${tfplan}" >"${tfshow}" + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::mig-cos::output_dir)/modules.json" \ + "${tfshow}" +} + +test::a3-mega::terraform::mig-cos::multiple_create_modules () { + local -r tfvars=$(mktemp) + helpers::append_tfvars "$(a3-mega::terraform::mig-cos::input_dir)/multi.tfvars" mig-cos >"${tfvars}" + + local -r tfplan=$(mktemp) + EXPECT_SUCCEED helpers::terraform_plan \ + "$(a3-mega::terraform::mig-cos::src_dir)" \ + "${tfvars}" \ + "${tfplan}" + + local -r tfshow=$(mktemp) + helpers::terraform_show "$(a3-mega::terraform::mig-cos::src_dir)" "${tfplan}" >"${tfshow}" + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::mig-cos::output_dir)/multimodules.json" \ + "${tfshow}" +} + +test::a3-mega::terraform::mig-cos::existing_rp_create_modules () { + local -r tfvars=$(mktemp) + helpers::append_tfvars "$(a3-mega::terraform::mig-cos::input_dir)/existing-rp.tfvars" mig-cos >"${tfvars}" + + local -r tfplan=$(mktemp) + EXPECT_SUCCEED helpers::terraform_plan \ + 
"$(a3-mega::terraform::mig-cos::src_dir)" \ + "${tfvars}" \ + "${tfplan}" + + local -r tfshow=$(mktemp) + helpers::terraform_show "$(a3-mega::terraform::mig-cos::src_dir)" "${tfplan}" >"${tfshow}" + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::mig-cos::output_dir)/existing-rp.json" \ + "${tfshow}" +} diff --git a/test/pr/a3-mega/terraform/modules/cluster/mig/input/existing-rp.tfvars b/test/pr/a3-mega/terraform/modules/cluster/mig/input/existing-rp.tfvars new file mode 100644 index 000000000..dba53297e --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/mig/input/existing-rp.tfvars @@ -0,0 +1,20 @@ +instance_groups = [ + { + target_size = 1 + zone = "us-east4-a" + }, + { + target_size = 1 + zone = "us-east4-a" + machine_type = "a3-megagpu-8g" + existing_resource_policy_name = "test-rp" + }, + { + target_size = 1 + zone = "us-east4-a" + machine_type = "a3-megagpu-8g" + existing_resource_policy_name = "test-rp" + }, +] +region = "us-east4" +use_compact_placement_policy = true diff --git a/test/pr/a3-mega/terraform/modules/cluster/mig/input/multi.tfvars b/test/pr/a3-mega/terraform/modules/cluster/mig/input/multi.tfvars new file mode 100644 index 000000000..685dc5738 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/mig/input/multi.tfvars @@ -0,0 +1,13 @@ +instance_groups = [ + { + target_size = 1 + zone = "us-central1-a" + }, + { + target_size = 1 + zone = "us-central1-a" + machine_type = "a3-megagpu-8g" + }, +] +region = "us-central1" +use_compact_placement_policy = true diff --git a/test/pr/a3-mega/terraform/modules/cluster/mig/input/simple.tfvars b/test/pr/a3-mega/terraform/modules/cluster/mig/input/simple.tfvars new file mode 100644 index 000000000..8f29df926 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/mig/input/simple.tfvars @@ -0,0 +1,8 @@ +instance_groups = [ + { + target_size = 1 + zone = "us-central1-a" + }, +] +region = "us-central1" +use_compact_placement_policy = true diff --git 
a/test/pr/a3-mega/terraform/modules/cluster/mig/output/existing-rp.json b/test/pr/a3-mega/terraform/modules/cluster/mig/output/existing-rp.json new file mode 100644 index 000000000..8ea460d6e --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/mig/output/existing-rp.json @@ -0,0 +1,55 @@ +{ + "planned_values": { + "root_module": { + "child_modules": [ + { + "address": "module.compute_instance_template[0]", + "child_modules": [ + { + "address": "module.compute_instance_template[0].module.resource_policy[0]" + } + ] + }, + { + "resources": [ + { + "values": { + "resource_policies": [ + "resourcePolicies/test-rp" + ] + } + } + ], + "address": "module.compute_instance_template[1]" + }, + { + "resources": [ + { + "values": { + "resource_policies": [ + "resourcePolicies/test-rp" + ] + } + } + ], + "address": "module.compute_instance_template[2]" + }, + { + "address": "module.compute_instance_group_manager[0]" + }, + { + "address": "module.compute_instance_group_manager[1]" + }, + { + "address": "module.compute_instance_group_manager[2]" + }, + { + "address": "module.dashboard" + }, + { + "address": "module.startup" + } + ] + } + } +} diff --git a/test/pr/a3-mega/terraform/modules/cluster/mig/output/modules.json b/test/pr/a3-mega/terraform/modules/cluster/mig/output/modules.json new file mode 100644 index 000000000..2c09cf662 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/mig/output/modules.json @@ -0,0 +1,25 @@ +{ + "planned_values": { + "root_module": { + "child_modules": [ + { + "address": "module.compute_instance_template[0]", + "child_modules": [ + { + "address": "module.compute_instance_template[0].module.resource_policy[0]" + } + ] + }, + { + "address": "module.compute_instance_group_manager[0]" + }, + { + "address": "module.dashboard" + }, + { + "address": "module.startup" + } + ] + } + } +} diff --git a/test/pr/a3-mega/terraform/modules/cluster/mig/output/multimodules.json 
b/test/pr/a3-mega/terraform/modules/cluster/mig/output/multimodules.json new file mode 100644 index 000000000..232d4bf33 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/mig/output/multimodules.json @@ -0,0 +1,36 @@ +{ + "planned_values": { + "root_module": { + "child_modules": [ + { + "address": "module.compute_instance_template[0]", + "child_modules": [ + { + "address": "module.compute_instance_template[0].module.resource_policy[0]" + } + ] + }, + { + "address": "module.compute_instance_template[1]", + "child_modules": [ + { + "address": "module.compute_instance_template[1].module.resource_policy[0]" + } + ] + }, + { + "address": "module.compute_instance_group_manager[0]" + }, + { + "address": "module.compute_instance_group_manager[1]" + }, + { + "address": "module.dashboard" + }, + { + "address": "module.startup" + } + ] + } + } +} diff --git a/test/pr/a3-mega/terraform/modules/cluster/mig/tests.sh b/test/pr/a3-mega/terraform/modules/cluster/mig/tests.sh new file mode 100644 index 000000000..4e2892cb2 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/cluster/mig/tests.sh @@ -0,0 +1,68 @@ +. 
./test/helpers.sh + +a3-mega::terraform::mig::src_dir () { + echo "${PWD}/a3-mega/terraform/modules/cluster/mig" +} + +a3-mega::terraform::mig::input_dir () { + echo "${PWD}/test/pr/a3-mega/terraform/modules/cluster/mig/input" +} + +a3-mega::terraform::mig::output_dir () { + echo "${PWD}/test/pr/a3-mega/terraform/modules/cluster/mig/output" +} + +test::a3-mega::terraform::mig () { + EXPECT_SUCCEED helpers::terraform_init "$(a3-mega::terraform::mig::src_dir)" +} + +test::a3-mega::terraform::mig::simple_create_modules () { + local -r tfvars=$(mktemp) + helpers::append_tfvars "$(a3-mega::terraform::mig::input_dir)/simple.tfvars" mig >"${tfvars}" + + local -r tfplan=$(mktemp) + EXPECT_SUCCEED helpers::terraform_plan \ + "$(a3-mega::terraform::mig::src_dir)" \ + "${tfvars}" \ + "${tfplan}" + + local -r tfshow=$(mktemp) + helpers::terraform_show "$(a3-mega::terraform::mig::src_dir)" "${tfplan}" >"${tfshow}" + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::mig::output_dir)/modules.json" \ + "${tfshow}" +} + +test::a3-mega::terraform::mig::multiple_create_modules () { + local -r tfvars=$(mktemp) + helpers::append_tfvars "$(a3-mega::terraform::mig::input_dir)/multi.tfvars" mig >"${tfvars}" + + local -r tfplan=$(mktemp) + EXPECT_SUCCEED helpers::terraform_plan \ + "$(a3-mega::terraform::mig::src_dir)" \ + "${tfvars}" \ + "${tfplan}" + + local -r tfshow=$(mktemp) + helpers::terraform_show "$(a3-mega::terraform::mig::src_dir)" "${tfplan}" >"${tfshow}" + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::mig::output_dir)/multimodules.json" \ + "${tfshow}" +} + +test::a3-mega::terraform::mig::existing_rp_create_modules () { + local -r tfvars=$(mktemp) + helpers::append_tfvars "$(a3-mega::terraform::mig::input_dir)/existing-rp.tfvars" mig >"${tfvars}" + + local -r tfplan=$(mktemp) + EXPECT_SUCCEED helpers::terraform_plan \ + "$(a3-mega::terraform::mig::src_dir)" \ + "${tfvars}" \ + "${tfplan}" + + local -r tfshow=$(mktemp) + helpers::terraform_show 
"$(a3-mega::terraform::mig::src_dir)" "${tfplan}" >"${tfshow}" + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::mig::output_dir)/existing-rp.json" \ + "${tfshow}" +} diff --git a/test/pr/a3-mega/terraform/modules/common/dashboard/input/disable.tfvars b/test/pr/a3-mega/terraform/modules/common/dashboard/input/disable.tfvars new file mode 100644 index 000000000..2afab44ab --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/dashboard/input/disable.tfvars @@ -0,0 +1,3 @@ +enable_gce_gke_gpu_utilization_widgets = false +enable_nvidia_dcgm_widgets = false +enable_nvidia_nvml_widgets = false diff --git a/test/pr/a3-mega/terraform/modules/common/dashboard/input/enable.tfvars b/test/pr/a3-mega/terraform/modules/common/dashboard/input/enable.tfvars new file mode 100644 index 000000000..c1ff8ac5e --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/dashboard/input/enable.tfvars @@ -0,0 +1,3 @@ +enable_gce_gke_gpu_utilization_widgets = true +enable_nvidia_dcgm_widgets = true +enable_nvidia_nvml_widgets = true diff --git a/test/pr/a3-mega/terraform/modules/common/dashboard/output/data.json b/test/pr/a3-mega/terraform/modules/common/dashboard/output/data.json new file mode 100644 index 000000000..7a926e846 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/dashboard/output/data.json @@ -0,0 +1,19 @@ +{ + "prior_state": { + "values": { + "root_module": { + "resources": [ + { + "address": "data.http.gce-gke-gpu-utilization[0]" + }, + { + "address": "data.http.nvidia-dcgm[0]" + }, + { + "address": "data.http.nvidia-nvml[0]" + } + ] + } + } + } +} diff --git a/test/pr/a3-mega/terraform/modules/common/dashboard/output/modules.json b/test/pr/a3-mega/terraform/modules/common/dashboard/output/modules.json new file mode 100644 index 000000000..6d9c43877 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/dashboard/output/modules.json @@ -0,0 +1,11 @@ +{ + "planned_values": { + "root_module": { + "child_modules": [ + { + "address": 
"module.dashboard" + } + ] + } + } +} diff --git a/test/pr/a3-mega/terraform/modules/common/dashboard/tests.sh b/test/pr/a3-mega/terraform/modules/common/dashboard/tests.sh new file mode 100644 index 000000000..9f0fabc0f --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/dashboard/tests.sh @@ -0,0 +1,57 @@ +. ./test/helpers.sh + +a3-mega::terraform::dashboard::src_dir () { + echo "${PWD}/a3-mega/terraform/modules/common/dashboard" +} + +a3-mega::terraform::dashboard::input_dir () { + echo "${PWD}/test/pr/a3-mega/terraform/modules/common/dashboard/input" +} + +a3-mega::terraform::dashboard::output_dir () { + echo "${PWD}/test/pr/a3-mega/terraform/modules/common/dashboard/output" +} + +test::a3-mega::terraform::dashboard () { + EXPECT_SUCCEED helpers::terraform_init "$(a3-mega::terraform::dashboard::src_dir)" +} + +test::a3-mega::terraform::dashboard::disable_all_widgets () { + local -r tfvars=$(mktemp) + helpers::append_tfvars "$(a3-mega::terraform::dashboard::input_dir)/disable.tfvars" mig >"${tfvars}" + + local -r tfplan=$(mktemp) + EXPECT_SUCCEED helpers::terraform_plan \ + "$(a3-mega::terraform::dashboard::src_dir)" \ + "${tfvars}" \ + "${tfplan}" + + local -r tfshow=$(mktemp) + helpers::terraform_show "$(a3-mega::terraform::dashboard::src_dir)" "${tfplan}" >"${tfshow}" + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::dashboard::output_dir)/modules.json" \ + "${tfshow}" + EXPECT_SUCCEED helpers::json_omits \ + "$(a3-mega::terraform::dashboard::output_dir)/data.json" \ + "${tfshow}" +} + +test::a3-mega::terraform::dashboard::enable_all_widgets () { + local -r tfvars=$(mktemp) + helpers::append_tfvars "$(a3-mega::terraform::dashboard::input_dir)/enable.tfvars" mig >"${tfvars}" + + local -r tfplan=$(mktemp) + EXPECT_SUCCEED helpers::terraform_plan \ + "$(a3-mega::terraform::dashboard::src_dir)" \ + "${tfvars}" \ + "${tfplan}" + + local -r tfshow=$(mktemp) + helpers::terraform_show "$(a3-mega::terraform::dashboard::src_dir)" "${tfplan}" 
>"${tfshow}" + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::dashboard::output_dir)/modules.json" \ + "${tfshow}" + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::dashboard::output_dir)/data.json" \ + "${tfshow}" +} diff --git a/test/pr/a3-mega/terraform/modules/common/instance_group_manager/input/simple.tfvars b/test/pr/a3-mega/terraform/modules/common/instance_group_manager/input/simple.tfvars new file mode 100644 index 000000000..a34d54c3b --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/instance_group_manager/input/simple.tfvars @@ -0,0 +1,4 @@ +zone = "us-central1-a" +instance_template_id = "instance_template_id" +target_size = 1 +wait_for_instances = false diff --git a/test/pr/a3-mega/terraform/modules/common/instance_group_manager/output/resources.json b/test/pr/a3-mega/terraform/modules/common/instance_group_manager/output/resources.json new file mode 100644 index 000000000..8af416bd4 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/instance_group_manager/output/resources.json @@ -0,0 +1,29 @@ +{ + "planned_values": { + "root_module": { + "resources": [ + { + "address": "google_compute_instance_group_manager.mig", + "values": { + "target_size": 1, + "update_policy": [ + { + "minimal_action": "RESTART", + "replacement_method": "RECREATE", + "type": "PROACTIVE" + } + ], + "version": [ + { + "name": "default", + "target_size": [] + } + ], + "wait_for_instances": false, + "zone": "us-central1-a" + } + } + ] + } + } +} diff --git a/test/pr/a3-mega/terraform/modules/common/instance_group_manager/tests.sh b/test/pr/a3-mega/terraform/modules/common/instance_group_manager/tests.sh new file mode 100644 index 000000000..07ae274ae --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/instance_group_manager/tests.sh @@ -0,0 +1,34 @@ +. 
./test/helpers.sh + +a3-mega::terraform::instance_group_manager::src_dir () { + echo "${PWD}/a3-mega/terraform/modules/common/instance_group_manager" +} + +a3-mega::terraform::instance_group_manager::input_dir () { + echo "${PWD}/test/pr/a3-mega/terraform/modules/common/instance_group_manager/input" +} + +a3-mega::terraform::instance_group_manager::output_dir () { + echo "${PWD}/test/pr/a3-mega/terraform/modules/common/instance_group_manager/output" +} + +test::a3-mega::terraform::instance_group_manager () { + EXPECT_SUCCEED helpers::terraform_init "$(a3-mega::terraform::instance_group_manager::src_dir)" +} + +test::a3-mega::terraform::instance_group_manager::simple_create_resource () { + local -r tfvars=$(mktemp) + helpers::append_tfvars "$(a3-mega::terraform::instance_group_manager::input_dir)/simple.tfvars" mig >"${tfvars}" + + local -r tfplan=$(mktemp) + EXPECT_SUCCEED helpers::terraform_plan \ + "$(a3-mega::terraform::instance_group_manager::src_dir)" \ + "${tfvars}" \ + "${tfplan}" + + local -r tfshow=$(mktemp) + helpers::terraform_show "$(a3-mega::terraform::instance_group_manager::src_dir)" "${tfplan}" >"${tfshow}" + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::instance_group_manager::output_dir)/resources.json" \ + "${tfshow}" +} diff --git a/test/pr/a3-mega/terraform/modules/common/instance_template/input/simple.tfvars b/test/pr/a3-mega/terraform/modules/common/instance_template/input/simple.tfvars new file mode 100644 index 000000000..cf94c1196 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/instance_template/input/simple.tfvars @@ -0,0 +1,25 @@ +target_size = 1 +zone = "us-central1-a" + +disk_size_gb = 50 +disk_type = "pd-standard" +labels = {} +machine_image = { + project = "ubuntu-os-cloud" + family = "ubuntu-2204-lts" + name = null +} +machine_type = "n1-standard-8" +maintenance_interval = null +metadata = { + foo = "bar" +} +network_self_links = ["network_self_link"] +region = "us-central1" +service_account = { + 
email = "foo@bar.xyz" + scopes = ["foobar"] +} +startup_script = "echo hello world" +subnetwork_self_links = ["subnetwork_self_link"] +use_compact_placement_policy = true diff --git a/test/pr/a3-mega/terraform/modules/common/instance_template/output/resources.json b/test/pr/a3-mega/terraform/modules/common/instance_template/output/resources.json new file mode 100644 index 000000000..66ac66d20 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/instance_template/output/resources.json @@ -0,0 +1,71 @@ +{ + "planned_values": { + "root_module": { + "resources": [ + { + "address": "google_compute_instance_template.template", + "values": { + "disk": [ + { + "auto_delete": true, + "boot": true, + "disk_size_gb": 50, + "disk_type": "pd-standard" + } + ], + "machine_type": "n1-standard-8", + "metadata": { + "enable-oslogin": "TRUE", + "foo": "bar", + "install-nvidia-driver": "True", + "startup-script": "echo hello world", + "VmDnsSetting": "ZonalPreferred" + }, + "network_interface": [ + { + "access_config": [ + {} + ], + "network": "network_self_link", + "nic_type": "GVNIC", + "subnetwork": "subnetwork_self_link" + } + ], + "region": "us-central1", + "scheduling": [ + { + "automatic_restart": true, + "on_host_maintenance": "TERMINATE" + } + ], + "service_account": [ + { + "email": "foo@bar.xyz", + "scopes": [ + "foobar" + ] + } + ] + } + } + ], + "child_modules": [ + { + "resources": [ + { + "address": "module.resource_policy[0].google_compute_resource_policy.new_placement_policy[0]", + "values": { + "group_placement_policy": [ + { + "collocation": "COLLOCATED", + "max_distance": 2 + } + ] + } + } + ] + } + ] + } + } +} diff --git a/test/pr/a3-mega/terraform/modules/common/instance_template/tests.sh b/test/pr/a3-mega/terraform/modules/common/instance_template/tests.sh new file mode 100644 index 000000000..6beebecb6 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/instance_template/tests.sh @@ -0,0 +1,34 @@ +. 
./test/helpers.sh + +a3-mega::terraform::instance_template::src_dir () { + echo "${PWD}/a3-mega/terraform/modules/common/instance_template" +} + +a3-mega::terraform::instance_template::input_dir () { + echo "${PWD}/test/pr/a3-mega/terraform/modules/common/instance_template/input" +} + +a3-mega::terraform::instance_template::output_dir () { + echo "${PWD}/test/pr/a3-mega/terraform/modules/common/instance_template/output" +} + +test::a3-mega::terraform::instance_template () { + EXPECT_SUCCEED helpers::terraform_init "$(a3-mega::terraform::instance_template::src_dir)" +} + +test::a3-mega::terraform::instance_template::simple_create_resource () { + local -r tfvars=$(mktemp) + helpers::append_tfvars "$(a3-mega::terraform::instance_template::input_dir)/simple.tfvars" mig >"${tfvars}" + + local -r tfplan=$(mktemp) + EXPECT_SUCCEED helpers::terraform_plan \ + "$(a3-mega::terraform::instance_template::src_dir)" \ + "${tfvars}" \ + "${tfplan}" + + local -r tfshow=$(mktemp) + helpers::terraform_show "$(a3-mega::terraform::instance_template::src_dir)" "${tfplan}" >"${tfshow}" + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::instance_template::output_dir)/resources.json" \ + "${tfshow}" +} diff --git a/test/pr/a3-mega/terraform/modules/common/network/input/existing_network.tfvars b/test/pr/a3-mega/terraform/modules/common/network/input/existing_network.tfvars new file mode 100644 index 000000000..a6dc625d7 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/network/input/existing_network.tfvars @@ -0,0 +1,5 @@ +nic0_existing = { + network_name = "default" + subnetwork_name = "default" +} +region = "us-central1" diff --git a/test/pr/a3-mega/terraform/modules/common/network/input/new_network.tfvars b/test/pr/a3-mega/terraform/modules/common/network/input/new_network.tfvars new file mode 100644 index 000000000..46913c3ee --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/network/input/new_network.tfvars @@ -0,0 +1,2 @@ +nic0_existing = null 
+region = "us-central1" diff --git a/test/pr/a3-mega/terraform/modules/common/network/output/existing_network.json b/test/pr/a3-mega/terraform/modules/common/network/output/existing_network.json new file mode 100644 index 000000000..a272c4c34 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/network/output/existing_network.json @@ -0,0 +1,32 @@ +{ + "planned_values": { + "outputs": { + "network_names": { + "value": [ + "default", + "-gpu-0", + "-gpu-1", + "-gpu-2", + "-gpu-3", + "-gpu-4", + "-gpu-5", + "-gpu-6", + "-gpu-7" + ] + }, + "subnetwork_names": { + "value": [ + "default", + "-gpu-0", + "-gpu-1", + "-gpu-2", + "-gpu-3", + "-gpu-4", + "-gpu-5", + "-gpu-6", + "-gpu-7" + ] + } + } + } +} diff --git a/test/pr/a3-mega/terraform/modules/common/network/output/new_network.json b/test/pr/a3-mega/terraform/modules/common/network/output/new_network.json new file mode 100644 index 000000000..406262ae5 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/network/output/new_network.json @@ -0,0 +1,30 @@ +{ + "planned_values": { + "outputs": { + "network_names": { + "value": [ + "-gpu-0", + "-gpu-1", + "-gpu-2", + "-gpu-3", + "-gpu-4", + "-gpu-5", + "-gpu-6", + "-gpu-7" + ] + }, + "subnetwork_names": { + "value": [ + "-gpu-0", + "-gpu-1", + "-gpu-2", + "-gpu-3", + "-gpu-4", + "-gpu-5", + "-gpu-6", + "-gpu-7" + ] + } + } + } +} diff --git a/test/pr/a3-mega/terraform/modules/common/network/tests.sh b/test/pr/a3-mega/terraform/modules/common/network/tests.sh new file mode 100644 index 000000000..02227f602 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/network/tests.sh @@ -0,0 +1,53 @@ +. 
./test/helpers.sh + +a3-mega::terraform::network::src_dir () { + echo "${PWD}/a3-mega/terraform/modules/common/network" +} + +a3-mega::terraform::network::input_dir () { + echo "${PWD}/test/pr/a3-mega/terraform/modules/common/network/input" +} + +a3-mega::terraform::network::output_dir () { + echo "${PWD}/test/pr/a3-mega/terraform/modules/common/network/output" +} + +test::a3-mega::terraform::network () { + EXPECT_SUCCEED helpers::terraform_init "$(a3-mega::terraform::network::src_dir)" +} + +test::a3-mega::terraform::network::existing_network () { + local -r tfvars=$(mktemp) + helpers::append_tfvars "$(a3-mega::terraform::network::input_dir)/existing_network.tfvars" null >"${tfvars}" + + local -r tfplan=$(mktemp) + EXPECT_SUCCEED helpers::terraform_plan \ + "$(a3-mega::terraform::network::src_dir)" \ + "${tfvars}" \ + "${tfplan}" + + local -r tfshow=$(mktemp) + helpers::terraform_show "$(a3-mega::terraform::network::src_dir)" "${tfplan}" >"${tfshow}" + + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::network::output_dir)/existing_network.json" \ + "${tfshow}" +} + +test::a3-mega::terraform::network::new_network () { + local -r tfvars=$(mktemp) + helpers::append_tfvars "$(a3-mega::terraform::network::input_dir)/new_network.tfvars" null >"${tfvars}" + + local -r tfplan=$(mktemp) + EXPECT_SUCCEED helpers::terraform_plan \ + "$(a3-mega::terraform::network::src_dir)" \ + "${tfvars}" \ + "${tfplan}" + + local -r tfshow=$(mktemp) + helpers::terraform_show "$(a3-mega::terraform::network::src_dir)" "${tfplan}" >"${tfshow}" + + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::network::output_dir)/new_network.json" \ + "${tfshow}" +} diff --git a/test/pr/a3-mega/terraform/modules/common/resource_policy/input/simple.tfvars b/test/pr/a3-mega/terraform/modules/common/resource_policy/input/simple.tfvars new file mode 100644 index 000000000..026368f99 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/resource_policy/input/simple.tfvars 
@@ -0,0 +1,2 @@ +region = "us-central1" +new_resource_policy_name = "test-policy" diff --git a/test/pr/a3-mega/terraform/modules/common/resource_policy/output/resources.json b/test/pr/a3-mega/terraform/modules/common/resource_policy/output/resources.json new file mode 100644 index 000000000..cba1cc9e9 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/resource_policy/output/resources.json @@ -0,0 +1,21 @@ +{ + "planned_values": { + "root_module": { + "resources": [ + { + "address": "google_compute_resource_policy.new_placement_policy[0]", + "values": { + "group_placement_policy": [ + { + "collocation": "COLLOCATED", + "max_distance": 2, + "vm_count": null + } + ], + "name": "test-policy" + } + } + ] + } + } +} diff --git a/test/pr/a3-mega/terraform/modules/common/resource_policy/tests.sh b/test/pr/a3-mega/terraform/modules/common/resource_policy/tests.sh new file mode 100644 index 000000000..4584af9d6 --- /dev/null +++ b/test/pr/a3-mega/terraform/modules/common/resource_policy/tests.sh @@ -0,0 +1,34 @@ +. 
./test/helpers.sh + +a3-mega::terraform::resource_policy::src_dir () { + echo "${PWD}/a3-mega/terraform/modules/common/resource_policy" +} + +a3-mega::terraform::resource_policy::input_dir () { + echo "${PWD}/test/pr/a3-mega/terraform/modules/common/resource_policy/input" +} + +a3-mega::terraform::resource_policy::output_dir () { + echo "${PWD}/test/pr/a3-mega/terraform/modules/common/resource_policy/output" +} + +test::a3-mega::terraform::resource_policy () { + EXPECT_SUCCEED helpers::terraform_init "$(a3-mega::terraform::resource_policy::src_dir)" +} + +test::a3-mega::terraform::resource_policy::simple_create_resource () { + local -r tfvars=$(mktemp) + helpers::append_tfvars "$(a3-mega::terraform::resource_policy::input_dir)/simple.tfvars" mig >"${tfvars}" + + local -r tfplan=$(mktemp) + EXPECT_SUCCEED helpers::terraform_plan \ + "$(a3-mega::terraform::resource_policy::src_dir)" \ + "${tfvars}" \ + "${tfplan}" + + local -r tfshow=$(mktemp) + helpers::terraform_show "$(a3-mega::terraform::resource_policy::src_dir)" "${tfplan}" >"${tfshow}" + EXPECT_SUCCEED helpers::json_contains \ + "$(a3-mega::terraform::resource_policy::output_dir)/resources.json" \ + "${tfshow}" +} diff --git a/test/pr/run.sh b/test/pr/run.sh index 51d01112d..af702d854 100755 --- a/test/pr/run.sh +++ b/test/pr/run.sh @@ -11,6 +11,14 @@ . ./test/pr/a3/terraform/modules/common/instance_group_manager/tests.sh . ./test/pr/a3/terraform/modules/common/network/tests.sh . ./test/pr/a3/terraform/modules/common/resource_policy/tests.sh +. ./test/pr/a3-mega/terraform/modules/cluster/mig/tests.sh +. ./test/pr/a3-mega/terraform/modules/cluster/mig-cos/tests.sh +. ./test/pr/a3-mega/terraform/modules/cluster/gke/tests.sh +. ./test/pr/a3-mega/terraform/modules/common/dashboard/tests.sh +. ./test/pr/a3-mega/terraform/modules/common/instance_template/tests.sh +. ./test/pr/a3-mega/terraform/modules/common/instance_group_manager/tests.sh +. ./test/pr/a3-mega/terraform/modules/common/network/tests.sh +. 
./test/pr/a3-mega/terraform/modules/common/resource_policy/tests.sh . ./test/pr/a2/terraform/modules/cluster/mig/tests.sh . ./test/pr/a2/terraform/modules/common/dashboard/tests.sh . ./test/pr/a2/terraform/modules/common/instance_template/tests.sh From 8e1dd67943816f102b607989e2f992028e329b12 Mon Sep 17 00:00:00 2001 From: Christopher Pirillo Date: Wed, 6 Mar 2024 15:44:33 -0800 Subject: [PATCH 2/6] Fix test --- a3-mega/terraform/modules/cluster/gke/main.tf | 5 ++--- a3/terraform/modules/cluster/gke/main.tf | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/a3-mega/terraform/modules/cluster/gke/main.tf b/a3-mega/terraform/modules/cluster/gke/main.tf index 8bf307abe..196824bf5 100644 --- a/a3-mega/terraform/modules/cluster/gke/main.tf +++ b/a3-mega/terraform/modules/cluster/gke/main.tf @@ -76,13 +76,12 @@ resource "google_container_cluster" "cluster" { # We need to explicitly manage the node pool to enable features such as # auto-upgrade and auto-scaling, but we can't create a cluster with no node - # pool defined. So we create the smallest possible default node pool and + # pool defined. So we create the smallest possible default node pool and # immediately delete it. This is a best-practice suggested in the Terraform # documentation for the container_cluster resource. 
- remove_default_node_pool = true + remove_default_node_pool = false initial_node_count = 1 min_master_version = local.gke_master_version - deletion_protection = false network = module.network.network_self_links[0] subnetwork = module.network.subnetwork_self_links[0] diff --git a/a3/terraform/modules/cluster/gke/main.tf b/a3/terraform/modules/cluster/gke/main.tf index 0ac888ab1..b9af9ac12 100644 --- a/a3/terraform/modules/cluster/gke/main.tf +++ b/a3/terraform/modules/cluster/gke/main.tf @@ -82,7 +82,6 @@ resource "google_container_cluster" "cluster" { remove_default_node_pool = true initial_node_count = 1 min_master_version = local.gke_master_version - deletion_protection = false network = module.network.network_self_links[0] subnetwork = module.network.subnetwork_self_links[0] From bc23a5d96d2106eb0a27c6091b547b8915d73930 Mon Sep 17 00:00:00 2001 From: Christopher Pirillo Date: Wed, 6 Mar 2024 15:46:54 -0800 Subject: [PATCH 3/6] Remove remove_default_node_pool setting --- a3-mega/terraform/modules/cluster/gke/main.tf | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/a3-mega/terraform/modules/cluster/gke/main.tf b/a3-mega/terraform/modules/cluster/gke/main.tf index 196824bf5..9527c65d0 100644 --- a/a3-mega/terraform/modules/cluster/gke/main.tf +++ b/a3-mega/terraform/modules/cluster/gke/main.tf @@ -79,9 +79,8 @@ resource "google_container_cluster" "cluster" { # pool defined. So we create the smallest possible default node pool and # immediately delete it. This is a best-practice suggested in the Terraform # documentation for the container_cluster resource. 
- remove_default_node_pool = false - initial_node_count = 1 - min_master_version = local.gke_master_version + initial_node_count = 1 + min_master_version = local.gke_master_version network = module.network.network_self_links[0] subnetwork = module.network.subnetwork_self_links[0] From f18aaaac738c88b87784e80020a7ac5fb8009029 Mon Sep 17 00:00:00 2001 From: Christopher Pirillo Date: Thu, 7 Mar 2024 13:53:14 -0800 Subject: [PATCH 4/6] Add a3-mega to Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 79082c5ba..9ac98a3e7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,7 +16,7 @@ RUN curl -s "https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/terra && mv ./terraform /root/.local/bin/terraform COPY ./a3/terraform ./a3/terraform COPY ./a2/terraform ./a2/terraform - +COPY ./a3-mega/terraform ./a3-mega/terraform FROM base as test COPY test ./test From 039d67a7b1aa8edfe48b28d13423564df53f493a Mon Sep 17 00:00:00 2001 From: Christopher Pirillo Date: Tue, 19 Mar 2024 10:14:10 -0700 Subject: [PATCH 5/6] Run terraform init for a3-mega cluster modules in Dockerfile deploy stage --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 9ac98a3e7..2fade7bc9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,6 +32,8 @@ ENTRYPOINT ["./test/continuous/run.sh"] FROM base as deploy +RUN for cluster in gke mig mig-cos; do \ + terraform -chdir="./a3-mega/terraform/modules/cluster/${cluster}" init; done RUN for cluster in gke gke-beta mig mig-cos slurm; do \ terraform -chdir="./a3/terraform/modules/cluster/${cluster}" init; done RUN for cluster in mig; do \ From b80910d4584bc5f0e7fd5557044e56e3808437ef Mon Sep 17 00:00:00 2001 From: Sam Ho Date: Thu, 2 May 2024 17:56:38 +0000 Subject: [PATCH 6/6] Update NCCL link and rename a3-mega GKE in terraform module (#370) --- README.md | 2 +- a3-mega/examples/gke/main.tf | 4 +++- a3-mega/terraform/modules/cluster/gke/main.tf | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) 
diff --git a/README.md b/README.md index 93798b470..a31c33dc7 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,7 @@ the same as any other terraform: # assuming the directory containing main.tf is the current working directory # create/update the cluster -terraform init && terraform validate && terraform apply +terraform init && terraform validate && terraform apply -var-file="terraform.tfvars" # destroy the cluster terraform init && terraform validate && terraform apply -destroy diff --git a/a3-mega/examples/gke/main.tf b/a3-mega/examples/gke/main.tf index 9037ea4d3..da098699a 100644 --- a/a3-mega/examples/gke/main.tf +++ b/a3-mega/examples/gke/main.tf @@ -1,11 +1,13 @@ variable "node_pools" {} variable "project_id" {} variable "resource_prefix" {} +variable "region" {} -module "a3-gke" { +module "a3-mega-gke" { source = "github.com/GoogleCloudPlatform/ai-infra-cluster-provisioning//a3-mega/terraform/modules/cluster/gke" node_pools = var.node_pools project_id = var.project_id resource_prefix = var.resource_prefix + region = var.region } diff --git a/a3-mega/terraform/modules/cluster/gke/main.tf b/a3-mega/terraform/modules/cluster/gke/main.tf index 9527c65d0..03d2abdb6 100644 --- a/a3-mega/terraform/modules/cluster/gke/main.tf +++ b/a3-mega/terraform/modules/cluster/gke/main.tf @@ -309,7 +309,7 @@ module "kubectl-apply" { daemonsets = { device_plugin = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/cmd/nvidia_gpu/device-plugin.yaml" nvidia_driver = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml" - nccl_plugin = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-fastrak/nccl-fastrak-installer.yaml" # TODO dead link + nccl_plugin = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml" } 
enable = var.ksa != null ksa = var.ksa