diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index c7068a4522..fe1d49c4d8 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -211,6 +211,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| | [execute\_point\_instance\_template](#module\_execute\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 12.1 | +| [gpu](#module\_gpu) | ../../../../modules/internal/gpu-definition | n/a | | [mig](#module\_mig) | terraform-google-modules/vm/google//modules/mig | ~> 12.1 | | [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 0d8171092a..fb875f01e4 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -19,7 +19,16 @@ locals { labels = merge(var.labels, { ghpc_module = "htcondor-execute-point", ghpc_role = "compute" }) } +module "gpu" { + source = "../../../../modules/internal/gpu-definition" + + machine_type = var.machine_type + guest_accelerator = var.guest_accelerator +} + locals { + guest_accelerator = module.gpu.guest_accelerator + zones = coalescelist(var.zones, data.google_compute_zones.available.names) network_storage_metadata = var.network_storage == null ? 
{} : { network_storage = jsonencode(var.network_storage) } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index 50f0cbc6e0..755ded9f61 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -74,6 +74,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Source | Version | |------|--------|---------| +| [gpu](#module\_gpu) | ../../../../modules/internal/gpu-definition | n/a | | [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | ../../internal/slurm-gcp/instance_template | n/a | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf deleted file mode 100644 index 1c84a92721..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
-*/ - -## Required variables: -# guest_accelerator -# machine_type - -locals { - # example state; terraform will ignore diffs if last element of URL matches - # guest_accelerator = [ - # { - # count = 1 - # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" - # }, - # ] - accelerator_machines = { - "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, - "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, - "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, - "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, - "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, - "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, - "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, - "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, - "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, - "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, - "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, - "a3-ultragpu-8g" = { type = "nvidia-h200-141gb", count = 8 }, - "g2-standard-4" = { type = "nvidia-l4", count = 1 }, - "g2-standard-8" = { type = "nvidia-l4", count = 1 }, - "g2-standard-12" = { type = "nvidia-l4", count = 1 }, - "g2-standard-16" = { type = "nvidia-l4", count = 1 }, - "g2-standard-24" = { type = "nvidia-l4", count = 2 }, - "g2-standard-32" = { type = "nvidia-l4", count = 1 }, - "g2-standard-48" = { type = "nvidia-l4", count = 4 }, - "g2-standard-96" = { type = "nvidia-l4", count = 8 }, - } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - - # Select in priority order: - # (1) var.guest_accelerator if not empty - # (2) local.generated_guest_accelerator if not empty - # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) -} diff --git 
a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index a528978760..c3235c0229 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -17,7 +17,16 @@ locals { labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v6-nodeset-dynamic", ghpc_role = "compute" }) } +module "gpu" { + source = "../../../../modules/internal/gpu-definition" + + machine_type = var.machine_type + guest_accelerator = var.guest_accelerator +} + locals { + guest_accelerator = module.gpu.guest_accelerator + nodeset_name = substr(replace(var.name, "/[^a-z0-9]/", ""), 0, 14) feature = coalesce(var.feature, local.nodeset_name) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 297c40bb7a..ce82c34172 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -142,7 +142,9 @@ modules. For support with the underlying modules, see the instructions in the ## Modules -No modules. +| Name | Source | Version | +|------|--------|---------| +| [gpu](#module\_gpu) | ../../../../modules/internal/gpu-definition | n/a | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf deleted file mode 100644 index 1c84a92721..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -## Required variables: -# guest_accelerator -# machine_type - -locals { - # example state; terraform will ignore diffs if last element of URL matches - # guest_accelerator = [ - # { - # count = 1 - # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" - # }, - # ] - accelerator_machines = { - "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, - "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, - "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, - "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, - "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, - "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, - "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, - "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, - "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, - "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, - "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, - "a3-ultragpu-8g" = { type = "nvidia-h200-141gb", count = 8 }, - "g2-standard-4" = { type = "nvidia-l4", count = 1 }, - "g2-standard-8" = { type = "nvidia-l4", count = 1 }, - "g2-standard-12" = { type = "nvidia-l4", count = 1 }, - "g2-standard-16" = { type = "nvidia-l4", count = 1 }, - "g2-standard-24" = { type = "nvidia-l4", count = 2 }, - "g2-standard-32" = { type = "nvidia-l4", count = 1 }, - "g2-standard-48" = { type = "nvidia-l4", count = 4 }, - "g2-standard-96" = { type = "nvidia-l4", count = 8 }, - } - 
generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - - # Select in priority order: - # (1) var.guest_accelerator if not empty - # (2) local.generated_guest_accelerator if not empty - # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) -} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 84cb60457a..c0a99f99bb 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -17,7 +17,16 @@ locals { labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v6-nodeset", ghpc_role = "compute" }) } +module "gpu" { + source = "../../../../modules/internal/gpu-definition" + + machine_type = var.machine_type + guest_accelerator = var.guest_accelerator +} + locals { + guest_accelerator = module.gpu.guest_accelerator + disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } metadata = merge( diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index b03fbf0973..99078dbcce 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -259,6 +259,7 @@ limitations under the License. 
|------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 6.1 | | [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | ../../../../modules/scripts/startup-script | n/a | +| [gpu](#module\_gpu) | ../../../../modules/internal/gpu-definition | n/a | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | | [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | | [slurm\_controller\_template](#module\_slurm\_controller\_template) | ../../internal/slurm-gcp/instance_template | n/a | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index c98813a722..879509f693 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -12,6 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+module "gpu" { + source = "../../../../modules/internal/gpu-definition" + + machine_type = var.machine_type + guest_accelerator = var.guest_accelerator +} + locals { additional_disks = [ for ad in var.additional_disks : { @@ -67,7 +74,7 @@ module "slurm_controller_template" { enable_shielded_vm = var.enable_shielded_vm shielded_instance_config = var.shielded_instance_config - gpu = one(local.guest_accelerator) + gpu = one(module.gpu.guest_accelerator) machine_type = var.machine_type metadata = local.metadata diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf deleted file mode 100644 index 1c84a92721..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
-*/ - -## Required variables: -# guest_accelerator -# machine_type - -locals { - # example state; terraform will ignore diffs if last element of URL matches - # guest_accelerator = [ - # { - # count = 1 - # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" - # }, - # ] - accelerator_machines = { - "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, - "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, - "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, - "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, - "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, - "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, - "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, - "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, - "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, - "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, - "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, - "a3-ultragpu-8g" = { type = "nvidia-h200-141gb", count = 8 }, - "g2-standard-4" = { type = "nvidia-l4", count = 1 }, - "g2-standard-8" = { type = "nvidia-l4", count = 1 }, - "g2-standard-12" = { type = "nvidia-l4", count = 1 }, - "g2-standard-16" = { type = "nvidia-l4", count = 1 }, - "g2-standard-24" = { type = "nvidia-l4", count = 2 }, - "g2-standard-32" = { type = "nvidia-l4", count = 1 }, - "g2-standard-48" = { type = "nvidia-l4", count = 4 }, - "g2-standard-96" = { type = "nvidia-l4", count = 8 }, - } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - - # Select in priority order: - # (1) var.guest_accelerator if not empty - # (2) local.generated_guest_accelerator if not empty - # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) -} diff --git 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index 7160fbdd02..023f4d161b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -71,7 +71,9 @@ modules. For support with the underlying modules, see the instructions in the ## Modules -No modules. +| Name | Source | Version | +|------|--------|---------| +| [gpu](#module\_gpu) | ../../../../modules/internal/gpu-definition | n/a | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf deleted file mode 100644 index 1c84a92721..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
-*/ - -## Required variables: -# guest_accelerator -# machine_type - -locals { - # example state; terraform will ignore diffs if last element of URL matches - # guest_accelerator = [ - # { - # count = 1 - # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" - # }, - # ] - accelerator_machines = { - "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, - "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, - "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, - "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, - "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, - "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, - "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, - "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, - "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, - "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, - "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, - "a3-ultragpu-8g" = { type = "nvidia-h200-141gb", count = 8 }, - "g2-standard-4" = { type = "nvidia-l4", count = 1 }, - "g2-standard-8" = { type = "nvidia-l4", count = 1 }, - "g2-standard-12" = { type = "nvidia-l4", count = 1 }, - "g2-standard-16" = { type = "nvidia-l4", count = 1 }, - "g2-standard-24" = { type = "nvidia-l4", count = 2 }, - "g2-standard-32" = { type = "nvidia-l4", count = 1 }, - "g2-standard-48" = { type = "nvidia-l4", count = 4 }, - "g2-standard-96" = { type = "nvidia-l4", count = 8 }, - } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - - # Select in priority order: - # (1) var.guest_accelerator if not empty - # (2) local.generated_guest_accelerator if not empty - # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) -} diff --git 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf index 1632116209..6568996e75 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf @@ -17,7 +17,16 @@ locals { labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v6-login", ghpc_role = "scheduler" }) } +module "gpu" { + source = "../../../../modules/internal/gpu-definition" + + machine_type = var.machine_type + guest_accelerator = var.guest_accelerator +} + locals { + guest_accelerator = module.gpu.guest_accelerator + disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } metadata = merge( diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index d2715ff652..a1fcaa8f01 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -295,6 +295,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| +| [gpu](#module\_gpu) | ../../internal/gpu-definition | n/a | | [kubectl\_apply](#module\_kubectl\_apply) | ../../management/kubectl-apply | n/a | ## Resources diff --git a/modules/compute/gke-node-pool/gpu_definition.tf b/modules/compute/gke-node-pool/gpu_definition.tf deleted file mode 100644 index 1c84a92721..0000000000 --- a/modules/compute/gke-node-pool/gpu_definition.tf +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -## Required variables: -# guest_accelerator -# machine_type - -locals { - # example state; terraform will ignore diffs if last element of URL matches - # guest_accelerator = [ - # { - # count = 1 - # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" - # }, - # ] - accelerator_machines = { - "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, - "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, - "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, - "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, - "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, - "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, - "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, - "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, - "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, - "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, - "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, - "a3-ultragpu-8g" = { type = "nvidia-h200-141gb", count = 8 }, - "g2-standard-4" = { type = "nvidia-l4", count = 1 }, - "g2-standard-8" = { type = "nvidia-l4", count = 1 }, - "g2-standard-12" = { type = "nvidia-l4", count = 1 }, - "g2-standard-16" = { type = "nvidia-l4", count = 1 }, - "g2-standard-24" = { type = "nvidia-l4", count = 2 }, - "g2-standard-32" = { type = "nvidia-l4", count = 1 }, - "g2-standard-48" = { type = "nvidia-l4", count = 4 }, - "g2-standard-96" = { type = "nvidia-l4", count = 8 }, - } - 
generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - - # Select in priority order: - # (1) var.guest_accelerator if not empty - # (2) local.generated_guest_accelerator if not empty - # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) -} diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index f1999cbd0b..c91c791393 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -27,7 +27,16 @@ locals { } } +module "gpu" { + source = "../../internal/gpu-definition" + + machine_type = var.machine_type + guest_accelerator = var.guest_accelerator +} + locals { + guest_accelerator = module.gpu.guest_accelerator + has_gpu = length(local.guest_accelerator) > 0 allocatable_gpu_per_node = local.has_gpu ? max(local.guest_accelerator[*].count...) : -1 gpu_taint = local.has_gpu ? [{ diff --git a/modules/compute/gke-node-pool/reservation_definitions.tf b/modules/compute/gke-node-pool/reservation_definitions.tf index 37b92a2f1a..cb24e4204c 100644 --- a/modules/compute/gke-node-pool/reservation_definitions.tf +++ b/modules/compute/gke-node-pool/reservation_definitions.tf @@ -48,6 +48,7 @@ data "google_compute_reservation" "specific_reservations" { } locals { + generated_guest_accelerator = module.gpu.machine_type_guest_accelerator reservation_resource_api_label = "compute.googleapis.com/reservation-name" input_specific_reservations_count = try(length(var.reservation_affinity.specific_reservations), 0) diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 8fe80e1cdc..e75b70865d 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -185,6 +185,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| +| [gpu](#module\_gpu) | ../../internal/gpu-definition | n/a | | [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | ../../scripts/startup-script | n/a | ## Resources diff --git a/modules/compute/vm-instance/gpu_definition.tf b/modules/compute/vm-instance/gpu_definition.tf deleted file mode 100644 index 1c84a92721..0000000000 --- a/modules/compute/vm-instance/gpu_definition.tf +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
-*/ - -## Required variables: -# guest_accelerator -# machine_type - -locals { - # example state; terraform will ignore diffs if last element of URL matches - # guest_accelerator = [ - # { - # count = 1 - # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" - # }, - # ] - accelerator_machines = { - "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, - "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, - "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, - "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, - "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, - "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, - "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, - "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, - "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, - "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, - "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, - "a3-ultragpu-8g" = { type = "nvidia-h200-141gb", count = 8 }, - "g2-standard-4" = { type = "nvidia-l4", count = 1 }, - "g2-standard-8" = { type = "nvidia-l4", count = 1 }, - "g2-standard-12" = { type = "nvidia-l4", count = 1 }, - "g2-standard-16" = { type = "nvidia-l4", count = 1 }, - "g2-standard-24" = { type = "nvidia-l4", count = 2 }, - "g2-standard-32" = { type = "nvidia-l4", count = 1 }, - "g2-standard-48" = { type = "nvidia-l4", count = 4 }, - "g2-standard-96" = { type = "nvidia-l4", count = 8 }, - } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - - # Select in priority order: - # (1) var.guest_accelerator if not empty - # (2) local.generated_guest_accelerator if not empty - # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) -} diff --git a/modules/compute/vm-instance/main.tf 
b/modules/compute/vm-instance/main.tf index dcb43fe91a..9b74632678 100644 --- a/modules/compute/vm-instance/main.tf +++ b/modules/compute/vm-instance/main.tf @@ -19,7 +19,16 @@ locals { labels = merge(var.labels, { ghpc_module = "vm-instance", ghpc_role = "compute" }) } +module "gpu" { + source = "../../internal/gpu-definition" + + machine_type = var.machine_type + guest_accelerator = var.guest_accelerator +} + locals { + guest_accelerator = module.gpu.guest_accelerator + native_fstype = [] startup_script = local.startup_from_network_storage != null ? ( { startup-script = local.startup_from_network_storage }) : {} diff --git a/modules/internal/gpu-definition/README.md b/modules/internal/gpu-definition/README.md new file mode 100644 index 0000000000..29a87cab78 --- /dev/null +++ b/modules/internal/gpu-definition/README.md @@ -0,0 +1,47 @@ + +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | + +## Providers + +No providers. + +## Modules + +No modules. + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
`list(object({ type = string, count = number }))` | `[]` | no | +| [machine\_type](#input\_machine\_type) | Machine type to use for the instance creation | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [guest\_accelerator](#output\_guest\_accelerator) | Sanitized list of the type and count of accelerator cards attached to the instance. | +| [machine\_type\_guest\_accelerator](#output\_machine\_type\_guest\_accelerator) | List of the type and count of accelerator cards attached to the specified machine type. | + diff --git a/community/modules/compute/htcondor-execute-point/gpu_definition.tf b/modules/internal/gpu-definition/main.tf similarity index 75% rename from community/modules/compute/htcondor-execute-point/gpu_definition.tf rename to modules/internal/gpu-definition/main.tf index 1c84a92721..bc66442e5e 100644 --- a/community/modules/compute/htcondor-execute-point/gpu_definition.tf +++ b/modules/internal/gpu-definition/main.tf @@ -1,5 +1,5 @@ /** - * Copyright 2023 Google LLC + * Copyright 2024 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,20 @@ * limitations under the License. */ -## Required variables: -# guest_accelerator -# machine_type +variable "machine_type" { + description = "Machine type to use for the instance creation" + type = string +} + +variable "guest_accelerator" { + description = "List of the type and count of accelerator cards attached to the instance."
+ type = list(object({ + type = string, + count = number + })) + default = [] + nullable = false +} locals { # example state; terraform will ignore diffs if last element of URL matches @@ -56,3 +67,17 @@ locals { # (3) default to empty list if both are empty guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } + +output "guest_accelerator" { + description = "Sanitized list of the type and count of accelerator cards attached to the instance." + value = local.guest_accelerator +} + +output "machine_type_guest_accelerator" { + description = "List of the type and count of accelerator cards attached to the specified machine type." + value = local.generated_guest_accelerator +} + +terraform { + required_version = ">= 1.3" +} diff --git a/tools/duplicate-diff.py b/tools/duplicate-diff.py index 703f00ff95..5a7c83ac1e 100644 --- a/tools/duplicate-diff.py +++ b/tools/duplicate-diff.py @@ -36,16 +36,9 @@ "modules/compute/vm-instance/startup_from_network_storage.tf", ], [ - "modules/compute/vm-instance/gpu_definition.tf", - "community/modules/compute/htcondor-execute-point/gpu_definition.tf", "community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf", "community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf", "community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf", - "community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf", - "community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf", - "community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf", - "community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf", - "modules/compute/gke-node-pool/gpu_definition.tf", ], [ "modules/compute/gke-node-pool/threads_per_core_calc.tf",
type = string,
count = number
}))