Skip to content

Commit

Permalink
Merge branch 'release-candidate' into health_out
Browse files Browse the repository at this point in the history
  • Loading branch information
mr0re1 authored Nov 15, 2024
2 parents 0036d66 + f62f2bc commit 684c035
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,6 @@ limitations under the License.
| <a name="module_daos_network_storage_scripts"></a> [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | ../../../../modules/scripts/startup-script | n/a |
| <a name="module_nodeset_cleanup"></a> [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a |
| <a name="module_nodeset_cleanup_tpu"></a> [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a |
| <a name="module_slurm_controller_instance"></a> [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.5 |
| <a name="module_slurm_controller_template"></a> [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.5 |
| <a name="module_slurm_files"></a> [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a |
| <a name="module_slurm_login_instance"></a> [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.5 |
Expand All @@ -250,6 +249,7 @@ limitations under the License.

| Name | Type |
|------|------|
| [google_compute_instance_from_template.controller](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_from_template) | resource |
| [google_secret_manager_secret.cloudsql](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret) | resource |
| [google_secret_manager_secret_iam_member.cloudsql_secret_accessor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource |
| [google_secret_manager_secret_version.cloudsql_version](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_version) | resource |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ module "slurm_controller_template" {
metadata = local.metadata
min_cpu_platform = var.min_cpu_platform

# network_ip = TODO: add support for network_ip
on_host_maintenance = var.on_host_maintenance
preemptible = var.preemptible
service_account = local.service_account
Expand All @@ -82,40 +81,38 @@ module "slurm_controller_template" {
source_image_project = local.source_image_project_normalized # requires source_image_logic.tf
source_image = local.source_image # requires source_image_logic.tf

# spot = TODO: add support for spot (?)
subnetwork = var.subnetwork_self_link

tags = concat([local.slurm_cluster_name], var.tags)
# termination_action = TODO: add support for termination_action (?)
}

# INSTANCE
locals {
# TODO: add support for proper access_config
access_config = {
nat_ip = null
network_tier = null
# Slurm controller VM, instantiated directly from the controller instance
# template instead of through the upstream slurm-gcp wrapper module.
resource "google_compute_instance_from_template" "controller" {
  name                     = "${local.slurm_cluster_name}-controller"
  project                  = var.project_id
  zone                     = var.zone
  source_instance_template = module.slurm_controller_template.self_link

  # Permit in-place updates that require a VM stop/start cycle
  # (e.g. machine type or metadata changes) instead of forcing replacement.
  allow_stopping_for_update = true

  # Can't rely on template to specify nics due to usage of static_ip
  network_interface {
    # Attach an ephemeral external IP only when public controller access is
    # requested; an empty for_each omits the access_config entirely.
    dynamic "access_config" {
      for_each = var.enable_controller_public_ips ? ["unit"] : []
      content {
        # null lets GCP pick an ephemeral NAT IP and the default network tier.
        nat_ip       = null
        network_tier = null
      }
    }
    # Use the first user-supplied static internal IP if any were given;
    # "" lets GCP allocate an internal address automatically.
    network_ip = length(var.static_ips) == 0 ? "" : var.static_ips[0]
    subnetwork = var.subnetwork_self_link
  }
}

module "slurm_controller_instance" {
source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.5"

access_config = var.enable_controller_public_ips ? [local.access_config] : []
add_hostname_suffix = false
hostname = "${local.slurm_cluster_name}-controller"
instance_template = module.slurm_controller_template.self_link

project_id = var.project_id
region = var.region
slurm_cluster_name = local.slurm_cluster_name
slurm_instance_role = "controller"
static_ips = var.static_ips
subnetwork = var.subnetwork_self_link
zone = var.zone
metadata = var.metadata

labels = local.labels
# State migration: adopt the controller VM previously managed inside the
# slurm-gcp "_slurm_instance" wrapper module under its new top-level resource
# address, so removing that module does not destroy/recreate the instance.
moved {
  from = module.slurm_controller_instance.google_compute_instance_from_template.slurm_instance[0]
  to   = google_compute_instance_from_template.controller
}

# SECRETS: CLOUDSQL
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,5 +78,5 @@ module "slurm_login_instance" {
zone = each.value.zone

# trigger replacement of login nodes when the controller instance is replaced
replace_trigger = module.slurm_controller_instance.instances_self_links[0]
replace_trigger = google_compute_instance_from_template.controller.self_link
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ output "slurm_cluster_name" {

output "slurm_controller_instance" {
description = "Compute instance of controller node"
value = module.slurm_controller_instance.slurm_instances[0]
value = google_compute_instance_from_template.controller
}

output "slurm_login_instances" {
Expand All @@ -36,6 +36,6 @@ output "instructions" {
description = "Post deployment instructions."
value = <<-EOT
To SSH to the controller (may need to add '--tunnel-through-iap'):
gcloud compute ssh ${module.slurm_controller_instance.instances_self_links[0]}
gcloud compute ssh ${google_compute_instance_from_template.controller.self_link}
EOT
}
3 changes: 2 additions & 1 deletion modules/compute/gke-node-pool/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ resource "google_container_node_pool" "node_pool" {
gpu_partition_size = try(ga.value.gpu_partition_size, null)

dynamic "gpu_driver_installation_config" {
for_each = try([ga.gpu_driver_installation_config], [{ gpu_driver_version = "DEFAULT" }])
# If the user did not specify guest_accelerator settings, try() falls back to the default driver-version config
for_each = try([ga.value.gpu_driver_installation_config], [{ gpu_driver_version = "DEFAULT" }])
iterator = gdic
content {
gpu_driver_version = gdic.value.gpu_driver_version
Expand Down

0 comments on commit 684c035

Please sign in to comment.