From 46dac6f226d8b1018765ce4a0363ea9f18ce9763 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 12 Sep 2022 15:22:19 -0500 Subject: [PATCH 01/51] Address permadiff in vm-instance module When used to provision A2 machine types, using the empty list as the default value of guest_accelerator causes a permadiff because the Compute Engine API reports the A2's A100 GPU card and we have chosen an explicit value. If, instead, we use a null value for guest_accelerator, Terraform treats the difference as not requiring action. --- modules/compute/vm-instance/README.md | 2 +- modules/compute/vm-instance/main.tf | 2 +- modules/compute/vm-instance/variables.tf | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index a3dc343698..34b390d1f9 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -140,7 +140,7 @@ No modules. | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for instances. | `number` | `200` | no | | [disk\_type](#input\_disk\_type) | Disk type for instances. | `string` | `"pd-standard"` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `null` | no | | [instance\_count](#input\_instance\_count) | Number of instances | `number` | `1` | no | | [instance\_image](#input\_instance\_image) | Instance Image |
object({
family = string,
project = string
})
|
{
"family": "hpc-centos-7",
"project": "cloud-hpc-image-public"
}
| no | | [labels](#input\_labels) | Labels to add to the instances. List key, value pairs. | `any` | n/a | yes | diff --git a/modules/compute/vm-instance/main.tf b/modules/compute/vm-instance/main.tf index 8dc362db09..213b453e5c 100644 --- a/modules/compute/vm-instance/main.tf +++ b/modules/compute/vm-instance/main.tf @@ -31,7 +31,7 @@ locals { # compact_placement : true when placement policy is provided and collocation set; false if unset compact_placement = try(var.placement_policy.collocation, null) != null - gpu_attached = contains(["a2"], local.machine_family) || length(var.guest_accelerator) > 0 + gpu_attached = contains(["a2"], local.machine_family) || var.guest_accelerator != null # both of these must be false if either compact placement or preemptible/spot instances are used # automatic restart is tolerant of GPUs while on host maintenance is not diff --git a/modules/compute/vm-instance/variables.tf b/modules/compute/vm-instance/variables.tf index 2cbdceaf10..d5e5126e2c 100644 --- a/modules/compute/vm-instance/variables.tf +++ b/modules/compute/vm-instance/variables.tf @@ -153,7 +153,7 @@ variable "guest_accelerator" { type = string, count = number })) - default = [] + default = null } variable "on_host_maintenance" { From e8848c1e5653e37bbd63a4201734ed95f4e266db Mon Sep 17 00:00:00 2001 From: Carlos Boneti Date: Fri, 9 Sep 2022 18:32:26 -0700 Subject: [PATCH 02/51] Exposing enable_reconfigure in Slurm-onGCP V5 --- .../schedmd-slurm-gcp-v5-controller/README.md | 1 + .../schedmd-slurm-gcp-v5-controller/main.tf | 1 + .../schedmd-slurm-gcp-v5-controller/variables.tf | 13 +++++++++++++ 3 files changed, 15 insertions(+) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index 56a4830e3d..82f86e7a55 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -122,6 +122,7 @@ No resources. | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_devel](#input\_enable\_devel) | Enables development mode. Not for production use. | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | +| [enable\_reconfigure](#input\_enable\_reconfigure) | Enables automatic Slurm reconfiguration when Slurm configuration changes (e.g.
slurm.conf.tpl, partition details). Compute instances and resource policies
(e.g. placement groups) will be destroyed to align with new configuration.
NOTE: Requires Python and Google Pub/Sub API.
*WARNING*: Toggling this will impact the running workload. Deployed compute nodes
will be destroyed and their jobs will be requeued. | `bool` | `false` | no | | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [gpu](#input\_gpu) | GPU information. Type and count of GPU to attach to the instance template. See
https://cloud.google.com/compute/docs/gpus for more details.
type : the GPU type
count : number of GPUs |
object({
type = string
count = number
})
| `null` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf index 52b14457ed..5d5a08f9a2 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf @@ -54,6 +54,7 @@ module "slurm_controller_instance" { enable_devel = var.enable_devel enable_cleanup_compute = var.enable_cleanup_compute enable_cleanup_subscriptions = var.enable_cleanup_subscriptions + enable_reconfigure = var.enable_reconfigure enable_bigquery_load = var.enable_bigquery_load epilog_scripts = var.epilog_scripts disable_default_mounts = var.disable_default_mounts diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf index 1c1fe0525a..83842ba18b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf @@ -188,6 +188,19 @@ variable "enable_cleanup_subscriptions" { default = false } +variable "enable_reconfigure" { + description = < Date: Thu, 8 Sep 2022 17:27:36 -0300 Subject: [PATCH 03/51] Add customized version output --- Makefile | 5 ++++- cmd/root.go | 25 +++++++++++++++++++++++-- ghpc.go | 8 ++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 59c4b9b3b6..499c437de2 100644 --- a/Makefile +++ b/Makefile @@ -13,12 +13,15 @@ MIN_GOLANG_VERSION=1.16 # for building ghpc ENG = ./cmd/... ./pkg/... TERRAFORM_FOLDERS=$(shell find ./modules ./community/modules ./tools -type f -name "*.tf" -not -path '*/\.*' -exec dirname "{}" \; | sort -u) PACKER_FOLDERS=$(shell find ./modules ./community/modules ./tools -type f -name "*.pkr.hcl" -not -path '*/\.*' -exec dirname "{}" \; | sort -u) +GIT_TAG_VERSION=$(shell git tag --points-at HEAD) +GIT_BRANCH=$(shell git branch --show-current) +GIT_COMMIT_INFO=$(shell git describe --tags --dirty --long) # RULES MEANT TO BE USED DIRECTLY ghpc: warn-go-version warn-terraform-version warn-packer-version $(shell find ./cmd ./pkg ghpc.go -type f) $(info **************** building ghpc ************************) - go build ghpc.go + @go build -ldflags="-X 'main.gitTagVersion=$(GIT_TAG_VERSION)' -X 'main.gitBranch=$(GIT_BRANCH)' -X 'main.gitCommitInfo=$(GIT_COMMIT_INFO)'" ghpc.go install-user: $(info ******** installing ghpc in ~/bin *********************) diff --git a/cmd/root.go b/cmd/root.go index ce19798792..a53aadcc9e 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -23,8 +23,16 @@ import ( "github.com/spf13/cobra" ) +// Git references when use Makefile var ( - rootCmd = &cobra.Command{ + GitTagVersion string + GitBranch string + GitCommitInfo string +) + +var ( + annotation = make(map[string]string) + rootCmd = &cobra.Command{ Use: "ghpc", Short: "A blueprint and deployment engine for HPC clusters in GCP.", Long: `gHPC provides a flexible and simple to use interface to accelerate @@ -34,12 +42,25 @@ HPC deployments on the Google Cloud Platform.`, log.Fatalf("cmd.Help function failed: %s", err) } }, - Version: "v1.4.2", + Version: "v1.4.2", + Annotations: annotation, } ) // Execute the root command func Execute() error { + if len(GitBranch) > 0 { + if len(GitTagVersion) == 0 { + GitTagVersion = "- not built from oficial release" + } + annotation["version"] = GitTagVersion + annotation["branch"] = GitBranch + annotation["commitInfo"] = GitCommitInfo + 
rootCmd.SetVersionTemplate(`ghpc version {{index .Annotations "version"}}
Built from '{{index .Annotations "branch"}}' branch.
Commit info: {{index .Annotations "commitInfo"}}
`)
} return rootCmd.Execute() } diff --git a/ghpc.go b/ghpc.go index b4443c2642..2f676fe10d 100644 --- a/ghpc.go +++ b/ghpc.go @@ -25,8 +25,16 @@ import ( //go:embed modules community/modules var moduleFS embed.FS +// Git references set when building via the Makefile +var gitTagVersion string +var gitBranch string +var gitCommitInfo string + func main() { sourcereader.ModuleFS = moduleFS + cmd.GitTagVersion = gitTagVersion + cmd.GitBranch = gitBranch + cmd.GitCommitInfo = gitCommitInfo if err := cmd.Execute(); err != nil { os.Exit(1) } From 19c57c2ad0142f28a96465cbe826f250f2295891 Mon Sep 17 00:00:00 2001 From: Thiago Sgobe Date: Fri, 9 Sep 2022 15:11:17 -0300 Subject: [PATCH 04/51] handling detached HEAD scenarios --- cmd/root.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmd/root.go b/cmd/root.go index a53aadcc9e..177b304cd0 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -49,10 +49,13 @@ HPC deployments on the Google Cloud Platform.`, // Execute the root command func Execute() error { - if len(GitBranch) > 0 { + if len(GitCommitInfo) > 0 { if len(GitTagVersion) == 0 { GitTagVersion = "- not built from official release" } + if len(GitBranch) == 0 { + GitBranch = "detached HEAD" + } annotation["version"] = GitTagVersion annotation["branch"] = GitBranch annotation["commitInfo"] = GitCommitInfo From f33ec2625990d519cd90f0567e841978034b66d1 Mon Sep 17 00:00:00 2001 From: Thiago Sgobe Date: Fri, 9 Sep 2022 19:56:33 -0300 Subject: [PATCH 05/51] Add git checks into Makefile --- Makefile | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Makefile b/Makefile index 499c437de2..7842de65ee 100644 --- a/Makefile +++ b/Makefile @@ -13,9 +13,16 @@ MIN_GOLANG_VERSION=1.16 # for building ghpc ENG = ./cmd/... ./pkg/... TERRAFORM_FOLDERS=$(shell find ./modules ./community/modules ./tools -type f -name "*.tf" -not -path '*/\.*' -exec dirname "{}" \; | sort -u) PACKER_FOLDERS=$(shell find ./modules ./community/modules ./tools -type f -name "*.pkr.hcl" -not -path '*/\.*' -exec dirname "{}" \; | sort -u) + +ifneq (, $(shell which git)) +## GIT IS PRESENT +ifneq (,$(wildcard .git)) +## GIT DIRECTORY EXISTS GIT_TAG_VERSION=$(shell git tag --points-at HEAD) GIT_BRANCH=$(shell git branch --show-current) GIT_COMMIT_INFO=$(shell git describe --tags --dirty --long) +endif +endif # RULES MEANT TO BE USED DIRECTLY From 4fd22e3ec8eddf34d7a278ef75d4b20fc3821396 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Mon, 12 Sep 2022 15:18:46 -0700 Subject: [PATCH 06/51] Run shell runners as executable --- .../scripts/startup-script/templates/startup-script-custom.tpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/scripts/startup-script/templates/startup-script-custom.tpl b/modules/scripts/startup-script/templates/startup-script-custom.tpl index 8dad11cc3c..63c5996a79 100644 --- a/modules/scripts/startup-script/templates/startup-script-custom.tpl +++ b/modules/scripts/startup-script/templates/startup-script-custom.tpl @@ -39,7 +39,7 @@ stdlib::runner() { stdlib::info "=== start executing runner: $object ===" case "$1" in ansible-local) stdlib::run_playbook "$destpath/$filename" "$args";; - shell) . $destpath/$filename $args;; + shell) chmod u+x /$destpath/$filename && ./$destpath/$filename $args;; esac exit_code=$?
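Taken together, patches 03 through 05 stamp git metadata into the ghpc binary at build time: the Makefile gathers the tag, branch, and `git describe` output (guarded so builds still succeed when git is absent or the source is not a clone), and `-ldflags="-X ..."` writes those strings into package-level variables in `main`, which `cmd/root.go` then surfaces through Cobra's version template. A minimal, self-contained sketch of the same technique follows; the variable names and fallback strings are illustrative, not the toolkit's exact wiring:

```go
// sketch.go -- demonstrates build-time string injection via the Go linker.
// Build with, for example:
//   go build -ldflags="-X 'main.gitTagVersion=v1.4.2' -X 'main.gitBranch=main'" sketch.go
package main

import "fmt"

// -X can only set package-level string variables that are uninitialized or
// initialized to a constant, so the zero value doubles as the signal that
// the binary was not built through the Makefile.
var (
	gitTagVersion string
	gitBranch     string
)

func main() {
	if gitTagVersion == "" {
		gitTagVersion = "- not built from official release"
	}
	if gitBranch == "" {
		gitBranch = "detached HEAD" // same fallback idea as patch 04
	}
	fmt.Printf("ghpc version %s (branch %s)\n", gitTagVersion, gitBranch)
}
```

Because the injected variables default to the empty string, the runtime checks added in patches 03 and 04 can distinguish a Makefile build from a plain `go build`.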
From a329d75179337a87f8d8cbb450d0eae405c478a3 Mon Sep 17 00:00:00 2001 From: Carlos Boneti Date: Mon, 12 Sep 2022 18:38:42 -0700 Subject: [PATCH 07/51] Adding Slurm on GCP V4 static nodes functionality --- .../SchedMD-slurm-on-gcp-controller/README.md | 1 + .../SchedMD-slurm-on-gcp-controller/main.tf | 23 ++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md index ff3787ad52..6d2ac49cd9 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md @@ -76,6 +76,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| +| [slurm\_cluster\_compute\_node](#module\_slurm\_cluster\_compute\_node) | github.com/SchedMD/slurm-gcp//tf/modules/compute/ | v4.2.0 | | [slurm\_cluster\_controller](#module\_slurm\_cluster\_controller) | github.com/SchedMD/slurm-gcp//tf/modules/controller/ | v4.2.0 | ## Resources diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/main.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/main.tf index 8c2ae382a8..ff9dda67dd 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/main.tf +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/main.tf @@ -16,6 +16,7 @@ locals { controller_startup_script = var.controller_startup_script != null ? var.controller_startup_script : var.startup_script compute_startup_script = var.compute_startup_script != null ? var.compute_startup_script : var.startup_script + cluster_name = var.cluster_name != null ? var.cluster_name : "slurm-${var.deployment_name}" } data "google_compute_image" "compute_image" { @@ -29,7 +30,7 @@ module "slurm_cluster_controller" { boot_disk_type = var.boot_disk_type image = data.google_compute_image.compute_image.self_link instance_template = var.controller_instance_template - cluster_name = var.cluster_name != null ? 
var.cluster_name : "slurm-${var.deployment_name}" + cluster_name = local.cluster_name compute_node_scopes = var.compute_node_scopes compute_node_service_account = var.compute_node_service_account disable_compute_public_ips = var.disable_compute_public_ips @@ -58,3 +59,23 @@ module "slurm_cluster_controller" { intel_select_solution = var.intel_select_solution cloudsql = var.cloudsql } + +module "slurm_cluster_compute_node" { + source = "github.com/SchedMD/slurm-gcp//tf/modules/compute/?ref=v4.2.0" + project = var.project_id + cluster_name = local.cluster_name + region = var.region + zone = var.zone + controller_name = module.slurm_cluster_controller.controller_node_name + controller_secondary_disk = var.controller_secondary_disk + disable_compute_public_ips = var.disable_compute_public_ips + network_storage = var.network_storage + partitions = var.partition + compute_startup_script = local.compute_startup_script + scopes = var.compute_node_scopes + service_account = var.compute_node_service_account + shared_vpc_host_project = var.shared_vpc_host_project + subnetwork_name = var.subnetwork_name + intel_select_solution = var.intel_select_solution + munge_key = var.munge_key +} From 545517705e2e71728e91abfea5e94205568f0cbc Mon Sep 17 00:00:00 2001 From: Alex Heye Date: Tue, 13 Sep 2022 18:35:41 +0000 Subject: [PATCH 08/51] Set enable_smt default to false for slurm v5 modules --- .../modules/compute/schedmd-slurm-gcp-v5-partition/README.md | 2 +- .../modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md | 2 +- .../scheduler/schedmd-slurm-gcp-v5-controller/variables.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v5-login/README.md | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md index d977d584fd..bcd0dc8020 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md @@ -85,7 +85,7 @@ No resources. | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | -| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | +| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `true` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, local-ssd, or pd-standard. | `string` | `"pd-standard"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf index 03230e5c1f..c4dc33c8ec 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf @@ -231,7 +231,7 @@ variable "can_ip_forward" { variable "disable_smt" { type = bool description = "Disables Simultaneous Multi-Threading (SMT) on instance." - default = false + default = true } variable "labels" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index 82f86e7a55..7d939973e4 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -112,7 +112,7 @@ No resources. | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | | [disable\_controller\_public\_ips](#input\_disable\_controller\_public\_ips) | If set to false. The controller will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | | [disable\_default\_mounts](#input\_disable\_default\_mounts) | Disable default global network storage from the controller
* /usr/local/etc/slurm
* /etc/munge
* /home
* /apps
Warning: If these are disabled, the slurm etc and munge dirs must be added
manually, or some other mechanism must be used to synchronize the slurm conf
files and the munge key across the cluster. | `bool` | `false` | no | -| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | +| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `true` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, local-ssd, or pd-standard. | `string` | `"pd-ssd"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf index 83842ba18b..793a1c3573 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf @@ -127,7 +127,7 @@ variable "disable_default_mounts" { variable "disable_smt" { type = bool description = "Disables Simultaneous Multi-Threading (SMT) on instance." - default = false + default = true } variable "disk_type" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index e5d75fb538..cd58c795e5 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -89,7 +89,7 @@ No resources. | [controller\_instance\_id](#input\_controller\_instance\_id) | The server-assigned unique identifier of the controller instance, typically
supplied as an output of the controller module. | `string` | n/a | yes | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | | [disable\_login\_public\_ips](#input\_disable\_login\_public\_ips) | If set to false. The login will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | -| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | +| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `true` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, local-ssd, or pd-standard. | `string` | `"pd-standard"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf index 0f565de63b..249d717f48 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf @@ -31,7 +31,7 @@ variable "labels" { variable "disable_smt" { type = bool description = "Disables Simultaneous Multi-Threading (SMT) on instance." - default = false + default = true } variable "deployment_name" { From 47b4c7c20e129d156e71ac76e9a0222a5a00e205 Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Tue, 13 Sep 2022 14:08:45 -0700 Subject: [PATCH 09/51] Default scope now allows reading AND writing. Previously the scope for the VM only allowed it to read from GCS. Now both reads and writes are allowed.
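A blueprint that needs the previous read-only behavior can still override the default per module. A sketch of such an override (the module id and network name are hypothetical, and only the storage scope comes from this patch; the module's remaining default scopes are omitted here):

```yaml
  - id: restricted-vm              # hypothetical module id
    source: modules/compute/vm-instance
    use: [network1]                # assumes a VPC module with id network1
    settings:
      service_account:
        email: null                # keep the project's default compute service account
        scopes:                    # restore the pre-patch read-only storage scope
        - https://www.googleapis.com/auth/devstorage.read_only
        - https://www.googleapis.com/auth/logging.write
        - https://www.googleapis.com/auth/monitoring.write
        - https://www.googleapis.com/auth/servicecontrol
```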
--- modules/compute/vm-instance/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/compute/vm-instance/variables.tf b/modules/compute/vm-instance/variables.tf index 4a26d441db..83b97bed14 100644 --- a/modules/compute/vm-instance/variables.tf +++ b/modules/compute/vm-instance/variables.tf @@ -97,7 +97,7 @@ variable "service_account" { }) default = { email = null - scopes = ["https://www.googleapis.com/auth/devstorage.read_only", + scopes = ["https://www.googleapis.com/auth/devstorage.read_write", "https://www.googleapis.com/auth/logging.write", "https://www.googleapis.com/auth/monitoring.write", "https://www.googleapis.com/auth/servicecontrol", From 0dca67a315db6b7b2fa165accfeecaac5ddcad77 Mon Sep 17 00:00:00 2001 From: Karim Roukoz Date: Tue, 13 Sep 2022 22:04:47 +0000 Subject: [PATCH 10/51] remove "kind:" from examples and docs where optional --- community/examples/cloud-batch.yaml | 5 ----- .../examples/hpc-cluster-small-sharedvpc.yaml | 6 ------ community/examples/htcondor-pool.yaml | 10 ---------- community/examples/intel/daos-cluster.yaml | 3 --- community/examples/intel/daos-slurm.yaml | 8 -------- .../examples/intel/hpc-cluster-intel-select.yaml | 9 --------- community/examples/omnia-cluster.yaml | 8 -------- community/examples/slurm-gcp-v5-hpc-centos7.yaml | 6 ------ community/examples/slurm-gcp-v5-ubuntu2004.yaml | 6 ------ community/examples/spack-gromacs.yaml | 8 -------- .../SchedMD-slurm-on-gcp-partition/README.md | 1 - .../compute/htcondor-execute-point/README.md | 1 - .../schedmd-slurm-gcp-v5-partition/README.md | 1 - .../database/slurm-cloudsql-federation/README.md | 1 - community/modules/file-system/Intel-DAOS/README.md | 3 --- community/modules/file-system/nfs-server/README.md | 1 - community/modules/project/new-project/README.md | 1 - .../modules/project/service-account/README.md | 1 - .../modules/project/service-enablement/README.md | 1 - .../SchedMD-slurm-on-gcp-controller/README.md | 1 - .../SchedMD-slurm-on-gcp-login-node/README.md | 1 - .../modules/scheduler/cloud-batch-job/README.md | 6 ------ .../scheduler/cloud-batch-login-node/README.md | 2 -- .../modules/scheduler/htcondor-configure/README.md | 3 --- .../schedmd-slurm-gcp-v5-controller/README.md | 1 - .../schedmd-slurm-gcp-v5-hybrid/README.md | 1 - .../scheduler/schedmd-slurm-gcp-v5-login/README.md | 1 - .../modules/scripts/htcondor-install/README.md | 3 --- community/modules/scripts/spack-install/README.md | 3 --- .../modules/scripts/wait-for-startup/README.md | 1 - docs/tutorials/gromacs/spack-gromacs.yaml | 7 ------- .../intel-select/hpc-cluster-intel-select.yaml | 8 -------- docs/tutorials/openfoam/spack-openfoam.yaml | 7 ------- docs/tutorials/wrfv3/spack-wrfv3.yaml | 7 ------- examples/README.md | 2 +- examples/hpc-cluster-high-io.yaml | 9 --------- examples/image-builder.yaml | 6 ------ modules/README.md | 7 ------- modules/compute/vm-instance/README.md | 1 - modules/file-system/filestore/README.md | 2 -- .../pre-existing-network-storage/README.md | 1 - modules/monitoring/dashboard/README.md | 1 - modules/network/pre-existing-vpc/README.md | 1 - modules/network/vpc/README.md | 1 - modules/scripts/startup-script/README.md | 2 -- .../blueprints/lustre-with-new-vpc.yaml | 7 ------- .../daily-tests/blueprints/monitoring.yaml | 5 ----- .../test_configs/2-nfs-servers.yaml | 3 --- .../test_configs/2filestore-4instances.yaml | 8 -------- .../validate_configs/test_configs/centos8-ss.yaml | 6 ------ .../cloud-batch-cft-instance-template.yaml | 5 ----- 
.../test_configs/complex-data.yaml | 4 ---- .../validate_configs/test_configs/dashboards.yaml | 2 -- tools/validate_configs/test_configs/debian-ss.yaml | 6 ------ .../test_configs/exascaler-existing-vpc.yaml | 2 -- .../test_configs/exascaler-new-vpc.yaml | 2 -- tools/validate_configs/test_configs/gpu.yaml | 2 -- .../test_configs/hpc-centos-ss.yaml | 6 ------ .../hpc-cluster-high-io-remote-state.yaml | 7 ------- .../test_configs/hpc-cluster-hybrid-v5.yaml | 4 ---- .../test_configs/hpc-cluster-project.yaml | 7 ------- .../test_configs/hpc-cluster-service-acct.yaml | 5 ----- .../test_configs/hpc-cluster-simple-nfs-sql.yaml | 6 ------ .../test_configs/hpc-cluster-simple.yaml | 4 ---- .../hpc-cluster-slurm-with-startup.yaml | 6 ------ .../test_configs/hpc-cluster-small-slurm-v5.yaml | 6 ------ .../test_configs/htcondor-pool.yaml | 10 ---------- .../test_configs/instance-with-startup.yaml | 5 ----- .../validate_configs/test_configs/label_test.yaml | 3 --- .../validate_configs/test_configs/new_project.yaml | 1 - .../test_configs/overwrite_labels.yaml | 5 ----- tools/validate_configs/test_configs/packer.yaml | 1 - .../test_configs/pre-existing-fs.yaml | 5 ----- tools/validate_configs/test_configs/rocky-ss.yaml | 6 ------ .../test_configs/simple-startup.yaml | 4 ---- .../slurm-two-partitions-workstation.yaml | 7 ------- .../test_configs/spack-buildcache.yaml | 4 ---- .../test_configs/spack-environments.yaml | 4 ---- .../test_configs/startup-options.yaml | 6 ------ .../test_configs/test_outputs.yaml | 14 -------------- .../test_configs/threads_per_core.yaml | 11 ----------- tools/validate_configs/test_configs/ubuntu-ss.yaml | 6 ------ .../test_configs/use-resources.yaml | 7 ------- .../test_configs/vm-instance-local-ssd.yaml | 3 --- 84 files changed, 1 insertion(+), 369 deletions(-) diff --git a/community/examples/cloud-batch.yaml b/community/examples/cloud-batch.yaml index 9e5c10e5a6..a244781950 100644 --- a/community/examples/cloud-batch.yaml +++ b/community/examples/cloud-batch.yaml @@ -29,17 +29,14 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: appfs source: modules/file-system/filestore - kind: terraform use: [network1] settings: {local_mount: /sw} - id: hello-startup-script source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -55,7 +52,6 @@ deployment_groups: - id: batch-job source: community/modules/scheduler/cloud-batch-job - kind: terraform use: [network1, appfs, hello-startup-script] settings: runnable: "cat /sw/hello.txt" @@ -66,6 +62,5 @@ deployment_groups: - id: batch-login source: community/modules/scheduler/cloud-batch-login-node - kind: terraform use: [batch-job] outputs: [instructions] diff --git a/community/examples/hpc-cluster-small-sharedvpc.yaml b/community/examples/hpc-cluster-small-sharedvpc.yaml index e70e22ba80..c1b920f1d4 100644 --- a/community/examples/hpc-cluster-small-sharedvpc.yaml +++ b/community/examples/hpc-cluster-small-sharedvpc.yaml @@ -43,7 +43,6 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform settings: project_id: $(vars.host_project_id) network_name: your-shared-network @@ -51,7 +50,6 @@ deployment_groups: - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home @@ -61,7 +59,6 @@ deployment_groups: # This debug_partition will work out of the box without requesting additional GCP quota. 
- id: debug_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -75,7 +72,6 @@ deployment_groups: # This compute_partition is far more performant than debug_partition but may require requesting GCP quotas first. - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -85,7 +81,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs @@ -97,7 +92,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - homefs diff --git a/community/examples/htcondor-pool.yaml b/community/examples/htcondor-pool.yaml index f4d82168cd..b12b00e8d1 100644 --- a/community/examples/htcondor-pool.yaml +++ b/community/examples/htcondor-pool.yaml @@ -29,7 +29,6 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform settings: network_name: htcondor-pool subnetwork_name: htcondor-pool-usc1 @@ -38,21 +37,17 @@ deployment_groups: - id: htcondor_install source: community/modules/scripts/htcondor-install - kind: terraform - id: htcondor_services source: community/modules/project/service-enablement - kind: terraform use: - htcondor_install - id: htcondor_configure source: community/modules/scheduler/htcondor-configure - kind: terraform - id: htcondor_configure_central_manager source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -63,7 +58,6 @@ deployment_groups: - id: htcondor_cm source: modules/compute/vm-instance - kind: terraform use: - network1 - htcondor_configure_central_manager @@ -80,7 +74,6 @@ deployment_groups: - id: htcondor_configure_execute_point source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -91,7 +84,6 @@ deployment_groups: - id: htcondor_execute_point source: community/modules/compute/htcondor-execute-point - kind: terraform use: - network1 - htcondor_configure_execute_point @@ -106,7 +98,6 @@ deployment_groups: - id: htcondor_configure_access_point source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -130,7 +121,6 @@ deployment_groups: queue - id: htcondor_access source: modules/compute/vm-instance - kind: terraform use: - network1 - htcondor_configure_access_point diff --git a/community/examples/intel/daos-cluster.yaml b/community/examples/intel/daos-cluster.yaml index 0fab4f4431..f930d980ff 100644 --- a/community/examples/intel/daos-cluster.yaml +++ b/community/examples/intel/daos-cluster.yaml @@ -30,14 +30,12 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform # This module creates a DAOS server. Server images MUST be created before running this. 
# https://github.com/daos-stack/google-cloud-daos/tree/main/images # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/terraform/modules/daos_server - id: daos-server source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_server?ref=v0.2.1 - kind: terraform use: [network1] settings: number_of_instances: 2 @@ -48,7 +46,6 @@ deployment_groups: # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/terraform/modules/daos_client - id: daos-client source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_client?ref=v0.2.1 - kind: terraform use: [network1, daos-server] settings: number_of_instances: 2 diff --git a/community/examples/intel/daos-slurm.yaml b/community/examples/intel/daos-slurm.yaml index beb5598b3b..b392a23ebb 100644 --- a/community/examples/intel/daos-slurm.yaml +++ b/community/examples/intel/daos-slurm.yaml @@ -30,11 +30,9 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: "/home" @@ -44,7 +42,6 @@ deployment_groups: # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/terraform/modules/daos_server - id: daos source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_server?ref=v0.2.1 - kind: terraform use: [network1] settings: labels: {ghpc_role: file-system} @@ -70,7 +67,6 @@ deployment_groups: - id: daos-client-script source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -89,7 +85,6 @@ deployment_groups: ## This debug_partition will work out of the box without requesting additional GCP quota. - id: debug_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -102,7 +97,6 @@ deployment_groups: # This compute_partition is far more performant than debug_partition but may require requesting GCP quotas first. - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -112,7 +106,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs @@ -129,7 +122,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - homefs diff --git a/community/examples/intel/hpc-cluster-intel-select.yaml b/community/examples/intel/hpc-cluster-intel-select.yaml index 6e6372a855..962fcc5c5d 100644 --- a/community/examples/intel/hpc-cluster-intel-select.yaml +++ b/community/examples/intel/hpc-cluster-intel-select.yaml @@ -33,10 +33,8 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform - id: startup_controller source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -49,7 +47,6 @@ deployment_groups: - startup_script - id: startup_compute source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -98,10 +95,8 @@ deployment_groups: modules: - id: cluster-network source: modules/network/pre-existing-vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: - cluster-network settings: @@ -109,7 +104,6 @@ deployment_groups: # This debug_partition will work out of the box without requesting additional GCP quota. 
- id: debug_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - cluster-network - homefs @@ -124,7 +118,6 @@ deployment_groups: project: $(vars.project_id) - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - cluster-network - homefs @@ -137,7 +130,6 @@ deployment_groups: machine_type: c2-standard-60 - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - cluster-network - compute_partition @@ -150,7 +142,6 @@ deployment_groups: controller_machine_type: c2-standard-4 - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - cluster-network - slurm_controller diff --git a/community/examples/omnia-cluster.yaml b/community/examples/omnia-cluster.yaml index 9af3a6712b..655eb1a1f8 100644 --- a/community/examples/omnia-cluster.yaml +++ b/community/examples/omnia-cluster.yaml @@ -35,12 +35,10 @@ deployment_groups: ## Network - id: network source: modules/network/pre-existing-vpc - kind: terraform ## File Systems - id: homefs source: modules/file-system/filestore - kind: terraform use: [network] settings: local_mount: "/home" @@ -48,7 +46,6 @@ deployment_groups: ## Installation Scripts - id: omnia source: community/modules/scripts/omnia-install - kind: terraform outputs: [inventory_file, omnia_user_warning] settings: manager_ips: [localhost] @@ -56,7 +53,6 @@ deployment_groups: - id: startup-manager source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -70,7 +66,6 @@ deployment_groups: - id: startup-compute source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -83,7 +78,6 @@ deployment_groups: ## Compute - id: manager source: modules/compute/vm-instance - kind: terraform use: - network - homefs @@ -94,7 +88,6 @@ deployment_groups: - id: compute source: modules/compute/vm-instance - kind: terraform use: - network - homefs @@ -106,6 +99,5 @@ deployment_groups: # This module simply makes terraform wait until the startup script is complete - id: wait source: community/modules/scripts/wait-for-startup - kind: terraform settings: instance_name: ((module.manager.name[0])) diff --git a/community/examples/slurm-gcp-v5-hpc-centos7.yaml b/community/examples/slurm-gcp-v5-hpc-centos7.yaml index e913af43f7..14965bece7 100644 --- a/community/examples/slurm-gcp-v5-hpc-centos7.yaml +++ b/community/examples/slurm-gcp-v5-hpc-centos7.yaml @@ -33,18 +33,15 @@ deployment_groups: # Example - ./resources/network/vpc - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition - kind: terraform use: - network1 - homefs @@ -57,7 +54,6 @@ deployment_groups: - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition - kind: terraform use: - network1 - homefs @@ -67,7 +63,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - kind: terraform use: - network1 - debug_partition @@ -76,7 +71,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - kind: terraform use: - network1 - slurm_controller diff --git a/community/examples/slurm-gcp-v5-ubuntu2004.yaml 
b/community/examples/slurm-gcp-v5-ubuntu2004.yaml index f42e707147..956ab40270 100644 --- a/community/examples/slurm-gcp-v5-ubuntu2004.yaml +++ b/community/examples/slurm-gcp-v5-ubuntu2004.yaml @@ -33,18 +33,15 @@ deployment_groups: # Example - ./resources/network/vpc - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition - kind: terraform use: - network1 - homefs @@ -57,7 +54,6 @@ deployment_groups: - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition - kind: terraform use: - network1 - homefs @@ -67,7 +63,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - kind: terraform use: - network1 - debug_partition @@ -76,7 +71,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - kind: terraform use: - network1 - slurm_controller diff --git a/community/examples/spack-gromacs.yaml b/community/examples/spack-gromacs.yaml index ef499f81e9..d5cd2c062a 100644 --- a/community/examples/spack-gromacs.yaml +++ b/community/examples/spack-gromacs.yaml @@ -30,19 +30,16 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform ## Filesystems - id: appsfs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /sw - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home @@ -50,7 +47,6 @@ deployment_groups: ## Install Scripts - id: spack source: community/modules/scripts/spack-install - kind: terraform settings: install_dir: /sw/spack spack_url: https://github.com/spack/spack @@ -85,7 +81,6 @@ deployment_groups: - id: spack-startup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -96,7 +91,6 @@ deployment_groups: - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -107,7 +101,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs @@ -118,7 +111,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - homefs diff --git a/community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md b/community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md index 6b9d45500e..3f8d3f5bd4 100644 --- a/community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md +++ b/community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md @@ -20,7 +20,6 @@ The following code snippet creates a partition module with: ```yaml - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: [network1, homefs] settings: max_node_count: 200 diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index a54a4e3234..7476b5c1a6 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -24,7 +24,6 @@ a startup script and network created in previous steps. 
```yaml - id: htcondor_execute_point source: community/modules/compute/htcondor-execute-point - kind: terraform use: - network1 - htcondor_configure_execute_point diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md index d977d584fd..7b069c54f4 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md @@ -20,7 +20,6 @@ The following code snippet creates a partition module with: ```yaml - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition - kind: terraform use: - network1 - homefs diff --git a/community/modules/database/slurm-cloudsql-federation/README.md b/community/modules/database/slurm-cloudsql-federation/README.md index f19cf1353a..7ca93be525 100644 --- a/community/modules/database/slurm-cloudsql-federation/README.md +++ b/community/modules/database/slurm-cloudsql-federation/README.md @@ -12,7 +12,6 @@ accounting data storage. ```yaml - id: project source: community/modules/database/cloudsql-federation - kind: terraform use: [network1] settings: sql_instance_name: slurm-sql6-demo diff --git a/community/modules/file-system/Intel-DAOS/README.md b/community/modules/file-system/Intel-DAOS/README.md index e71614a1f7..a2ebbca1f0 100644 --- a/community/modules/file-system/Intel-DAOS/README.md +++ b/community/modules/file-system/Intel-DAOS/README.md @@ -23,7 +23,6 @@ For example, in the following snippet taken from the [community/example/intel/da ```yaml - id: daos-server source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_server?ref=v0.2.1 - kind: terraform use: [network1] settings: number_of_instances: 2 @@ -44,7 +43,6 @@ The following settings will configure this [system for TCO](https://github.com/d ```yaml - id: daos-server source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_server?ref=v0.2.1 - kind: terraform use: [network1] settings: labels: {ghpc_role: file-system} @@ -60,7 +58,6 @@ The following settings will configure this system for [best performance](https:/ ```yaml - id: daos-server source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_server?ref=v0.2.1 - kind: terraform use: [network1] settings: labels: {ghpc_role: file-system} diff --git a/community/modules/file-system/nfs-server/README.md b/community/modules/file-system/nfs-server/README.md index cffc4d840c..671e398f8e 100644 --- a/community/modules/file-system/nfs-server/README.md +++ b/community/modules/file-system/nfs-server/README.md @@ -17,7 +17,6 @@ community modules that create compute VMs. ```yaml - id: homefs source: community/modules/file-system/nfs-server - kind: terraform use: [network1] settings: auto_delete_disk: true diff --git a/community/modules/project/new-project/README.md b/community/modules/project/new-project/README.md index 6301b04d3f..c7fad43517 100644 --- a/community/modules/project/new-project/README.md +++ b/community/modules/project/new-project/README.md @@ -11,7 +11,6 @@ This module is meant for use with Terraform 0.13. 
```yaml - id: project source: community/modules/project/new-project - kind: terraform settings: project_id: test_project folder_id: 334688113020 # random number diff --git a/community/modules/project/service-account/README.md b/community/modules/project/service-account/README.md index cf9331f2d2..79b661784c 100644 --- a/community/modules/project/service-account/README.md +++ b/community/modules/project/service-account/README.md @@ -7,7 +7,6 @@ Allows creation of service accounts for a Google Cloud Platform project. ```yaml - id: service_acct source: community/modules/project/service-account - kind: terraform settings: - project_id: $(vars.project_id) - names: [ "instance_acct" ] diff --git a/community/modules/project/service-enablement/README.md b/community/modules/project/service-enablement/README.md index f03091a28b..266eac26ec 100644 --- a/community/modules/project/service-enablement/README.md +++ b/community/modules/project/service-enablement/README.md @@ -7,7 +7,6 @@ Allows management of multiple API services for a Google Cloud Platform project. ```yaml - id: services-api source: community/modules/project/service-enablement - kind: terraform settings: gcp_service_list: [ "file.googleapis.com", diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md index 6d2ac49cd9..f606e4dac5 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md @@ -19,7 +19,6 @@ controller for optimal performance at different scales. ```yaml - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/README.md b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/README.md index ba18a7adf4..2f6cb2f13a 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/README.md +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/README.md @@ -16,7 +16,6 @@ node is used in conjunction with the ```yaml - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - homefs diff --git a/community/modules/scheduler/cloud-batch-job/README.md b/community/modules/scheduler/cloud-batch-job/README.md index 5b5328b16a..21dab0e068 100644 --- a/community/modules/scheduler/cloud-batch-job/README.md +++ b/community/modules/scheduler/cloud-batch-job/README.md @@ -17,7 +17,6 @@ job unless one is provided. See the ```yaml - id: batch-job source: community/modules/scheduler/cloud-batch-job - kind: terraform use: [network1] settings: runnable: "echo 'hello world'" @@ -53,22 +52,18 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: appfs source: modules/file-system/filestore - kind: terraform use: [network1] - id: batch-startup-script source: modules/scripts/startup-script - kind: terraform settings: runners: ... 
- id: batch-compute-template source: github.com/terraform-google-modules/terraform-google-vm//modules/instance_template?ref=v7.8.0 - kind: terraform use: [batch-startup-script] settings: # Boiler plate to work with Cloud Foundation Toolkit @@ -84,7 +79,6 @@ deployment_groups: - id: batch-job source: ./community/modules/scheduler/cloud-batch-job - kind: terraform settings: instance_template: $(batch-compute-template.self_link) outputs: [instructions] diff --git a/community/modules/scheduler/cloud-batch-login-node/README.md b/community/modules/scheduler/cloud-batch-login-node/README.md index 4f98e24adb..80c214e954 100644 --- a/community/modules/scheduler/cloud-batch-login-node/README.md +++ b/community/modules/scheduler/cloud-batch-login-node/README.md @@ -20,12 +20,10 @@ systems and test installed software before submitting a Google Cloud Batch job. ```yaml - id: batch-job source: community/modules/scheduler/cloud-batch-job - kind: terraform ... - id: batch-login source: community/modules/scheduler/cloud-batch-login-node - kind: terraform use: [batch-job] outputs: [instructions] ``` diff --git a/community/modules/scheduler/htcondor-configure/README.md b/community/modules/scheduler/htcondor-configure/README.md index 5e36ea81e7..4ae216b3bc 100644 --- a/community/modules/scheduler/htcondor-configure/README.md +++ b/community/modules/scheduler/htcondor-configure/README.md @@ -26,11 +26,9 @@ install the HTCondor software and adds custom configurations using ```yaml - id: htcondor_install source: community/modules/scripts/htcondor-install - kind: terraform - id: htcondor_configure_central_manager source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -41,7 +39,6 @@ install the HTCondor software and adds custom configurations using - id: htcondor_configure_access_point source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index 82f86e7a55..85c36e1a5e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -30,7 +30,6 @@ controller for optimal performance at different scales. ```yaml - id: slurm_controller source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - kind: terraform use: - network1 - homefs diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index c4a479d296..e88b12ad19 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -64,7 +64,6 @@ The hybrid module can be added to a blueprint as follows: ```yaml - id: slurm-controller source: ./community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid - kind: terraform use: - debug-partition - compute-partition diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index e5d75fb538..0efc8a5f00 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -14,7 +14,6 @@ terraform modules. 
The login node is used in conjunction with the ```yaml - id: slurm_login source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - kind: terraform use: - network1 - slurm_controller diff --git a/community/modules/scripts/htcondor-install/README.md b/community/modules/scripts/htcondor-install/README.md index 80c085d6ff..8f37e411d1 100644 --- a/community/modules/scripts/htcondor-install/README.md +++ b/community/modules/scripts/htcondor-install/README.md @@ -23,11 +23,9 @@ install the HTCondor software and adds custom configurations using ```yaml - id: htcondor_install source: community/modules/scripts/htcondor-install - kind: terraform - id: htcondor_configure_central_manager source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -38,7 +36,6 @@ install the HTCondor software and adds custom configurations using - id: htcondor_configure_access_point source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index 8c6c1c539b..65ec73178d 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ -32,7 +32,6 @@ see this module used in a full blueprint, see the [spack-gromacs.yaml] example. ```yaml - id: spack source: community/modules/scripts/spack-install - kind: terraform settings: install_dir: /sw/spack spack_url: https://github.com/spack/spack @@ -97,7 +96,6 @@ deployment via the following: ```yaml - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: [spack] settings: subnetwork_name: ((module.network1.primary_subnetwork.name)) @@ -111,7 +109,6 @@ Alternatively, it can be added as a startup script via: ```yaml - id: startup source: modules/scripts/startup-script - kind: terraform settings: runners: - $(spack.install_spack_deps_runner) diff --git a/community/modules/scripts/wait-for-startup/README.md b/community/modules/scripts/wait-for-startup/README.md index 705699f8b7..aa1a3b408b 100644 --- a/community/modules/scripts/wait-for-startup/README.md +++ b/community/modules/scripts/wait-for-startup/README.md @@ -17,7 +17,6 @@ up a node. 
```yaml - id: wait source: community/modules/scripts/wait-for-startup - kind: terraform settings: instance_name: ((module.workstation.name[0])) ``` diff --git a/docs/tutorials/gromacs/spack-gromacs.yaml b/docs/tutorials/gromacs/spack-gromacs.yaml index d69cbc1f06..56da1f3bc1 100644 --- a/docs/tutorials/gromacs/spack-gromacs.yaml +++ b/docs/tutorials/gromacs/spack-gromacs.yaml @@ -28,16 +28,13 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: hpc_dash source: modules/monitoring/dashboard - kind: terraform ## Install Scripts - id: spack source: community/modules/scripts/spack-install - kind: terraform settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack @@ -99,7 +96,6 @@ deployment_groups: - id: controller-setup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -143,7 +139,6 @@ deployment_groups: - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 settings: @@ -152,7 +147,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - compute_partition @@ -162,7 +156,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - slurm_controller diff --git a/docs/tutorials/intel-select/hpc-cluster-intel-select.yaml b/docs/tutorials/intel-select/hpc-cluster-intel-select.yaml index c1ea22ae51..dfe2a9f276 100644 --- a/docs/tutorials/intel-select/hpc-cluster-intel-select.yaml +++ b/docs/tutorials/intel-select/hpc-cluster-intel-select.yaml @@ -27,18 +27,15 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: startup-controller source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -50,7 +47,6 @@ deployment_groups: - id: startup-compute source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -63,7 +59,6 @@ deployment_groups: # This debug_partition will work out of the box without requesting additional GCP quota. - id: debug_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -77,7 +72,6 @@ deployment_groups: # This compute_partition is far more performant than debug_partition but may require requesting GCP quotas first. 
- id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -87,7 +81,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs @@ -101,7 +94,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - homefs diff --git a/docs/tutorials/openfoam/spack-openfoam.yaml b/docs/tutorials/openfoam/spack-openfoam.yaml index 117c790b77..ed1de2ce09 100644 --- a/docs/tutorials/openfoam/spack-openfoam.yaml +++ b/docs/tutorials/openfoam/spack-openfoam.yaml @@ -28,16 +28,13 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: hpc_dash source: modules/monitoring/dashboard - kind: terraform ## Install Scripts - id: spack source: community/modules/scripts/spack-install - kind: terraform settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack @@ -106,7 +103,6 @@ deployment_groups: - id: controller-setup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -154,7 +150,6 @@ deployment_groups: mpirun -n 60 -npernode 30 -hostfile hostfile simpleFoam -parallel - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 settings: @@ -163,7 +158,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - compute_partition @@ -173,7 +167,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - slurm_controller diff --git a/docs/tutorials/wrfv3/spack-wrfv3.yaml b/docs/tutorials/wrfv3/spack-wrfv3.yaml index bfe67504b3..d37cfd80a0 100644 --- a/docs/tutorials/wrfv3/spack-wrfv3.yaml +++ b/docs/tutorials/wrfv3/spack-wrfv3.yaml @@ -28,16 +28,13 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: hpc_dash source: modules/monitoring/dashboard - kind: terraform ## Install Scripts - id: spack source: community/modules/scripts/spack-install - kind: terraform settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack @@ -99,7 +96,6 @@ deployment_groups: - id: controller-setup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -141,7 +137,6 @@ deployment_groups: - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 settings: @@ -150,7 +145,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - compute_partition @@ -160,7 +154,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - slurm_controller diff --git a/examples/README.md b/examples/README.md index e9a74af013..d1ef417c33 100644 --- a/examples/README.md +++ b/examples/README.md @@ -588,7 +588,7 @@ deployment_groups: # Local source, prefixed with ./ (/ and ../ also accepted) - id: # Required: Name of this module used to uniquely identify it. source: ./modules/role/module-name # Required: Points to the module directory. 
- kind: < terraform | packer > # Required: Type of module, currently choose from terraform or packer. + kind: < terraform | packer > # Optional: Type of module, currently choose from terraform or packer. If not specified, `kind` will default to `terraform` # Optional: All configured settings for the module. For terraform, each # variable listed in variables.tf can be set here, and are mandatory if no # default was provided and are not defined elsewhere (like the top-level vars) diff --git a/examples/hpc-cluster-high-io.yaml b/examples/hpc-cluster-high-io.yaml index fd7b82a552..05f2f3a630 100644 --- a/examples/hpc-cluster-high-io.yaml +++ b/examples/hpc-cluster-high-io.yaml @@ -33,18 +33,15 @@ deployment_groups: # Example - ./modules/network/pre-existing-vpc - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: projectsfs source: modules/file-system/filestore - kind: terraform use: [network1] settings: filestore_tier: HIGH_SCALE_SSD @@ -53,14 +50,12 @@ deployment_groups: - id: scratchfs source: community/modules/file-system/DDN-EXAScaler - kind: terraform use: [network1] settings: local_mount: /scratch - id: low_cost_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -76,7 +71,6 @@ deployment_groups: # This compute_partition is far more performant than low_cost_partition. - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -88,7 +82,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs @@ -102,7 +95,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - homefs @@ -114,5 +106,4 @@ deployment_groups: - id: hpc_dashboard source: modules/monitoring/dashboard - kind: terraform outputs: [instructions] diff --git a/examples/image-builder.yaml b/examples/image-builder.yaml index 6893c612cf..38412d2125 100644 --- a/examples/image-builder.yaml +++ b/examples/image-builder.yaml @@ -32,10 +32,8 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform - id: scripts_for_image source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -60,10 +58,8 @@ deployment_groups: modules: - id: cluster-network source: modules/network/pre-existing-vpc - kind: terraform - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: [cluster-network] settings: partition_name: compute @@ -73,7 +69,6 @@ deployment_groups: project: $(vars.project_id) - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: [cluster-network, compute_partition] settings: login_node_count: 1 @@ -82,7 +77,6 @@ deployment_groups: project: $(vars.project_id) - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: [cluster-network, slurm_controller] settings: instance_image: diff --git a/modules/README.md b/modules/README.md index ffeec77612..2cd9f0263b 100644 --- a/modules/README.md +++ b/modules/README.md @@ -200,7 +200,6 @@ example, the following code is using the embedded pre-existing-vpc module: ```yaml - id: network1 source: 
modules/network/pre-existing-vpc - kind: terraform ``` #### Local Modules @@ -213,7 +212,6 @@ following module definition refers the local pre-existing-vpc modules. ```yaml - id: network1 source: ./modules/network/pre-existing-vpc - kind: terraform ``` > **_NOTE:_** This example would have to be run from the HPC Toolkit repository @@ -232,7 +230,6 @@ Get module from GitHub over SSH: ```yaml - id: network1 source: git@github.com:GoogleCloudPlatform/hpc-toolkit.git//modules/network/vpc - kind: terraform ``` Get module from GitHub over HTTPS: @@ -240,7 +237,6 @@ Get module from GitHub over HTTPS: ```yaml - id: network1 source: github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc - kind: terraform ``` Both examples above use the [double-slash notation][tfsubdir] (`//`) to indicate @@ -256,7 +252,6 @@ Toolkit vpc module, use: ```yaml - id: network1 source: github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc?ref=develop - kind: terraform ``` [tfrev]: https://www.terraform.io/language/modules/sources#selecting-a-revision @@ -294,11 +289,9 @@ the used module's output. For example, see the following blueprint snippet: modules: - id: network1 source: modules/network/vpc - kind: terraform - id: workstation source: modules/compute/vm-instance - kind: terraform use: [network1] settings: ... diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 34b390d1f9..9c0e777b44 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -8,7 +8,6 @@ This module creates one or more ```yaml - id: compute source: modules/compute/vm-instance - kind: terraform use: [network1] settings: instance_count: 8 diff --git a/modules/file-system/filestore/README.md b/modules/file-system/filestore/README.md index 5eddd6d6d2..e3359e2d0f 100644 --- a/modules/file-system/filestore/README.md +++ b/modules/file-system/filestore/README.md @@ -48,7 +48,6 @@ The Filestore instance defined below will have the following attributes: ```yaml - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home @@ -67,7 +66,6 @@ The Filestore instance defined below will have the following attributes: ```yaml - id: highscale source: modules/file-system/filestore - kind: terraform use: [network1] settings: filestore_tier: HIGH_SCALE_SSD diff --git a/modules/file-system/pre-existing-network-storage/README.md b/modules/file-system/pre-existing-network-storage/README.md index 338af4caad..3cf6faaab7 100644 --- a/modules/file-system/pre-existing-network-storage/README.md +++ b/modules/file-system/pre-existing-network-storage/README.md @@ -13,7 +13,6 @@ Toolkit supported file-system such as [filestore](../filestore/README.md). ```yaml - id: homefs source: modules/file-system/pre-existing-network-storage - kind: terraform settings: server_ip: ## Set server IP here ## remote_mount: nfsshare diff --git a/modules/monitoring/dashboard/README.md b/modules/monitoring/dashboard/README.md index 920ef99e0c..6e8d5fe18e 100644 --- a/modules/monitoring/dashboard/README.md +++ b/modules/monitoring/dashboard/README.md @@ -12,7 +12,6 @@ needed. 
```yaml - id: hpc_dash source: modules/monitoring/dashboard - kind: terraform settings: widgets: - | diff --git a/modules/network/pre-existing-vpc/README.md b/modules/network/pre-existing-vpc/README.md index c6e64a7e19..62e421b3ba 100644 --- a/modules/network/pre-existing-vpc/README.md +++ b/modules/network/pre-existing-vpc/README.md @@ -14,7 +14,6 @@ sharing a single network module between deployment groups. ```yaml - id: network1 source: modules/network/pre-existing-vpc - kind: terraform settings: - project_id: $(vars.project_id) ``` diff --git a/modules/network/vpc/README.md b/modules/network/vpc/README.md index 9133a531da..6efa14b4e2 100644 --- a/modules/network/vpc/README.md +++ b/modules/network/vpc/README.md @@ -108,7 +108,6 @@ compact set of subnetworks possible. ```yaml - id: network1 source: modules/network/vpc - kind: terraform settings: - deployment_name: $(vars.deployment_name) ``` diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index f20b98739c..d8ce771ce8 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -141,7 +141,6 @@ sudo journalctl -u google-startup-scripts.service ```yaml - id: startup source: ./modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -169,7 +168,6 @@ sudo journalctl -u google-startup-scripts.service - id: compute-cluster source: ./modules/compute/vm-instance - kind: terraform use: [homefs, startup] ``` diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml index f3d0bfb097..d7aa6f1f1f 100644 --- a/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml +++ b/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml @@ -30,13 +30,11 @@ deployment_groups: # Example - ./modules/network/pre-existing-vpc - id: network1 source: modules/network/vpc - kind: terraform settings: network_name: lustre-new-vpc - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home @@ -44,7 +42,6 @@ deployment_groups: # Explicitly picking the local version of the module - id: scratchfs source: community/modules/file-system/DDN-EXAScaler - kind: terraform settings: local_mount: /scratch network_self_link: $(network1.network_self_link) @@ -54,7 +51,6 @@ deployment_groups: # Create a separate workstation to catch regressions in vm-instance - id: workstation source: ./modules/compute/vm-instance - kind: terraform use: - network1 - homefs @@ -65,7 +61,6 @@ deployment_groups: - id: compute_partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -76,7 +71,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs @@ -85,7 +79,6 @@ deployment_groups: - id: slurm_login source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - homefs diff --git a/tools/cloud-build/daily-tests/blueprints/monitoring.yaml b/tools/cloud-build/daily-tests/blueprints/monitoring.yaml index 5c47f9d102..3e0d367dcb 100644 --- a/tools/cloud-build/daily-tests/blueprints/monitoring.yaml +++ b/tools/cloud-build/daily-tests/blueprints/monitoring.yaml @@ -27,13 +27,11 @@ deployment_groups: modules: - id: network source: modules/network/vpc - kind: terraform settings: network_name: monitoring-net - id: homefs source: 
community/modules/file-system/nfs-server - kind: terraform use: [network] settings: local_mounts: [/home] @@ -41,7 +39,6 @@ deployment_groups: - id: startup source: ./modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -55,7 +52,6 @@ deployment_groups: - id: workstation source: ./modules/compute/vm-instance - kind: terraform use: - network - homefs @@ -67,6 +63,5 @@ deployment_groups: - id: hpc-dash source: ./modules/monitoring/dashboard - kind: terraform settings: title: $(vars.deployment_name) diff --git a/tools/validate_configs/test_configs/2-nfs-servers.yaml b/tools/validate_configs/test_configs/2-nfs-servers.yaml index 26ed5cba0c..99d7613a3b 100644 --- a/tools/validate_configs/test_configs/2-nfs-servers.yaml +++ b/tools/validate_configs/test_configs/2-nfs-servers.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: homefs source: community/modules/file-system/nfs-server - kind: terraform use: [network1] outputs: [network_storage] settings: @@ -40,7 +38,6 @@ deployment_groups: - id: appsfs source: ./community/modules/file-system/nfs-server - kind: terraform use: [network1] outputs: [network_storage] settings: diff --git a/tools/validate_configs/test_configs/2filestore-4instances.yaml b/tools/validate_configs/test_configs/2filestore-4instances.yaml index a1fe2c1291..edaa6c48ae 100644 --- a/tools/validate_configs/test_configs/2filestore-4instances.yaml +++ b/tools/validate_configs/test_configs/2filestore-4instances.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network source: ./modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network] settings: name: homefs @@ -41,7 +39,6 @@ deployment_groups: - id: apps source: ./modules/file-system/filestore - kind: terraform use: [network] settings: name: apps @@ -51,7 +48,6 @@ deployment_groups: - id: startup source: ./modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -66,7 +62,6 @@ deployment_groups: - id: license-server-1 source: ./modules/compute/vm-instance - kind: terraform use: [network] settings: name_prefix: ls1 @@ -76,7 +71,6 @@ deployment_groups: - id: license-server-2 source: modules/compute/vm-instance - kind: terraform use: [network] settings: name_prefix: ls2 @@ -86,7 +80,6 @@ deployment_groups: - id: head-node source: modules/compute/vm-instance - kind: terraform use: - network - homefs @@ -101,7 +94,6 @@ deployment_groups: - id: compute source: modules/compute/vm-instance - kind: terraform use: - network - homefs diff --git a/tools/validate_configs/test_configs/centos8-ss.yaml b/tools/validate_configs/test_configs/centos8-ss.yaml index be1e56120e..81aaf77acc 100644 --- a/tools/validate_configs/test_configs/centos8-ss.yaml +++ b/tools/validate_configs/test_configs/centos8-ss.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: ./modules/network/pre-existing-vpc - kind: terraform - id: appsfs source: modules/file-system/filestore - kind: terraform use: [network1] settings: name: appsfs @@ -39,14 +37,12 @@ deployment_groups: - id: nfs source: community/modules/file-system/nfs-server - kind: terraform use: [network1] settings: auto_delete_disk: true - id: spack source: ./community//modules/scripts/spack-install - kind: terraform settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack @@ -58,7 +54,6 @@ deployment_groups: - id: startup source: ./modules/scripts/startup-script - 
kind: terraform settings: runners: - type: shell @@ -92,7 +87,6 @@ deployment_groups: - id: instance source: ./modules/compute/vm-instance - kind: terraform use: [network1, startup, nfs, appsfs] settings: machine_type: e2-standard-4 diff --git a/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml b/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml index d3a0ee919f..782ad34854 100644 --- a/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml +++ b/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml @@ -25,17 +25,14 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: appfs source: modules/file-system/filestore - kind: terraform use: [network1] settings: {local_mount: /sw} - id: batch-startup-script source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -51,7 +48,6 @@ deployment_groups: - id: batch-compute-template source: github.com/terraform-google-modules/terraform-google-vm//modules/instance_template?ref=v7.8.0 - kind: terraform use: [batch-startup-script] settings: # Boiler plate to work with Cloud Foundation Toolkit @@ -67,7 +63,6 @@ deployment_groups: - id: batch-job source: ./community/modules/scheduler/cloud-batch-job - kind: terraform use: [network1, appfs, batch-startup-script] settings: runnable: "cat /sw/hello.txt" diff --git a/tools/validate_configs/test_configs/complex-data.yaml b/tools/validate_configs/test_configs/complex-data.yaml index 2421496c7b..3007047f80 100644 --- a/tools/validate_configs/test_configs/complex-data.yaml +++ b/tools/validate_configs/test_configs/complex-data.yaml @@ -43,13 +43,11 @@ deployment_groups: modules: - id: network source: modules/network/vpc - kind: terraform settings: network_name: "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 
- id: homefs source: modules/file-system/filestore - kind: terraform use: [network] settings: name: homefs @@ -61,7 +59,6 @@ deployment_groups: - id: startup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -76,7 +73,6 @@ deployment_groups: - id: license-server-1 source: modules/compute/vm-instance - kind: terraform use: [network] settings: name_prefix: ls1 diff --git a/tools/validate_configs/test_configs/dashboards.yaml b/tools/validate_configs/test_configs/dashboards.yaml index 42d587cbb7..1e4a8b80f2 100644 --- a/tools/validate_configs/test_configs/dashboards.yaml +++ b/tools/validate_configs/test_configs/dashboards.yaml @@ -27,7 +27,6 @@ deployment_groups: modules: - id: hpc_dash source: modules/monitoring/dashboard - kind: terraform settings: widgets: - | @@ -48,7 +47,6 @@ deployment_groups: } - id: empty_dash source: modules/monitoring/dashboard - kind: terraform settings: base_dashboard: Empty widgets: diff --git a/tools/validate_configs/test_configs/debian-ss.yaml b/tools/validate_configs/test_configs/debian-ss.yaml index 8f3a622e0b..2a72fea52e 100644 --- a/tools/validate_configs/test_configs/debian-ss.yaml +++ b/tools/validate_configs/test_configs/debian-ss.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: ./modules/network/pre-existing-vpc - kind: terraform - id: appsfs source: modules/file-system/filestore - kind: terraform use: [network1] settings: name: appsfs @@ -39,14 +37,12 @@ deployment_groups: - id: nfs source: community/modules/file-system/nfs-server - kind: terraform use: [network1] settings: auto_delete_disk: true - id: spack source: ./community//modules/scripts/spack-install - kind: terraform settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack @@ -58,7 +54,6 @@ deployment_groups: - id: startup source: ./modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -92,7 +87,6 @@ deployment_groups: - id: instance source: ./modules/compute/vm-instance - kind: terraform use: [network1, startup, nfs, appsfs] settings: machine_type: e2-standard-4 diff --git a/tools/validate_configs/test_configs/exascaler-existing-vpc.yaml b/tools/validate_configs/test_configs/exascaler-existing-vpc.yaml index b242a60aec..3215ab4e1c 100644 --- a/tools/validate_configs/test_configs/exascaler-existing-vpc.yaml +++ b/tools/validate_configs/test_configs/exascaler-existing-vpc.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: ./modules/network/pre-existing-vpc - kind: terraform - id: scratchfs source: community/modules/file-system/DDN-EXAScaler - kind: terraform use: [network1] settings: local_mount: /scratch diff --git a/tools/validate_configs/test_configs/exascaler-new-vpc.yaml b/tools/validate_configs/test_configs/exascaler-new-vpc.yaml index 86b46909fd..936ab51aa1 100644 --- a/tools/validate_configs/test_configs/exascaler-new-vpc.yaml +++ b/tools/validate_configs/test_configs/exascaler-new-vpc.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform - id: scratchfs source: ./community/modules/file-system/DDN-EXAScaler - kind: terraform use: [network1] settings: local_mount: /scratch diff --git a/tools/validate_configs/test_configs/gpu.yaml b/tools/validate_configs/test_configs/gpu.yaml index 790e0de517..874eb686ea 100644 --- a/tools/validate_configs/test_configs/gpu.yaml +++ b/tools/validate_configs/test_configs/gpu.yaml @@ -30,11 +30,9 @@ deployment_groups: # Example - ./modules/network/vpc - id: 
network1 source: modules/network/pre-existing-vpc - kind: terraform - id: workstation source: ./modules/compute/vm-instance - kind: terraform use: - network1 settings: diff --git a/tools/validate_configs/test_configs/hpc-centos-ss.yaml b/tools/validate_configs/test_configs/hpc-centos-ss.yaml index 4e550f8e80..8dabe6d805 100644 --- a/tools/validate_configs/test_configs/hpc-centos-ss.yaml +++ b/tools/validate_configs/test_configs/hpc-centos-ss.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: ./modules/network/pre-existing-vpc - kind: terraform - id: appsfs source: modules/file-system/filestore - kind: terraform use: [network1] settings: name: appsfs @@ -39,14 +37,12 @@ deployment_groups: - id: nfs source: community/modules/file-system/nfs-server - kind: terraform use: [network1] settings: auto_delete_disk: true - id: spack source: ./community//modules/scripts/spack-install - kind: terraform settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack @@ -58,7 +54,6 @@ deployment_groups: - id: startup source: ./modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -92,7 +87,6 @@ deployment_groups: - id: instance source: ./modules/compute/vm-instance - kind: terraform use: [network1, startup, nfs, appsfs] settings: machine_type: e2-standard-4 diff --git a/tools/validate_configs/test_configs/hpc-cluster-high-io-remote-state.yaml b/tools/validate_configs/test_configs/hpc-cluster-high-io-remote-state.yaml index 6615ce89df..1b28a603a1 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-high-io-remote-state.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-high-io-remote-state.yaml @@ -33,18 +33,15 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: projectsfs source: modules/file-system/filestore - kind: terraform use: [network1] settings: filestore_tier: HIGH_SCALE_SSD @@ -53,14 +50,12 @@ deployment_groups: - id: scratchfs source: community/modules/file-system/DDN-EXAScaler - kind: terraform use: [network1] settings: local_mount: /scratch - id: compute_partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - homefs - scratchfs @@ -72,7 +67,6 @@ deployment_groups: - id: slurm_controller source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - homefs - scratchfs @@ -82,7 +76,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - homefs - scratchfs diff --git a/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml b/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml index ee9b5c544f..205b7ee063 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml @@ -29,14 +29,12 @@ deployment_groups: - group: primary modules: - source: modules/network/pre-existing-vpc - kind: terraform id: network1 settings: network_name: cloud-vpc-network subnetwork_name: primary-subnet - source: modules/file-system/pre-existing-network-storage - kind: terraform id: pre-existing-storage outputs: - network_storage @@ -47,7 +45,6 @@ deployment_groups: fs_type: nfs - source: ./community/modules/compute/schedmd-slurm-gcp-v5-partition - kind: terraform id: compute-partition use: [network1] settings: @@ 
-59,7 +56,6 @@ deployment_groups: Default: NO - source: ./community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid - kind: terraform id: slurm-controller use: [compute-partition, pre-existing-storage] settings: diff --git a/tools/validate_configs/test_configs/hpc-cluster-project.yaml b/tools/validate_configs/test_configs/hpc-cluster-project.yaml index bf420d690e..2feb7ddff5 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-project.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-project.yaml @@ -34,7 +34,6 @@ deployment_groups: modules: - id: project source: ./community/modules/project/new-project - kind: terraform settings: project_id: $(vars.project_id) folder_id: 334688113020 # random number @@ -43,7 +42,6 @@ deployment_groups: - id: enable-apis source: ./community/modules/project/service-enablement - kind: terraform use: [project] settings: gcp_service_list: @@ -57,18 +55,15 @@ deployment_groups: # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: compute_partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -80,7 +75,6 @@ deployment_groups: - id: slurm_controller source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs @@ -90,7 +84,6 @@ deployment_groups: - id: slurm_login source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - homefs diff --git a/tools/validate_configs/test_configs/hpc-cluster-service-acct.yaml b/tools/validate_configs/test_configs/hpc-cluster-service-acct.yaml index 4e1068c02b..da8ab09d1b 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-service-acct.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-service-acct.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/pre-existing-network-storage - kind: terraform settings: server_ip: '$controller' remote_mount: /home @@ -40,7 +38,6 @@ deployment_groups: - id: service_acct source: ./community/modules/project/service-account - kind: terraform settings: project_id: $(vars.project_id) names: @@ -51,7 +48,6 @@ deployment_groups: - id: compute-partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: [network1] settings: partition_name: compute @@ -60,7 +56,6 @@ deployment_groups: - id: slurm source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: [network1] settings: network_storage: diff --git a/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml b/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml index 1e9470e3ec..9c3015c9a8 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: ./community/modules/file-system/nfs-server - kind: terraform use: [network1] settings: labels: @@ -39,7 +37,6 @@ deployment_groups: - id: slurm-sql source: ./community/modules/database/slurm-cloudsql-federation - kind: terraform use: [network1] settings: sql_instance_name: slurm-sql8 @@ -47,7 +44,6 @@ deployment_groups: - id: 
compute-partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - homefs - network1 @@ -58,7 +54,6 @@ deployment_groups: - id: slurm-controller source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - homefs - compute-partition @@ -71,7 +66,6 @@ deployment_groups: - id: slurm-login source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - slurm-controller - network1 diff --git a/tools/validate_configs/test_configs/hpc-cluster-simple.yaml b/tools/validate_configs/test_configs/hpc-cluster-simple.yaml index bcf2b053b7..e55ac954de 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-simple.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-simple.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home @@ -40,7 +38,6 @@ deployment_groups: - id: startup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -55,7 +52,6 @@ deployment_groups: - id: workstation source: modules/compute/vm-instance - kind: terraform use: - network1 - homefs diff --git a/tools/validate_configs/test_configs/hpc-cluster-slurm-with-startup.yaml b/tools/validate_configs/test_configs/hpc-cluster-slurm-with-startup.yaml index ae022dbb7a..c77bdd1672 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-slurm-with-startup.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-slurm-with-startup.yaml @@ -30,18 +30,15 @@ deployment_groups: # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: startup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -50,7 +47,6 @@ deployment_groups: - id: compute_partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -63,7 +59,6 @@ deployment_groups: - id: slurm_controller source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs @@ -75,7 +70,6 @@ deployment_groups: - id: slurm_login source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - homefs diff --git a/tools/validate_configs/test_configs/hpc-cluster-small-slurm-v5.yaml b/tools/validate_configs/test_configs/hpc-cluster-small-slurm-v5.yaml index aba6e5b910..e1170101eb 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-small-slurm-v5.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-small-slurm-v5.yaml @@ -31,18 +31,15 @@ deployment_groups: # Example - ./resources/network/vpc - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition - kind: terraform use: - network1 - homefs @@ -55,7 +52,6 @@ deployment_groups: - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition - kind: terraform use: - network1 - homefs @@ -65,7 +61,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - kind: terraform use: - 
network1 - debug_partition @@ -74,7 +69,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - kind: terraform use: - network1 - slurm_controller diff --git a/tools/validate_configs/test_configs/htcondor-pool.yaml b/tools/validate_configs/test_configs/htcondor-pool.yaml index f61e2c56de..24a7e5715b 100644 --- a/tools/validate_configs/test_configs/htcondor-pool.yaml +++ b/tools/validate_configs/test_configs/htcondor-pool.yaml @@ -29,23 +29,19 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform outputs: - network_name - id: htcondor_install source: community/modules/scripts/htcondor-install - kind: terraform - id: htcondor_services source: community/modules/project/service-enablement - kind: terraform use: - htcondor_install - id: htcondor_install_scripts source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -67,29 +63,24 @@ deployment_groups: modules: - id: cluster_network source: modules/network/pre-existing-vpc - kind: terraform - id: htcondor_configure source: community/modules/scheduler/htcondor-configure - kind: terraform - id: htcondor_configure_central_manager source: modules/scripts/startup-script - kind: terraform settings: runners: - $(htcondor_configure.central_manager_runner) - id: htcondor_configure_access_point source: modules/scripts/startup-script - kind: terraform settings: runners: - $(htcondor_configure.access_point_runner) - id: htcondor_cm source: modules/compute/vm-instance - kind: terraform use: - cluster_network - htcondor_configure_central_manager @@ -109,7 +100,6 @@ deployment_groups: - id: htcondor_access source: modules/compute/vm-instance - kind: terraform use: - cluster_network - htcondor_configure_access_point diff --git a/tools/validate_configs/test_configs/instance-with-startup.yaml b/tools/validate_configs/test_configs/instance-with-startup.yaml index 3b13ca6e4a..b2c8d7732a 100644 --- a/tools/validate_configs/test_configs/instance-with-startup.yaml +++ b/tools/validate_configs/test_configs/instance-with-startup.yaml @@ -27,18 +27,15 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: startup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -53,7 +50,6 @@ deployment_groups: - id: workstation source: modules/compute/vm-instance - kind: terraform use: - network1 - homefs @@ -64,6 +60,5 @@ deployment_groups: - id: wait source: ./community/modules/scripts/wait-for-startup - kind: terraform settings: instance_name: ((module.workstation.name[0])) diff --git a/tools/validate_configs/test_configs/label_test.yaml b/tools/validate_configs/test_configs/label_test.yaml index b9777b2bbc..f64f9739c6 100644 --- a/tools/validate_configs/test_configs/label_test.yaml +++ b/tools/validate_configs/test_configs/label_test.yaml @@ -30,11 +30,9 @@ deployment_groups: modules: - id: network source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network] settings: name: homefs @@ -46,7 +44,6 @@ deployment_groups: - id: homefs1 source: modules/file-system/filestore - kind: terraform use: [network] settings: name: homefs diff --git a/tools/validate_configs/test_configs/new_project.yaml b/tools/validate_configs/test_configs/new_project.yaml index 06563e5b8a..c62b9d3984 100644 --- 
a/tools/validate_configs/test_configs/new_project.yaml +++ b/tools/validate_configs/test_configs/new_project.yaml @@ -24,7 +24,6 @@ deployment_groups: modules: - id: project source: ./community/modules/project/new-project - kind: terraform settings: project_id: test_project folder_id: 334688113020 # random number diff --git a/tools/validate_configs/test_configs/overwrite_labels.yaml b/tools/validate_configs/test_configs/overwrite_labels.yaml index 3d4b724bc4..f885a60bf4 100644 --- a/tools/validate_configs/test_configs/overwrite_labels.yaml +++ b/tools/validate_configs/test_configs/overwrite_labels.yaml @@ -31,11 +31,9 @@ deployment_groups: modules: - id: network source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network] settings: name: homefs @@ -46,7 +44,6 @@ deployment_groups: - id: homefs1 source: modules/file-system/filestore - kind: terraform use: [network] settings: name: homefs @@ -57,7 +54,6 @@ deployment_groups: - id: homefs2 source: modules/file-system/filestore - kind: terraform use: [network] settings: name: homefs @@ -68,7 +64,6 @@ deployment_groups: - id: homefs3 source: modules/file-system/filestore - kind: terraform use: [network] settings: name: homefs diff --git a/tools/validate_configs/test_configs/packer.yaml b/tools/validate_configs/test_configs/packer.yaml index 24af11c96d..2b13cdad3d 100644 --- a/tools/validate_configs/test_configs/packer.yaml +++ b/tools/validate_configs/test_configs/packer.yaml @@ -29,7 +29,6 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform - group: packer modules: - id: my-custom-image diff --git a/tools/validate_configs/test_configs/pre-existing-fs.yaml b/tools/validate_configs/test_configs/pre-existing-fs.yaml index 6a89e74840..bc3290fcc1 100644 --- a/tools/validate_configs/test_configs/pre-existing-fs.yaml +++ b/tools/validate_configs/test_configs/pre-existing-fs.yaml @@ -31,17 +31,14 @@ deployment_groups: # network-name from deployment vars - id: homefs-filestore source: modules/file-system/filestore - kind: terraform - group: compute modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: homefs source: modules/file-system/pre-existing-network-storage - kind: terraform settings: server_ip: "" # for now, must be completed manually in compute/main.tf remote_mount: nfsshare @@ -50,7 +47,6 @@ deployment_groups: - id: compute-partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - homefs - network1 @@ -59,7 +55,6 @@ deployment_groups: - id: slurm source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - homefs - compute-partition diff --git a/tools/validate_configs/test_configs/rocky-ss.yaml b/tools/validate_configs/test_configs/rocky-ss.yaml index b06679749a..5b644dc3e6 100644 --- a/tools/validate_configs/test_configs/rocky-ss.yaml +++ b/tools/validate_configs/test_configs/rocky-ss.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: ./modules/network/pre-existing-vpc - kind: terraform - id: appsfs source: modules/file-system/filestore - kind: terraform use: [network1] settings: name: appsfs @@ -39,7 +37,6 @@ deployment_groups: - id: nfs source: community/modules/file-system/nfs-server - kind: terraform use: [network1] settings: image: rocky-linux-cloud/rocky-linux-8 @@ -47,7 +44,6 @@ deployment_groups: - id: spack source: ./community//modules/scripts/spack-install - kind: terraform settings: 
install_dir: /apps/spack spack_url: https://github.com/spack/spack @@ -60,7 +56,6 @@ deployment_groups: - id: startup source: ./modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -94,7 +89,6 @@ deployment_groups: - id: instance source: ./modules/compute/vm-instance - kind: terraform use: [network1, startup, nfs, appsfs] settings: machine_type: e2-standard-4 diff --git a/tools/validate_configs/test_configs/simple-startup.yaml b/tools/validate_configs/test_configs/simple-startup.yaml index 3940714717..97b48176bb 100644 --- a/tools/validate_configs/test_configs/simple-startup.yaml +++ b/tools/validate_configs/test_configs/simple-startup.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: ./modules/network/pre-existing-vpc - kind: terraform - id: startup source: ./modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -50,13 +48,11 @@ deployment_groups: - id: instance source: ./modules/compute/vm-instance - kind: terraform use: [network1, startup] settings: machine_type: e2-standard-4 - id: waiter source: ./community/modules/scripts/wait-for-startup - kind: terraform settings: instance_name: ((module.instance.name[0])) diff --git a/tools/validate_configs/test_configs/slurm-two-partitions-workstation.yaml b/tools/validate_configs/test_configs/slurm-two-partitions-workstation.yaml index 4f36bb0dde..2aa484bef8 100644 --- a/tools/validate_configs/test_configs/slurm-two-partitions-workstation.yaml +++ b/tools/validate_configs/test_configs/slurm-two-partitions-workstation.yaml @@ -27,18 +27,15 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: startup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -53,7 +50,6 @@ deployment_groups: - id: workstation source: modules/compute/vm-instance - kind: terraform use: - network1 - homefs @@ -65,7 +61,6 @@ deployment_groups: - id: compute-partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - homefs - network1 @@ -74,7 +69,6 @@ deployment_groups: - id: debug-partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - homefs - network1 @@ -83,7 +77,6 @@ deployment_groups: - id: slurm source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - homefs - compute-partition diff --git a/tools/validate_configs/test_configs/spack-buildcache.yaml b/tools/validate_configs/test_configs/spack-buildcache.yaml index b8322d2c94..572194a0d9 100644 --- a/tools/validate_configs/test_configs/spack-buildcache.yaml +++ b/tools/validate_configs/test_configs/spack-buildcache.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: spack source: ./community/modules/scripts/spack-install - kind: terraform settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack @@ -55,7 +53,6 @@ deployment_groups: - id: spack-startup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: data @@ -82,7 +79,6 @@ deployment_groups: - id: spack-build source: modules/compute/vm-instance - kind: terraform use: - network1 - spack-startup diff --git a/tools/validate_configs/test_configs/spack-environments.yaml b/tools/validate_configs/test_configs/spack-environments.yaml index 
ffe5ece48b..2fd2f4ec41 100644 --- a/tools/validate_configs/test_configs/spack-environments.yaml +++ b/tools/validate_configs/test_configs/spack-environments.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: spack source: ./community/modules/scripts/spack-install - kind: terraform settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack @@ -83,7 +81,6 @@ deployment_groups: - id: spack-startup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: data @@ -110,7 +107,6 @@ deployment_groups: - id: spack-build source: modules/compute/vm-instance - kind: terraform use: - network1 - spack-startup diff --git a/tools/validate_configs/test_configs/startup-options.yaml b/tools/validate_configs/test_configs/startup-options.yaml index cbfe2764e3..94a22d5ae1 100644 --- a/tools/validate_configs/test_configs/startup-options.yaml +++ b/tools/validate_configs/test_configs/startup-options.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: ./modules/network/pre-existing-vpc - kind: terraform - id: startup source: ./modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -50,7 +48,6 @@ deployment_groups: - id: instance-explicit-startup source: ./modules/compute/vm-instance - kind: terraform use: [network1] settings: name_prefix: explicit @@ -59,7 +56,6 @@ deployment_groups: - id: instance-no-startup source: ./modules/compute/vm-instance - kind: terraform use: [network1] settings: name_prefix: no-startup @@ -67,7 +63,6 @@ deployment_groups: - id: instance-use-startup source: ./modules/compute/vm-instance - kind: terraform use: [network1, startup] settings: name_prefix: use-startup @@ -76,7 +71,6 @@ deployment_groups: - id: instance-metadata-startup source: ./modules/compute/vm-instance - kind: terraform use: [network1] settings: name_prefix: metadata-startup diff --git a/tools/validate_configs/test_configs/test_outputs.yaml b/tools/validate_configs/test_configs/test_outputs.yaml index 103f038974..cf77ddc4b0 100644 --- a/tools/validate_configs/test_configs/test_outputs.yaml +++ b/tools/validate_configs/test_configs/test_outputs.yaml @@ -27,13 +27,11 @@ deployment_groups: modules: - id: instance source: modules/compute/vm-instance - kind: terraform outputs: - name - id: sql source: community/modules/database/slurm-cloudsql-federation - kind: terraform outputs: - cloudsql settings: @@ -44,7 +42,6 @@ deployment_groups: - id: filestore source: modules/file-system/filestore - kind: terraform use: [vpc] outputs: - network_storage @@ -52,14 +49,12 @@ deployment_groups: - id: nfs source: ./community/modules/file-system/nfs-server - kind: terraform outputs: - network_storage - install_nfs_client - id: pre-existing-storage source: modules/file-system/pre-existing-network-storage - kind: terraform outputs: - network_storage settings: @@ -70,7 +65,6 @@ deployment_groups: - id: pre-existing-vpc source: modules/network/pre-existing-vpc - kind: terraform outputs: - network_name - network_self_link @@ -81,7 +75,6 @@ deployment_groups: - id: vpc source: modules/network/vpc - kind: terraform outputs: - network_name - network_self_link @@ -93,7 +86,6 @@ deployment_groups: - id: new-project source: community/modules/project/new-project - kind: terraform outputs: - project_name - project_id @@ -119,7 +111,6 @@ deployment_groups: - id: sa source: community/modules/project/service-account - kind: terraform outputs: - email - emails @@ -140,20 +131,17 @@ 
deployment_groups: - id: spack source: community/modules/scripts/spack-install - kind: terraform outputs: - startup_script - controller_startup_script - id: startup source: modules/scripts/startup-script - kind: terraform outputs: - startup_script - id: partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: [vpc] outputs: - partition @@ -162,7 +150,6 @@ deployment_groups: - id: lustre source: ./community/modules/file-system/DDN-EXAScaler - kind: terraform outputs: - private_addresses - ssh_console @@ -172,7 +159,6 @@ deployment_groups: - id: controller source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - partition - vpc diff --git a/tools/validate_configs/test_configs/threads_per_core.yaml b/tools/validate_configs/test_configs/threads_per_core.yaml index de06cab879..a06b86feac 100644 --- a/tools/validate_configs/test_configs/threads_per_core.yaml +++ b/tools/validate_configs/test_configs/threads_per_core.yaml @@ -30,11 +30,9 @@ deployment_groups: # Example - ./modules/network/vpc - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: n1-2-threads source: ./modules/compute/vm-instance - kind: terraform use: - network1 settings: @@ -44,7 +42,6 @@ deployment_groups: - id: n1-1-thread source: ./modules/compute/vm-instance - kind: terraform use: - network1 settings: @@ -54,7 +51,6 @@ deployment_groups: - id: n1-0-threads source: ./modules/compute/vm-instance - kind: terraform use: - network1 settings: @@ -64,7 +60,6 @@ deployment_groups: - id: n1-null-threads source: ./modules/compute/vm-instance - kind: terraform use: - network1 settings: @@ -74,7 +69,6 @@ deployment_groups: - id: n2-2-threads source: ./modules/compute/vm-instance - kind: terraform use: - network1 settings: @@ -84,7 +78,6 @@ deployment_groups: - id: n2-1-thread source: ./modules/compute/vm-instance - kind: terraform use: - network1 settings: @@ -94,7 +87,6 @@ deployment_groups: - id: c2-2-threads source: ./modules/compute/vm-instance - kind: terraform use: - network1 settings: @@ -104,7 +96,6 @@ deployment_groups: - id: c2-1-thread source: ./modules/compute/vm-instance - kind: terraform use: - network1 settings: @@ -114,7 +105,6 @@ deployment_groups: - id: e2-medium-0-thread source: ./modules/compute/vm-instance - kind: terraform use: - network1 settings: @@ -124,7 +114,6 @@ deployment_groups: - id: e2-medium-null-thread source: ./modules/compute/vm-instance - kind: terraform use: - network1 settings: diff --git a/tools/validate_configs/test_configs/ubuntu-ss.yaml b/tools/validate_configs/test_configs/ubuntu-ss.yaml index b2cf676059..335a9b9b45 100644 --- a/tools/validate_configs/test_configs/ubuntu-ss.yaml +++ b/tools/validate_configs/test_configs/ubuntu-ss.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: ./modules/network/pre-existing-vpc - kind: terraform - id: appsfs source: modules/file-system/filestore - kind: terraform use: [network1] settings: name: appsfs @@ -39,14 +37,12 @@ deployment_groups: - id: nfs source: community/modules/file-system/nfs-server - kind: terraform use: [network1] settings: auto_delete_disk: true - id: spack source: ./community//modules/scripts/spack-install - kind: terraform settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack @@ -64,7 +60,6 @@ deployment_groups: - id: startup source: ./modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -98,7 +93,6 @@ deployment_groups: - id: instance source: 
./modules/compute/vm-instance
-    kind: terraform
     use: [network1, startup, nfs, appsfs]
     settings:
       machine_type: e2-standard-4
diff --git a/tools/validate_configs/test_configs/use-resources.yaml b/tools/validate_configs/test_configs/use-resources.yaml
index 5ef30961e9..bf67645d9e 100644
--- a/tools/validate_configs/test_configs/use-resources.yaml
+++ b/tools/validate_configs/test_configs/use-resources.yaml
@@ -30,11 +30,9 @@ deployment_groups:
   # Example - ./modules/network/pre-existing-vpc
   - id: network1
     source: modules/network/pre-existing-vpc
-    kind: terraform

   - id: homefs
     source: modules/file-system/filestore
-    kind: terraform
     use: [network1]
     settings:
       local_mount: /home
@@ -43,12 +41,10 @@ deployment_groups:

   - id: projectsfs
     source: community/modules/file-system/nfs-server
-    kind: terraform
     use: [network1]

   - id: scratchfs
     source: community/modules/file-system/DDN-EXAScaler
-    kind: terraform
     settings:
       local_mount: /scratch
       network_self_link: $(network1.network_self_link)
@@ -57,7 +53,6 @@ deployment_groups:

   - id: compute_partition
     source: community/modules/compute/SchedMD-slurm-on-gcp-partition
-    kind: terraform
     use:
     - homefs
     - scratchfs
@@ -68,7 +63,6 @@ deployment_groups:

   - id: slurm_controller
     source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller
-    kind: terraform
     use:
     - projectsfs
     - compute_partition
@@ -76,7 +70,6 @@ deployment_groups:

   - id: slurm_login
     source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node
-    kind: terraform
     use:
     - homefs
     - scratchfs
diff --git a/tools/validate_configs/test_configs/vm-instance-local-ssd.yaml b/tools/validate_configs/test_configs/vm-instance-local-ssd.yaml
index 3985da6323..7adcc33496 100644
--- a/tools/validate_configs/test_configs/vm-instance-local-ssd.yaml
+++ b/tools/validate_configs/test_configs/vm-instance-local-ssd.yaml
@@ -27,11 +27,9 @@ deployment_groups:
   modules:
   - id: network1
     source: ./modules/network/pre-existing-vpc
-    kind: terraform

   - id: multi-instance-multi-ssd
     source: ./modules/compute/vm-instance
-    kind: terraform
     use: [network1]
     settings:
       machine_type: n2-standard-16
@@ -40,7 +38,6 @@
   - id: instance-ssd-interface-defined
     source: ./modules/compute/vm-instance
-    kind: terraform
     use: [network1]
     settings:
       machine_type: n2-standard-16

From 1ff7ecd53cb961a2c9f8af4dae19153f31a7a13a Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Tue, 13 Sep 2022 15:33:07 -0700
Subject: [PATCH 11/51] Add interactive argument to qsim script to make conda
 accessible

---
 community/examples/quantum-circuit-simulator.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/community/examples/quantum-circuit-simulator.yaml b/community/examples/quantum-circuit-simulator.yaml
index a23b897b66..736021a350 100644
--- a/community/examples/quantum-circuit-simulator.yaml
+++ b/community/examples/quantum-circuit-simulator.yaml
@@ -113,7 +113,10 @@ deployment_groups:
       - type: shell
         destination: run-qsim.sh
         content: |
-          #!/bin/bash
+          #!/bin/bash -i
+          # The -i above (for interactive) is required so that the conda command will be accessible.
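+          # (Without -i, bash does not source ~/.bashrc, so the conda shell
+          # function defined by "conda init" would be unavailable.)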
           # this script demonstrates how to run the qsim example application and
           # also "warms up" the GPU to give reliable performance metrics
           conda activate qsim

From 75a28b0eb7fd66e58e00dbfbf165e642a715f444 Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Tue, 13 Sep 2022 18:58:22 -0700
Subject: [PATCH 12/51] Add troubleshooting for Slurm: network is unreachable

---
 README.md | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/README.md b/README.md
index 166764ad33..c48aef644c 100644
--- a/README.md
+++ b/README.md
@@ -204,6 +204,77 @@ In the right side, expand the Filters view and then filter by label, specifying

 ## Troubleshooting

+### Network is unreachable (Slurm V5)
+
+Slurm requires access to Google APIs to function. This can be achieved through
+one of the following methods:
+
+1. Create a [Cloud NAT](https://cloud.google.com/nat) (preferred).
+2. Set `disable_controller_public_ips: false` and
+   `disable_login_public_ips: false` on the controller and login nodes,
+   respectively (see the example at the end of this section).
+3. Enable
+   [private access to Google APIs](https://cloud.google.com/vpc/docs/private-access-options).
+
+By default, the Toolkit VPC module creates an associated Cloud NAT, so this
+issue is typically seen when working with the pre-existing-vpc module. If no
+access exists, you will see the following errors.
+
+When you ssh into the login node or controller, you will see the following
+message:
+
+```text
+*** Slurm setup failed! Please view log: /slurm/scripts/setup.log ***
+```
+
+> **_NOTE:_** Many different issues can produce the above message, so be sure
+> to verify the root cause in the logs.
+
+To confirm the issue, ssh onto the controller and call
+`sudo cat /slurm/scripts/setup.log`. Look for the following logs:
+
+```text
+google_metadata_script_runner: startup-script: ERROR: [Errno 101] Network is unreachable
+google_metadata_script_runner: startup-script: OSError: [Errno 101] Network is unreachable
+google_metadata_script_runner: startup-script: ERROR: Aborting setup...
+google_metadata_script_runner: startup-script exit status 0
+google_metadata_script_runner: Finished running startup scripts.
+```
+
+You may also notice mount failure logs on the login node:
+
+```text
+INFO: Waiting for '/usr/local/etc/slurm' to be mounted...
+INFO: Waiting for '/home' to be mounted...
+INFO: Waiting for '/opt/apps' to be mounted...
+INFO: Waiting for '/etc/munge' to be mounted...
+ERROR: mount of path '/usr/local/etc/slurm' failed: : Command '['mount', '/usr/local/etc/slurm']' returned non-zero exit status 32.
+ERROR: mount of path '/opt/apps' failed: : Command '['mount', '/opt/apps']' returned non-zero exit status 32.
+ERROR: mount of path '/home' failed: : Command '['mount', '/home']' returned non-zero exit status 32.
+ERROR: mount of path '/etc/munge' failed: : Command '['mount', '/etc/munge']' returned non-zero exit status 32.
+```
+
+> **_NOTE:_** The above logs only indicate that something went wrong with the
+> startup of the controller; check the logs on the controller to confirm that
+> it is a network issue.
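+
+For example, option 2 above can be applied with blueprint settings similar to
+the following sketch (module IDs and `use` lists are illustrative and should
+match your own blueprint):
+
+```yaml
+  - id: slurm_controller
+    source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller
+    use: [network1, compute_partition]
+    settings:
+      disable_controller_public_ips: false
+
+  - id: slurm_login
+    source: community/modules/scheduler/schedmd-slurm-gcp-v5-login
+    use: [network1, slurm_controller]
+    settings:
+      disable_login_public_ips: false
+```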
+ ### Failure to Create Auto Scale Nodes (Slurm) If your deployment succeeds but your jobs fail with the following error: From cc763155d6557d8667242f444c2cd4ea508f16e8 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Tue, 13 Sep 2022 15:45:04 -0700 Subject: [PATCH 13/51] Add auto-delete boot disk as an option on vm-instance --- modules/compute/vm-instance/README.md | 1 + modules/compute/vm-instance/main.tf | 2 +- modules/compute/vm-instance/variables.tf | 6 ++++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 34b390d1f9..888c9d7b65 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -134,6 +134,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [auto\_delete\_boot\_disk](#input\_auto\_delete\_boot\_disk) | Controls if boot disk should be auto-deleted when instance is deleted. | `bool` | `true` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Tier 1 bandwidth increases the maximum egress bandwidth for VMs.
Using the `tier_1_enabled` setting will enable both gVNIC and TIER\_1 higher bandwidth networking.
Using the `gvnic_enabled` setting will only enable gVNIC and will not enable TIER\_1.
Note that TIER\_1 only works with specific machine families & shapes and must be using an image that supports gVNIC. See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"not_enabled"` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment, used to name the cluster | `string` | n/a | yes | | [disable\_public\_ips](#input\_disable\_public\_ips) | If set to true, instances will not have public IPs | `bool` | `false` | no | diff --git a/modules/compute/vm-instance/main.tf b/modules/compute/vm-instance/main.tf index 213b453e5c..f00d8f8f89 100644 --- a/modules/compute/vm-instance/main.tf +++ b/modules/compute/vm-instance/main.tf @@ -112,7 +112,7 @@ resource "google_compute_instance" "compute_vm" { boot_disk { source = google_compute_disk.boot_disk[count.index].self_link device_name = google_compute_disk.boot_disk[count.index].name - auto_delete = true + auto_delete = var.auto_delete_boot_disk } dynamic "scratch_disk" { diff --git a/modules/compute/vm-instance/variables.tf b/modules/compute/vm-instance/variables.tf index d5e5126e2c..57539aae8c 100644 --- a/modules/compute/vm-instance/variables.tf +++ b/modules/compute/vm-instance/variables.tf @@ -49,6 +49,12 @@ variable "disk_type" { default = "pd-standard" } +variable "auto_delete_boot_disk" { + description = "Controls if boot disk should be auto-deleted when instance is deleted." + type = bool + default = true +} + variable "local_ssd_count" { description = "The number of local SSDs to attach to each VM. See https://cloud.google.com/compute/docs/disks/local-ssd." type = number From 8da491b258633877102af41aa66bc001d5efa9ec Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Wed, 14 Sep 2022 09:51:28 -0700 Subject: [PATCH 14/51] Fix README.md to match variables.tf --- modules/compute/vm-instance/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 34b390d1f9..c3734b5eab 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -154,7 +154,7 @@ No modules. | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Describes maintenance behavior for the instance. If left blank this will default to `MIGRATE` except for when `placement_policy`, spot provisioning, or GPUs require it to be `TERMINATE` | `string` | `null` | no | | [placement\_policy](#input\_placement\_policy) | Control where your VM instances are physically located relative to each other within a zone. |
object({
vm_count = number,
availability_domain_count = number,
collocation = string,
})
| `null` | no | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created | `string` | n/a | yes | -| [service\_account](#input\_service\_account) | Service account to attach to the instance. See https://www.terraform.io/docs/providers/google/r/compute_instance_template.html#service_account. |
object({
email = string,
scopes = set(string)
})
|
{
"email": null,
"scopes": [
"https://www.googleapis.com/auth/devstorage.read_only",
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring.write",
"https://www.googleapis.com/auth/servicecontrol",
"https://www.googleapis.com/auth/service.management.readonly",
"https://www.googleapis.com/auth/trace.append"
]
}
| no | +| [service\_account](#input\_service\_account) | Service account to attach to the instance. See https://www.terraform.io/docs/providers/google/r/compute_instance_template.html#service_account. |
object({
email = string,
scopes = set(string)
})
|
{
"email": null,
"scopes": [
"https://www.googleapis.com/auth/devstorage.read_write",
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring.write",
"https://www.googleapis.com/auth/servicecontrol",
"https://www.googleapis.com/auth/service.management.readonly",
"https://www.googleapis.com/auth/trace.append"
]
}
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [startup\_script](#input\_startup\_script) | Startup script used on the instance | `string` | `null` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork to attach the VM. | `string` | `null` | no | From ce40ada7c44ad19df45309c23918b186b0cfdedc Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Wed, 14 Sep 2022 10:25:18 -0700 Subject: [PATCH 15/51] Update go.mod and go.sum --- go.mod | 2 ++ go.sum | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index ea86b39003..b2ab7ea13c 100644 --- a/go.mod +++ b/go.mod @@ -16,6 +16,8 @@ require ( github.com/spf13/afero v1.9.2 github.com/spf13/cobra v1.5.0 github.com/zclconf/go-cty v1.10.0 + golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect + golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f // indirect google.golang.org/genproto v0.0.0-20220804142021-4e6b2dfa6612 gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f gopkg.in/yaml.v3 v3.0.1 diff --git a/go.sum b/go.sum index 126b63bd0e..dc72cc4331 100644 --- a/go.sum +++ b/go.sum @@ -409,8 +409,9 @@ golang.org/x/net v0.0.0-20220325170049-de3da57026de/go.mod h1:CfG3xpIq0wQ8r1q4Su golang.org/x/net v0.0.0-20220412020605-290c469a71a5/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220607020251-c690dde0001d/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.0.0-20220624214902-1bab6f366d9e h1:TsQ7F31D3bUCLeqPT0u+yjp1guoArKaNKmCr22PYgTQ= golang.org/x/net v0.0.0-20220624214902-1bab6f366d9e/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -507,8 +508,9 @@ golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220517195934-5e4e11fc645e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220610221304-9f5ed59c137d/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220624220833-87e55d714810 h1:rHZQSjJdAI4Xf5Qzeh2bBc5YJIkPFVM6oDtMFYmgws0= golang.org/x/sys v0.0.0-20220624220833-87e55d714810/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f h1:v4INt8xihDGvnrfjMDVXGxw9wrfxYyCjk0KbXjhR55s= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= From 
bbd7748c078b18a401c712a9a5960d7c1222ba71 Mon Sep 17 00:00:00 2001 From: Alex Heye Date: Wed, 14 Sep 2022 17:47:11 +0000 Subject: [PATCH 16/51] Default slurm_cluster_name to deploy name in hybrid Updates the schedmd-slurm-gcp-v5-hybrid module to default the `slurm_cluster_name` to a filtered version of the `deployment_name` if not explicitly provided. Implementation matches that of login and controller modules. --- .../schedmd-slurm-gcp-v5-hybrid/README.md | 3 ++- .../schedmd-slurm-gcp-v5-hybrid/main.tf | 8 +++++++- .../schedmd-slurm-gcp-v5-hybrid/variables.tf | 17 +++++++++++------ .../test_configs/hpc-cluster-hybrid-v5.yaml | 17 ++++++++--------- 4 files changed, 28 insertions(+), 17 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index c4a479d296..9280d07fd2 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -189,6 +189,7 @@ limitations under the License. |------|-------------|------|---------|:--------:| | [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. |
object({
no_comma_params = bool
resume_rate = number
resume_timeout = number
suspend_rate = number
suspend_timeout = number
})
|
{
"no_comma_params": false,
"resume_rate": 0,
"resume_timeout": 300,
"suspend_rate": 0,
"suspend_timeout": 300
}
| no | | [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `""` | no | +| [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | | [disable\_default\_mounts](#input\_disable\_default\_mounts) | Disable default global network storage from the controller
- /usr/local/etc/slurm
- /etc/munge
- /home
- /apps
If these are disabled, the slurm etc and munge dirs must be added manually,
or some other mechanism must be used to synchronize the slurm conf files
and the munge key across the cluster. | `bool` | `false` | no | | [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into big query.
NOTE: Requires Google Bigquery API. | `bool` | `false` | no | | [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.
NOTE: Requires Python and script dependencies.
*WARNING*: Toggling this may impact the running workload. Deployed compute nodes
may be destroyed and their jobs will be requeued. | `bool` | `false` | no | @@ -204,7 +205,7 @@ limitations under the License. | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | | [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = string
}))
| `[]` | no |
| [slurm\_bin\_dir](#input\_slurm\_bin\_dir) | Path to directory of Slurm binary commands (e.g. scontrol, sinfo). If 'null',<br>
then it will be assumed that binaries are in $PATH. | `string` | `null` | no | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting. | `string` | n/a | yes | +| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting. If not provided
it will default to the first 8 characters of the deployment name (removing
any invalid characters). | `string` | `null` | no | | [slurm\_control\_host](#input\_slurm\_control\_host) | The short, or long, hostname of the machine where Slurm control daemon is
executed (i.e. the name returned by the command "hostname -s").
See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldHost | `string` | `null` | no | | [slurm\_depends\_on](#input\_slurm\_depends\_on) | Custom terraform dependencies without replacement on delta. This is useful to
ensure order of resource creation.
NOTE: Also see terraform meta-argument 'depends\_on'. | `list(string)` | `[]` | no | | [slurm\_log\_dir](#input\_slurm\_log\_dir) | Directory where Slurm logs to. | `string` | `"/var/log/slurm"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf index 9bc3fa4e00..6354a1bfc9 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf @@ -22,13 +22,19 @@ locals { install_dir = var.install_dir != null ? var.install_dir : abspath(var.output_dir) install_dir_pattern = replace(local.install_dir, ".", "\\.") install_path_cmd = "sed -i -E 's|Program=/.*/(resume\\|suspend).py|Program=${local.install_dir_pattern}/\\1\\.py|g' cloud.conf" + + # Since deployment name may be used to create a cluster name, we remove any invalid character from the beginning + # Also, slurm imposed a lot of restrictions to this name, so we format it to an acceptable string + tmp_cluster_name = substr(replace(lower(var.deployment_name), "/^[^a-z]*|[^a-z0-9]/", ""), 0, 10) + slurm_cluster_name = var.slurm_cluster_name != null ? var.slurm_cluster_name : local.tmp_cluster_name + } module "slurm_controller_instance" { source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=v5.1.0" project_id = var.project_id - slurm_cluster_name = var.slurm_cluster_name + slurm_cluster_name = local.slurm_cluster_name enable_devel = var.enable_devel enable_cleanup_compute = var.enable_cleanup_compute enable_cleanup_subscriptions = var.enable_cleanup_subscriptions diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/variables.tf index de51d540a5..a382028690 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/variables.tf @@ -20,14 +20,19 @@ variable "project_id" { description = "Project ID to create resources in." } -variable "slurm_cluster_name" { +variable "deployment_name" { + description = "Name of the deployment." type = string - description = "Cluster name, used for resource naming and slurm accounting." +} - validation { - condition = can(regex("(^[a-z][a-z0-9]*$)", var.slurm_cluster_name)) - error_message = "Variable 'slurm_cluster_name' must be composed of only alphanumeric values and begin with a leter. regex: '(^[a-z][a-z0-9]*$)'." - } +variable "slurm_cluster_name" { + type = string + description = <<-EOD + Cluster name, used for resource naming and slurm accounting. If not provided + it will default to the first 8 characters of the deployment name (removing + any invalid characters). 
+ EOD + default = null } variable "enable_devel" { diff --git a/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml b/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml index ee9b5c544f..7959e2271c 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml @@ -21,23 +21,22 @@ vars: deployment_name: hybrid-controller region: us-central1 zone: us-central1-c - slurm_cluster_name: hybrid on_prem_controller_host_name: static-controller.c.PROJECT_NAME.internal ## .c..internal on_prem_network_storage_ip: storage-ip-placeholder ## internal ip address for nfs to be mounted deployment_groups: - group: primary modules: - - source: modules/network/pre-existing-vpc + - id: network1 + source: modules/network/pre-existing-vpc kind: terraform - id: network1 settings: network_name: cloud-vpc-network subnetwork_name: primary-subnet - - source: modules/file-system/pre-existing-network-storage + - id: pre-existing-storage + source: modules/file-system/pre-existing-network-storage kind: terraform - id: pre-existing-storage outputs: - network_storage settings: @@ -46,9 +45,9 @@ deployment_groups: local_mount: /home fs_type: nfs - - source: ./community/modules/compute/schedmd-slurm-gcp-v5-partition + - id: compute-partition + source: ./community/modules/compute/schedmd-slurm-gcp-v5-partition kind: terraform - id: compute-partition use: [network1] settings: partition_name: cloud @@ -58,9 +57,9 @@ deployment_groups: partition_conf: Default: NO - - source: ./community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid + - id: slurm-controller + source: ./community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid kind: terraform - id: slurm-controller use: [compute-partition, pre-existing-storage] settings: output_dir: ./hybrid From ff763595481a97b7f420f3fbc151115ed63f6e48 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Wed, 14 Sep 2022 17:32:54 -0700 Subject: [PATCH 17/51] Upgrade DDN-EXAScaler to v6.1.0 --- community/modules/file-system/DDN-EXAScaler/README.md | 6 +++--- community/modules/file-system/DDN-EXAScaler/main.tf | 2 +- community/modules/file-system/DDN-EXAScaler/variables.tf | 8 +++++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/community/modules/file-system/DDN-EXAScaler/README.md b/community/modules/file-system/DDN-EXAScaler/README.md index 995538ece6..1e0ad80581 100644 --- a/community/modules/file-system/DDN-EXAScaler/README.md +++ b/community/modules/file-system/DDN-EXAScaler/README.md @@ -61,7 +61,7 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [ddn\_exascaler](#module\_ddn\_exascaler) | github.com/DDNStorage/exascaler-cloud-terraform//gcp | 3eec46e | +| [ddn\_exascaler](#module\_ddn\_exascaler) | github.com/DDNStorage/exascaler-cloud-terraform//gcp | 78deadb | ## Resources @@ -71,11 +71,11 @@ No resources. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [boot](#input\_boot) | Boot disk properties |
object({
disk_type = string
auto_delete = bool
})
|
{
"auto_delete": true,
"disk_type": "pd-standard"
}
| no | +| [boot](#input\_boot) | Boot disk properties |
object({
disk_type = string
auto_delete = bool
script_url = string
})
|
{
"auto_delete": true,
"disk_type": "pd-standard",
"script_url": null
}
| no | | [cls](#input\_cls) | Compute client properties |
object({
node_type = string
node_cpu = string
nic_type = string
node_count = number
public_ip = bool
})
|
{
"nic_type": "GVNIC",
"node_count": 0,
"node_cpu": "Intel Cascade Lake",
"node_type": "n2-standard-2",
"public_ip": true
}
| no | | [clt](#input\_clt) | Compute client target properties |
object({
disk_bus = string
disk_type = string
disk_size = number
disk_count = number
})
|
{
"disk_bus": "SCSI",
"disk_count": 0,
"disk_size": 256,
"disk_type": "pd-standard"
}
| no | | [fsname](#input\_fsname) | EXAScaler filesystem name, only alphanumeric characters are allowed, and the value must be 1-8 characters long | `string` | `"exacloud"` | no | -| [image](#input\_image) | Source image properties |
object({
project = string
name = string
})
|
{
"name": "exascaler-cloud-v523-centos7",
"project": "ddn-public"
}
| no | +| [image](#input\_image) | Source image properties |
object({
project = string
family = string
})
|
{
"family": "exascaler-cloud-6-1-centos",
"project": "ddn-public"
}
| no |
| [labels](#input\_labels) | Labels to add to EXAScaler Cloud deployment. List of key, value pairs. | `any` | `{}` | no |
| [local\_mount](#input\_local\_mount) | Mountpoint (at the client instances) for this EXAScaler system | `string` | `"/shared"` | no |
| [mds](#input\_mds) | Metadata server properties |<br>
object({
node_type = string
node_cpu = string
nic_type = string
node_count = number
public_ip = bool
})
|
{
"nic_type": "GVNIC",
"node_count": 1,
"node_cpu": "Intel Cascade Lake",
"node_type": "n2-standard-32",
"public_ip": true
}
| no | diff --git a/community/modules/file-system/DDN-EXAScaler/main.tf b/community/modules/file-system/DDN-EXAScaler/main.tf index eeb9ecbd5e..e28b11e6e5 100644 --- a/community/modules/file-system/DDN-EXAScaler/main.tf +++ b/community/modules/file-system/DDN-EXAScaler/main.tf @@ -36,7 +36,7 @@ locals { } module "ddn_exascaler" { - source = "github.com/DDNStorage/exascaler-cloud-terraform//gcp?ref=3eec46e" + source = "github.com/DDNStorage/exascaler-cloud-terraform//gcp?ref=78deadb" fsname = var.fsname zone = var.zone project = var.project_id diff --git a/community/modules/file-system/DDN-EXAScaler/variables.tf b/community/modules/file-system/DDN-EXAScaler/variables.tf index ed1b0caf1e..902ca23e81 100644 --- a/community/modules/file-system/DDN-EXAScaler/variables.tf +++ b/community/modules/file-system/DDN-EXAScaler/variables.tf @@ -189,25 +189,27 @@ variable "boot" { type = object({ disk_type = string auto_delete = bool + script_url = string }) default = { disk_type = "pd-standard" auto_delete = true + script_url = null } } # Source image properties # project: project name -# name: image name +# family: image family name variable "image" { description = "Source image properties" type = object({ project = string - name = string + family = string }) default = { project = "ddn-public" - name = "exascaler-cloud-v523-centos7" + family = "exascaler-cloud-6-1-centos" } } From f88ccc1134aac60f61412574f72c45a76b787309 Mon Sep 17 00:00:00 2001 From: Alex Heye Date: Thu, 15 Sep 2022 21:05:46 +0000 Subject: [PATCH 18/51] Add Epilog/Prolog scripts to install path in hybrid Updated the install path sed command to also catch EpilogSlurmctld and PrologSlurmctld when exclusive partitions are being used. Updated the hybrid test config to include an exclusive partition and to reformat module definitions in line with recent improvements in the codebase. --- .../schedmd-slurm-gcp-v5-hybrid/main.tf | 2 +- .../test_configs/hpc-cluster-hybrid-v5.yaml | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf index 6354a1bfc9..dee760d916 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf @@ -21,7 +21,7 @@ locals { }] install_dir = var.install_dir != null ? 
var.install_dir : abspath(var.output_dir)
   install_dir_pattern = replace(local.install_dir, ".", "\\.")
-  install_path_cmd    = "sed -i -E 's|Program=/.*/(resume\\|suspend).py|Program=${local.install_dir_pattern}/\\1\\.py|g' cloud.conf"
+  install_path_cmd    = "sed -i -E 's|(Program\\|logSlurmctld)=/.*/(resume\\|suspend).py|\\1=${local.install_dir_pattern}/\\2\\.py|g' cloud.conf"
diff --git a/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml b/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml
index 7959e2271c..d7b1bed830 100644
--- a/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml
+++ b/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml
@@ -29,14 +29,12 @@ deployment_groups:
   modules:
   - id: network1
     source: modules/network/pre-existing-vpc
-    kind: terraform
     settings:
       network_name: cloud-vpc-network
       subnetwork_name: primary-subnet

   - id: pre-existing-storage
     source: modules/file-system/pre-existing-network-storage
-    kind: terraform
     outputs:
     - network_storage
     settings:
@@ -45,22 +43,28 @@ deployment_groups:

-  - id: compute-partition
+  - id: debug-partition
     source: ./community/modules/compute/schedmd-slurm-gcp-v5-partition
-    kind: terraform
     use: [network1]
     settings:
-      partition_name: cloud
+      partition_name: debug
       node_count_dynamic_max: 10
       exclusive: false
       machine_type: n2-standard-2
       partition_conf:
         Default: NO

+  - id: compute-partition
+    source: community/modules/compute/schedmd-slurm-gcp-v5-partition
+    use:
+    - network1
+    settings:
+      partition_name: compute
+      node_count_dynamic_max: 20
+
   - id: slurm-controller
     source: ./community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid
-    kind: terraform
-    use: [compute-partition, pre-existing-storage]
+    use: [debug-partition, compute-partition, pre-existing-storage]
     settings:
       output_dir: ./hybrid
From 925880bc31914547e10bbe7542e444b9ad1cbcd8 Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Thu, 15 Sep 2022 15:16:11 -0700
Subject: [PATCH 19/51] Integrate DDN Lustre install script with startup-script

---
 community/modules/file-system/DDN-EXAScaler/README.md   | 43 ++++++++++++++++++-
 community/modules/file-system/DDN-EXAScaler/outputs.tf  | 35 ++++++++++++++-
 .../blueprints/lustre-with-new-vpc.yaml                 | 20 ++++++---
 3 files changed, 89 insertions(+), 9 deletions(-)

diff --git a/community/modules/file-system/DDN-EXAScaler/README.md b/community/modules/file-system/DDN-EXAScaler/README.md
index 1e0ad80581..bad224e570 100644
--- a/community/modules/file-system/DDN-EXAScaler/README.md
+++ b/community/modules/file-system/DDN-EXAScaler/README.md
@@ -21,7 +21,45 @@ More information about the architecture can be found at
 [marketplace]: https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud
 [architecture]: https://cloud.google.com/architecture/lustre-architecture

+## Mounting
+
+To mount the DDN EXAScaler Lustre file system you must first install the DDN
+Lustre client and then call the proper `mount` command.
+
+When mounting to a Slurm resource, both of these steps are handled
+automatically by the `use` command. See the
+[hpc-cluster-high-io](../../../../examples/hpc-cluster-high-io.yaml) example
+of using this module with Slurm. 
+ +The DDN-EXAScaler module outputs runners that can be used with the +startup-script module to install the client and mount the file system when +mounting to other compute resources such as `vm-instance` or `cloud-batch-job`. +See the following example: + +```yaml + - id: lustrefs + source: community/modules/file-system/DDN-EXAScaler + use: [network1] + settings: {local_mount: /scratch} + + - id: mount-at-startup + source: modules/scripts/startup-script + settings: + runners: + - $(lustrefs.install_ddn_lustre_client_runner) + - $(lustrefs.mount_runner) + + - id: workstation + source: modules/compute/vm-instance + use: [network1, lustrefs, mount-at-startup] +``` + +See [additional documentation][ddn-install-docs] from DDN EXAScaler. + +[ddn-install-docs]: https://github.com/DDNStorage/exascaler-cloud-terraform/tree/master/gcp#install-new-exascaler-cloud-clients + ## Support + EXAScaler Cloud includes self-help support with access to publicly available documents and videos. Premium support includes 24x7x365 access to DDN's experts, along with support community access, automated notifications of updates and @@ -101,8 +139,11 @@ No resources. | Name | Description | |------|-------------| +| [client\_config](#output\_client\_config) | Script that will install DDN EXAScaler lustre client. The machine running this script must be on the same network & subnet as the EXAScaler. | | [http\_console](#output\_http\_console) | HTTP address to access the system web console. | -| [mount\_command](#output\_mount\_command) | Command to mount the file system. | +| [install\_ddn\_lustre\_client\_runner](#output\_install\_ddn\_lustre\_client\_runner) | Runner that encapsulates the `client_config` output on this module. | +| [mount\_command](#output\_mount\_command) | Command to mount the file system. `client_config` script must be run first. | +| [mount\_runner](#output\_mount\_runner) | Runner to mount the DDN EXAScaler Lustre file system | | [network\_storage](#output\_network\_storage) | Describes a EXAScaler system to be mounted by other systems. | | [private\_addresses](#output\_private\_addresses) | Private IP addresses for all instances. | | [ssh\_console](#output\_ssh\_console) | Instructions to ssh into the instances. | diff --git a/community/modules/file-system/DDN-EXAScaler/outputs.tf b/community/modules/file-system/DDN-EXAScaler/outputs.tf index 7a2da4c7bb..1343799a34 100644 --- a/community/modules/file-system/DDN-EXAScaler/outputs.tf +++ b/community/modules/file-system/DDN-EXAScaler/outputs.tf @@ -24,9 +24,39 @@ output "ssh_console" { value = module.ddn_exascaler.ssh_console } +output "client_config" { + description = "Script that will install DDN EXAScaler lustre client. The machine running this script must be on the same network & subnet as the EXAScaler." + value = module.ddn_exascaler.client_config +} + +output "install_ddn_lustre_client_runner" { + description = "Runner that encapsulates the `client_config` output on this module." + value = { + "type" = "shell" + "content" = module.ddn_exascaler.client_config + "destination" = "install_ddn_lustre_client.sh" + } +} + +locals { + split_mount_cmd = split(" ", module.ddn_exascaler.mount_command) + split_mount_cmd_wo_mountpoint = slice(local.split_mount_cmd, 0, length(local.split_mount_cmd) - 1) + mount_cmd = "${join(" ", local.split_mount_cmd_wo_mountpoint)} ${var.local_mount}" + mount_cmd_w_mkdir = "mkdir -p ${var.local_mount} && ${local.mount_cmd}" +} + output "mount_command" { - description = "Command to mount the file system." 
-  value       = module.ddn_exascaler.mount_command
+  description = "Command to mount the file system. `client_config` script must be run first."
+  value       = local.mount_cmd_w_mkdir
+}
+
+output "mount_runner" {
+  description = "Runner to mount the DDN EXAScaler Lustre file system"
+  value = {
+    "type"        = "shell"
+    "content"     = local.mount_cmd_w_mkdir
+    "destination" = "mount-ddn-lustre.sh"
+  }
 }

 output "http_console" {
@@ -34,6 +64,7 @@ output "http_console" {
   value = module.ddn_exascaler.http_console
 }

+
 output "network_storage" {
   description = "Describes a EXAScaler system to be mounted by other systems."
   value = {
diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml
index f3d0bfb097..2fa0df824e 100644
--- a/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml
+++ b/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml
@@ -41,15 +41,18 @@ deployment_groups:
     settings:
       local_mount: /home

-  # Explicitly picking the local version of the module
   - id: scratchfs
     source: community/modules/file-system/DDN-EXAScaler
-    kind: terraform
+    use: [network1]
     settings:
       local_mount: /scratch
-      network_self_link: $(network1.network_self_link)
-      subnetwork_self_link: $(network1.subnetwork_self_link)
-      subnetwork_address: $(network1.subnetwork_address)
+
+  - id: mount-exascaler
+    source: modules/scripts/startup-script
+    settings:
+      runners:
+      - $(scratchfs.install_ddn_lustre_client_runner)
+      - $(scratchfs.mount_runner)

   # Create a separate workstation to catch regressions in vm-instance
   - id: workstation
@@ -58,11 +61,16 @@ deployment_groups:
     use:
     - network1
     - homefs
-    - scratchfs
+    - mount-exascaler
     settings:
       name_prefix: test-workstation
       machine_type: c2-standard-4

+  - id: wait0
+    source: ./community/modules/scripts/wait-for-startup
+    settings:
+      instance_name: ((module.workstation.name[0]))
+
   - id: compute_partition
     source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition
     kind: terraform
From 6140385b3636bc7454cf17e4b7440f8bee2b11b4 Mon Sep 17 00:00:00 2001
From: Alex Heye
Date: Thu, 15 Sep 2022 23:15:47 +0000
Subject: [PATCH 20/51] Make install path sed command more readable

Splits up the install path sed command into more readable chunks and adds a
comment explaining the rationale and what the command is doing.
---
 .../scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf
index dee760d916..d27eec58b3 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf
@@ -19,9 +19,19 @@ locals {
     filename = "ghpc_startup.sh"
     content  = var.compute_startup_script
   }]
+
+  # Install Directory Variables
+  # In order to allow the hybrid module to run in a different environment than
+  # the controller, certain paths need to be updated to match the anticipated
+  # install directory on the controller. This is done with a sed command that
+  # finds all matching variables with names ending in Program (SuspendProgram,
+  # etc) or logSlurmctld (EpilogSlurmctld, etc) and replaces the path before
+  # suspend.py or resume.py with the user provided install_dir.
   install_dir          = var.install_dir != null ? 
var.install_dir : abspath(var.output_dir) install_dir_pattern = replace(local.install_dir, ".", "\\.") - install_path_cmd = "sed -i -E 's|(Program\\|logSlurmctld)=/.*/(resume\\|suspend).py|\\1=${local.install_dir_pattern}/\\2\\.py|g' cloud.conf" + match_pattern = "(Program\\|logSlurmctld)=/.*/(resume\\|suspend).py" + replace_pattern = "\\1=${local.install_dir_pattern}/\\2\\.py" + install_path_cmd = "sed -i -E 's|${local.match_pattern}|${local.replace_pattern}|g' cloud.conf" # Since deployment name may be used to create a cluster name, we remove any invalid character from the beginning # Also, slurm imposed a lot of restrictions to this name, so we format it to an acceptable string @@ -56,6 +66,8 @@ module "slurm_controller_instance" { disable_default_mounts = var.disable_default_mounts } +# Null resource that injects the installation path before the resume/suspend +# scripts in the hybrid configuration files. resource "null_resource" "set_prefix_cloud_conf" { depends_on = [ module.slurm_controller_instance From 146ae5bcaa226183c00b52c7c7ce4946c6031c6c Mon Sep 17 00:00:00 2001 From: Karim Roukoz Date: Fri, 16 Sep 2022 03:02:35 +0000 Subject: [PATCH 21/51] remove "kind:" from examples and docs where optional --- tools/validate_configs/test_configs/packer.yaml | 1 + .../test_configs/threads_per_core.yaml | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/tools/validate_configs/test_configs/packer.yaml b/tools/validate_configs/test_configs/packer.yaml index 2b13cdad3d..24af11c96d 100644 --- a/tools/validate_configs/test_configs/packer.yaml +++ b/tools/validate_configs/test_configs/packer.yaml @@ -29,6 +29,7 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc + kind: terraform - group: packer modules: - id: my-custom-image diff --git a/tools/validate_configs/test_configs/threads_per_core.yaml b/tools/validate_configs/test_configs/threads_per_core.yaml index a06b86feac..de06cab879 100644 --- a/tools/validate_configs/test_configs/threads_per_core.yaml +++ b/tools/validate_configs/test_configs/threads_per_core.yaml @@ -30,9 +30,11 @@ deployment_groups: # Example - ./modules/network/vpc - id: network1 source: modules/network/pre-existing-vpc + kind: terraform - id: n1-2-threads source: ./modules/compute/vm-instance + kind: terraform use: - network1 settings: @@ -42,6 +44,7 @@ deployment_groups: - id: n1-1-thread source: ./modules/compute/vm-instance + kind: terraform use: - network1 settings: @@ -51,6 +54,7 @@ deployment_groups: - id: n1-0-threads source: ./modules/compute/vm-instance + kind: terraform use: - network1 settings: @@ -60,6 +64,7 @@ deployment_groups: - id: n1-null-threads source: ./modules/compute/vm-instance + kind: terraform use: - network1 settings: @@ -69,6 +74,7 @@ deployment_groups: - id: n2-2-threads source: ./modules/compute/vm-instance + kind: terraform use: - network1 settings: @@ -78,6 +84,7 @@ deployment_groups: - id: n2-1-thread source: ./modules/compute/vm-instance + kind: terraform use: - network1 settings: @@ -87,6 +94,7 @@ deployment_groups: - id: c2-2-threads source: ./modules/compute/vm-instance + kind: terraform use: - network1 settings: @@ -96,6 +104,7 @@ deployment_groups: - id: c2-1-thread source: ./modules/compute/vm-instance + kind: terraform use: - network1 settings: @@ -105,6 +114,7 @@ deployment_groups: - id: e2-medium-0-thread source: ./modules/compute/vm-instance + kind: terraform use: - network1 settings: @@ -114,6 +124,7 @@ deployment_groups: - id: e2-medium-null-thread source: ./modules/compute/vm-instance + kind: 
terraform use: - network1 settings: From 94a9a3a16124a30c10ff3c3b5c73b57bea630941 Mon Sep 17 00:00:00 2001 From: Karim Roukoz Date: Fri, 16 Sep 2022 03:11:26 +0000 Subject: [PATCH 22/51] remove "kind:" from examples and docs where optional --- .../test_configs/hpc-cluster-hybrid-v5.yaml | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml b/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml index 205b7ee063..d7b1bed830 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml @@ -21,21 +21,20 @@ vars: deployment_name: hybrid-controller region: us-central1 zone: us-central1-c - slurm_cluster_name: hybrid on_prem_controller_host_name: static-controller.c.PROJECT_NAME.internal ## .c..internal on_prem_network_storage_ip: storage-ip-placeholder ## internal ip address for nfs to be mounted deployment_groups: - group: primary modules: - - source: modules/network/pre-existing-vpc - id: network1 + - id: network1 + source: modules/network/pre-existing-vpc settings: network_name: cloud-vpc-network subnetwork_name: primary-subnet - - source: modules/file-system/pre-existing-network-storage - id: pre-existing-storage + - id: pre-existing-storage + source: modules/file-system/pre-existing-network-storage outputs: - network_storage settings: @@ -44,20 +43,28 @@ deployment_groups: local_mount: /home fs_type: nfs - - source: ./community/modules/compute/schedmd-slurm-gcp-v5-partition - id: compute-partition + - id: debug-partition + source: ./community/modules/compute/schedmd-slurm-gcp-v5-partition use: [network1] settings: - partition_name: cloud + partition_name: debug node_count_dynamic_max: 10 exclusive: false machine_type: n2-standard-2 partition_conf: Default: NO - - source: ./community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid - id: slurm-controller - use: [compute-partition, pre-existing-storage] + - id: compute-partition + source: community/modules/compute/schedmd-slurm-gcp-v5-partition + use: + - network1 + settings: + partition_name: compute + node_count_dynamic_max: 20 + + - id: slurm-controller + source: ./community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid + use: [debug-partition, compute-partition, pre-existing-storage] settings: output_dir: ./hybrid slurm_bin_dir: /usr/local/bin From 36da9137802c58a49294182457d149b779b1bb9b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 15 Sep 2022 22:37:57 +0000 Subject: [PATCH 23/51] Bump cloud.google.com/go/compute from 1.9.0 to 1.10.0 Bumps [cloud.google.com/go/compute](https://github.com/googleapis/google-cloud-go) from 1.9.0 to 1.10.0. - [Release notes](https://github.com/googleapis/google-cloud-go/releases) - [Changelog](https://github.com/googleapis/google-cloud-go/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-cloud-go/compare/pubsub/v1.9.0...pubsub/v1.10.0) --- updated-dependencies: - dependency-name: cloud.google.com/go/compute dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- go.mod | 6 ++---- go.sum | 28 +++++++++++++++++----------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/go.mod b/go.mod index b2ab7ea13c..32ccc52adc 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module hpc-toolkit go 1.16 require ( - cloud.google.com/go/compute v1.9.0 + cloud.google.com/go/compute v1.10.0 github.com/hashicorp/go-getter v1.6.2 github.com/hashicorp/hcl v1.0.0 // indirect github.com/hashicorp/hcl/v2 v2.14.0 @@ -16,9 +16,7 @@ require ( github.com/spf13/afero v1.9.2 github.com/spf13/cobra v1.5.0 github.com/zclconf/go-cty v1.10.0 - golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect - golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f // indirect - google.golang.org/genproto v0.0.0-20220804142021-4e6b2dfa6612 + google.golang.org/genproto v0.0.0-20220915135415-7fd63a7952de gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f gopkg.in/yaml.v3 v3.0.1 ) diff --git a/go.sum b/go.sum index dc72cc4331..27061ad971 100644 --- a/go.sum +++ b/go.sum @@ -32,6 +32,7 @@ cloud.google.com/go v0.100.2/go.mod h1:4Xra9TjzAeYHrl5+oeLlzbM2k3mjVhZh4UqTZ//w9 cloud.google.com/go v0.102.0/go.mod h1:oWcCzKlqJ5zgHQt9YsaeTY9KzIvjyy0ArmiBUgpQ+nc= cloud.google.com/go v0.102.1 h1:vpK6iQWv/2uUeFJth4/cBHsQAGjn1iIE6AAlxipRaA0= cloud.google.com/go v0.102.1/go.mod h1:XZ77E9qnTEnrgEOvr4xzfdX5TRo7fB4T2F4O6+34hIU= +cloud.google.com/go/asset v1.5.0/go.mod h1:5mfs8UvcM5wHhqtSv8J1CtxxaQq3AdBxxQi2jGW/K4o= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= @@ -44,8 +45,8 @@ cloud.google.com/go/compute v1.5.0/go.mod h1:9SMHyhJlzhlkJqrPAc839t2BZFTSk6Jdj6m cloud.google.com/go/compute v1.6.0/go.mod h1:T29tfhtVbq1wvAPo0E3+7vhgmkOYeXjhFvz/FMzPu0s= cloud.google.com/go/compute v1.6.1/go.mod h1:g85FgpzFvNULZ+S8AYq87axRKuf2Kh7deLqV/jJ3thU= cloud.google.com/go/compute v1.7.0/go.mod h1:435lt8av5oL9P3fv1OEzSbSUe+ybHXGMPQHHZWZxy9U= -cloud.google.com/go/compute v1.9.0 h1:ED/FP4xv8GJw63v556/ASNc1CeeLUO2Bs8nzaHchkHg= -cloud.google.com/go/compute v1.9.0/go.mod h1:lWv1h/zUWTm/LozzfTJhBSkd6ShQq8la8VeeuOEGxfY= +cloud.google.com/go/compute v1.10.0 h1:aoLIYaA1fX3ywihqpBk2APQKOo20nXsp1GEZQbx5Jk4= +cloud.google.com/go/compute v1.10.0/go.mod h1:ER5CLbMxl90o2jtNbGSbtfOpQKR0t15FOtRsugnLrlU= cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= cloud.google.com/go/iam v0.3.0 h1:exkAomrVUuzx9kWFI1wm3KI0uoDeUFPB4kKGzx6x+Gc= @@ -54,6 +55,7 @@ cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2k cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= +cloud.google.com/go/security v1.5.0/go.mod h1:lgxGdyOKKjHL4YG3/YwIL2zLqMFCKs0UbQwgyZmfJl4= cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= @@ -410,8 +412,8 @@ golang.org/x/net v0.0.0-20220412020605-290c469a71a5/go.mod h1:CfG3xpIq0wQ8r1q4Su 
golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220607020251-c690dde0001d/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.0.0-20220624214902-1bab6f366d9e/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.0.0-20220909164309-bea034e7d591 h1:D0B/7al0LLrVC8aWF4+oxpv/m8bc7ViFfVS8/gXGdqI= +golang.org/x/net v0.0.0-20220909164309-bea034e7d591/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -432,8 +434,10 @@ golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b/go.mod h1:DAh4E804XQdzx2j golang.org/x/oauth2 v0.0.0-20220309155454-6242fa91716a/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= golang.org/x/oauth2 v0.0.0-20220411215720-9780585627b5/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= golang.org/x/oauth2 v0.0.0-20220608161450-d0670ef3b1eb/go.mod h1:jaDAt6Dkxork7LmZnYtzbRWj0W47D86a3TGe0YHBvmE= -golang.org/x/oauth2 v0.0.0-20220622183110-fd043fe589d2 h1:+jnHzr9VPj32ykQVai5DNahi9+NSp7yYuCsl5eAQtL0= golang.org/x/oauth2 v0.0.0-20220622183110-fd043fe589d2/go.mod h1:jaDAt6Dkxork7LmZnYtzbRWj0W47D86a3TGe0YHBvmE= +golang.org/x/oauth2 v0.0.0-20220822191816-0ebed06d0094/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= +golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1 h1:lxqLZaMad/dJHMFZH0NiNpiEZI/nhgWhe4wgzpE+MuA= +golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -509,8 +513,8 @@ golang.org/x/sys v0.0.0-20220517195934-5e4e11fc645e/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220610221304-9f5ed59c137d/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220624220833-87e55d714810/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f h1:v4INt8xihDGvnrfjMDVXGxw9wrfxYyCjk0KbXjhR55s= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10 h1:WIoqL4EROvwiPdUtaip4VcDdpZ4kha7wBWZrbVKCIZg= +golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -625,8 +629,9 @@ 
google.golang.org/api v0.75.0/go.mod h1:pU9QmyHLnzlpar1Mjt4IbapUCy8J+6HD6GeELN69 google.golang.org/api v0.78.0/go.mod h1:1Sg78yoMLOhlQTeF+ARBoytAcH1NNyyl390YMy6rKmw= google.golang.org/api v0.80.0/go.mod h1:xY3nI94gbvBrE0J6NHXhxOmW97HG7Khjkku6AFB3Hyg= google.golang.org/api v0.84.0/go.mod h1:NTsGnUFJMYROtiquksZHBWtHfeMC7iYthki7Eq3pa8o= -google.golang.org/api v0.91.0 h1:731+JzuwaJoZXRQGmPoBiV+SrsAfUaIkdMCWTcQNPyA= -google.golang.org/api v0.91.0/go.mod h1:+Sem1dnrKlrXMR/X0bPnMWyluQe4RsNoYfmNLhOIkzw= +google.golang.org/api v0.93.0/go.mod h1:+Sem1dnrKlrXMR/X0bPnMWyluQe4RsNoYfmNLhOIkzw= +google.golang.org/api v0.96.0 h1:F60cuQPJq7K7FzsxMYHAUJSiXh2oKctHxBMbDygxhfM= +google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -717,8 +722,9 @@ google.golang.org/genproto v0.0.0-20220608133413-ed9918b62aac/go.mod h1:KEWEmljW google.golang.org/genproto v0.0.0-20220616135557-88e70c0c3a90/go.mod h1:KEWEmljWE5zPzLBa/oHl6DaEt9LmfH6WtH1OHIvleBA= google.golang.org/genproto v0.0.0-20220617124728-180714bec0ad/go.mod h1:KEWEmljWE5zPzLBa/oHl6DaEt9LmfH6WtH1OHIvleBA= google.golang.org/genproto v0.0.0-20220624142145-8cd45d7dbd1f/go.mod h1:KEWEmljWE5zPzLBa/oHl6DaEt9LmfH6WtH1OHIvleBA= -google.golang.org/genproto v0.0.0-20220804142021-4e6b2dfa6612 h1:NX3L5YesD5qgxxrPHdKqHH38Ao0AG6poRXG+JljPsGU= -google.golang.org/genproto v0.0.0-20220804142021-4e6b2dfa6612/go.mod h1:iHe1svFLAZg9VWz891+QbRMwUv9O/1Ww+/mngYeThbc= +google.golang.org/genproto v0.0.0-20220815135757-37a418bb8959/go.mod h1:dbqgFATTzChvnt+ujMdZwITVAJHFtfyN1qUhDqEiIlk= +google.golang.org/genproto v0.0.0-20220915135415-7fd63a7952de h1:5ANeKFmGdtiputJJYeUVg8nTGA/1bEirx4CgzcnPSx8= +google.golang.org/genproto v0.0.0-20220915135415-7fd63a7952de/go.mod h1:0Nb8Qy+Sk5eDzHnzlStwW3itdNaWoZA5XeSG+R3JHSo= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= From 93df5a3fdaa98cfb46c87be086e6c3cdb82dbb3e Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 15 Sep 2022 22:59:39 -0500 Subject: [PATCH 24/51] Upgrade Cloud Storage Go module --- go.mod | 1 + go.sum | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index 32ccc52adc..9dcee3402c 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.16 require ( cloud.google.com/go/compute v1.10.0 + cloud.google.com/go/storage v1.26.0 // indirect github.com/hashicorp/go-getter v1.6.2 github.com/hashicorp/hcl v1.0.0 // indirect github.com/hashicorp/hcl/v2 v2.14.0 diff --git a/go.sum b/go.sum index 27061ad971..f485fbc57a 100644 --- a/go.sum +++ b/go.sum @@ -62,8 +62,9 @@ cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohl cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs= cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0= cloud.google.com/go/storage v1.14.0/go.mod h1:GrKmX003DSIwi9o29oFT7YDnHYwZoctc3fOKtUw0Xmo= -cloud.google.com/go/storage v1.22.1 h1:F6IlQJZrZM++apn9V5/VfS3gbTUYg98PS3EMQAzqtfg= cloud.google.com/go/storage v1.22.1/go.mod h1:S8N1cAStu7BOeFfE8KAQzmyyLkK8p/vmRq6kuBTW58Y= 
+cloud.google.com/go/storage v1.26.0 h1:lYAGjknyDJirSzfwUlkv4Nsnj7od7foxQNH/fqZqles= +cloud.google.com/go/storage v1.26.0/go.mod h1:mk/N7YwIKEWyTvXAWQCIeiCTdLoRH6Pd5xmSnolQLTI= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= @@ -207,7 +208,6 @@ github.com/googleapis/gax-go/v2 v2.2.0/go.mod h1:as02EH8zWkzwUoLbBaFeQ+arQaj/Oth github.com/googleapis/gax-go/v2 v2.3.0/go.mod h1:b8LNqSzNabLiUpXKkY7HAR5jr6bIT99EXz9pXxye9YM= github.com/googleapis/gax-go/v2 v2.4.0 h1:dS9eYAjhrE2RjmzYw2XAPvcXfmcQLtFEQWn0CR82awk= github.com/googleapis/gax-go/v2 v2.4.0/go.mod h1:XOTVJ59hdnfJLIP/dh8n5CGryZR2LxK9wbMD5+iXC6c= -github.com/googleapis/go-type-adapters v1.0.0 h1:9XdMn+d/G57qq1s8dNc5IesGCXHf6V2HZ2JwRxfA2tA= github.com/googleapis/go-type-adapters v1.0.0/go.mod h1:zHW75FOG2aur7gAO2B+MLby+cLsWGBF62rFAi7WjWO4= github.com/googleapis/google-cloud-go-testing v0.0.0-20200911160855-bcd43fbb19e8/go.mod h1:dvDLG8qkwmyD9a/MJJN3XJcT3xFxOKAvTZGvuZmac9g= github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= @@ -630,6 +630,7 @@ google.golang.org/api v0.78.0/go.mod h1:1Sg78yoMLOhlQTeF+ARBoytAcH1NNyyl390YMy6r google.golang.org/api v0.80.0/go.mod h1:xY3nI94gbvBrE0J6NHXhxOmW97HG7Khjkku6AFB3Hyg= google.golang.org/api v0.84.0/go.mod h1:NTsGnUFJMYROtiquksZHBWtHfeMC7iYthki7Eq3pa8o= google.golang.org/api v0.93.0/go.mod h1:+Sem1dnrKlrXMR/X0bPnMWyluQe4RsNoYfmNLhOIkzw= +google.golang.org/api v0.94.0/go.mod h1:eADj+UBuxkh5zlrSntJghuNeg8HwQ1w5lTKkuqaETEI= google.golang.org/api v0.96.0 h1:F60cuQPJq7K7FzsxMYHAUJSiXh2oKctHxBMbDygxhfM= google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= @@ -722,6 +723,7 @@ google.golang.org/genproto v0.0.0-20220608133413-ed9918b62aac/go.mod h1:KEWEmljW google.golang.org/genproto v0.0.0-20220616135557-88e70c0c3a90/go.mod h1:KEWEmljWE5zPzLBa/oHl6DaEt9LmfH6WtH1OHIvleBA= google.golang.org/genproto v0.0.0-20220617124728-180714bec0ad/go.mod h1:KEWEmljWE5zPzLBa/oHl6DaEt9LmfH6WtH1OHIvleBA= google.golang.org/genproto v0.0.0-20220624142145-8cd45d7dbd1f/go.mod h1:KEWEmljWE5zPzLBa/oHl6DaEt9LmfH6WtH1OHIvleBA= +google.golang.org/genproto v0.0.0-20220810155839-1856144b1d9c/go.mod h1:dbqgFATTzChvnt+ujMdZwITVAJHFtfyN1qUhDqEiIlk= google.golang.org/genproto v0.0.0-20220815135757-37a418bb8959/go.mod h1:dbqgFATTzChvnt+ujMdZwITVAJHFtfyN1qUhDqEiIlk= google.golang.org/genproto v0.0.0-20220915135415-7fd63a7952de h1:5ANeKFmGdtiputJJYeUVg8nTGA/1bEirx4CgzcnPSx8= google.golang.org/genproto v0.0.0-20220915135415-7fd63a7952de/go.mod h1:0Nb8Qy+Sk5eDzHnzlStwW3itdNaWoZA5XeSG+R3JHSo= From d5ceb3a65860fee0dd2578941a8a30848ebdf3e4 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Fri, 16 Sep 2022 08:21:33 -0700 Subject: [PATCH 25/51] Warn users about deprecated 'name' argument for EXAScaler image --- .../modules/file-system/DDN-EXAScaler/README.md | 2 +- .../file-system/DDN-EXAScaler/variables.tf | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/community/modules/file-system/DDN-EXAScaler/README.md b/community/modules/file-system/DDN-EXAScaler/README.md index 1e0ad80581..13f18d1472 100644 --- a/community/modules/file-system/DDN-EXAScaler/README.md +++ 
b/community/modules/file-system/DDN-EXAScaler/README.md @@ -75,7 +75,7 @@ No resources. | [cls](#input\_cls) | Compute client properties |
object({
node_type = string
node_cpu = string
nic_type = string
node_count = number
public_ip = bool
})
|
{
"nic_type": "GVNIC",
"node_count": 0,
"node_cpu": "Intel Cascade Lake",
"node_type": "n2-standard-2",
"public_ip": true
}
| no | | [clt](#input\_clt) | Compute client target properties |
object({
disk_bus = string
disk_type = string
disk_size = number
disk_count = number
})
|
{
"disk_bus": "SCSI",
"disk_count": 0,
"disk_size": 256,
"disk_type": "pd-standard"
}
| no | | [fsname](#input\_fsname) | EXAScaler filesystem name, only alphanumeric characters are allowed, and the value must be 1-8 characters long | `string` | `"exacloud"` | no | -| [image](#input\_image) | Source image properties |
object({
project = string
family = string
})
|
{
"family": "exascaler-cloud-6-1-centos",
"project": "ddn-public"
}
| no | +| [image](#input\_image) | Source image properties | `any` |
{
"family": "exascaler-cloud-6-1-centos",
"project": "ddn-public"
}
| no |
| [labels](#input\_labels) | Labels to add to EXAScaler Cloud deployment. List of key, value pairs. | `any` | `{}` | no |
| [local\_mount](#input\_local\_mount) | Mountpoint (at the client instances) for this EXAScaler system | `string` | `"/shared"` | no |
| [mds](#input\_mds) | Metadata server properties |<br>
object({
node_type = string
node_cpu = string
nic_type = string
node_count = number
public_ip = bool
})
|
{
"nic_type": "GVNIC",
"node_count": 1,
"node_cpu": "Intel Cascade Lake",
"node_type": "n2-standard-32",
"public_ip": true
}
| no |
diff --git a/community/modules/file-system/DDN-EXAScaler/variables.tf b/community/modules/file-system/DDN-EXAScaler/variables.tf
index 902ca23e81..c9a12771e1 100644
--- a/community/modules/file-system/DDN-EXAScaler/variables.tf
+++ b/community/modules/file-system/DDN-EXAScaler/variables.tf
@@ -201,16 +201,23 @@ variable "boot" {
 # Source image properties
 # project: project name
 # family: image family name
+# name: !!DEPRECATED!! - image name
 variable "image" {
   description = "Source image properties"
-  type = object({
-    project = string
-    family = string
-  })
+  type        = any
+  # Omitting type checking so validation can provide a more useful error message
+  # type = object({
+  #   project = string
+  #   family = string
+  # })
   default = {
     project = "ddn-public"
     family  = "exascaler-cloud-6-1-centos"
   }
+  validation {
+    condition     = lookup(var.image, "name", null) == null && lookup(var.image, "project", null) != null && lookup(var.image, "family", null) != null
+    error_message = "Use image.family & image.project to specify the image. Field image.name is deprecated. See EXAScaler documentation for input options:(https://github.com/DDNStorage/exascaler-cloud-terraform/tree/master/gcp#boot-image-options)."
+  }
 }

 # Management server properties
From c44b7a46f27e5558b8bb1fde8941d37c85fb90ca Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Fri, 16 Sep 2022 08:30:54 -0700
Subject: [PATCH 26/51] Rename EXAScaler output to clarify it is a script

---
 community/modules/file-system/DDN-EXAScaler/README.md  | 6 +++---
 community/modules/file-system/DDN-EXAScaler/outputs.tf | 7 +++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/community/modules/file-system/DDN-EXAScaler/README.md b/community/modules/file-system/DDN-EXAScaler/README.md
index bad224e570..5c306508ac 100644
--- a/community/modules/file-system/DDN-EXAScaler/README.md
+++ b/community/modules/file-system/DDN-EXAScaler/README.md
@@ -139,10 +139,10 @@ No resources.

 | Name | Description |
 |------|-------------|
-| [client\_config](#output\_client\_config) | Script that will install DDN EXAScaler lustre client. The machine running this script must be on the same network & subnet as the EXAScaler. |
+| [client\_config\_script](#output\_client\_config\_script) | Script that will install DDN EXAScaler lustre client. The machine running this script must be on the same network & subnet as the EXAScaler. |
 | [http\_console](#output\_http\_console) | HTTP address to access the system web console. |
-| [install\_ddn\_lustre\_client\_runner](#output\_install\_ddn\_lustre\_client\_runner) | Runner that encapsulates the `client_config` output on this module. |
-| [mount\_command](#output\_mount\_command) | Command to mount the file system. `client_config` script must be run first. |
+| [install\_ddn\_lustre\_client\_runner](#output\_install\_ddn\_lustre\_client\_runner) | Runner that encapsulates the `client_config_script` output on this module. |
+| [mount\_command](#output\_mount\_command) | Command to mount the file system. `client_config_script` must be run first. |
 | [mount\_runner](#output\_mount\_runner) | Runner to mount the DDN EXAScaler Lustre file system |
 | [network\_storage](#output\_network\_storage) | Describes a EXAScaler system to be mounted by other systems. |
 | [private\_addresses](#output\_private\_addresses) | Private IP addresses for all instances. 
| diff --git a/community/modules/file-system/DDN-EXAScaler/outputs.tf b/community/modules/file-system/DDN-EXAScaler/outputs.tf index 1343799a34..4713a921a5 100644 --- a/community/modules/file-system/DDN-EXAScaler/outputs.tf +++ b/community/modules/file-system/DDN-EXAScaler/outputs.tf @@ -24,13 +24,13 @@ output "ssh_console" { value = module.ddn_exascaler.ssh_console } -output "client_config" { +output "client_config_script" { description = "Script that will install DDN EXAScaler lustre client. The machine running this script must be on the same network & subnet as the EXAScaler." value = module.ddn_exascaler.client_config } output "install_ddn_lustre_client_runner" { - description = "Runner that encapsulates the `client_config` output on this module." + description = "Runner that encapsulates the `client_config_script` output on this module." value = { "type" = "shell" "content" = module.ddn_exascaler.client_config @@ -46,7 +46,7 @@ locals { } output "mount_command" { - description = "Command to mount the file system. `client_config` script must be run first." + description = "Command to mount the file system. `client_config_script` must be run first." value = local.mount_cmd_w_mkdir } @@ -64,7 +64,6 @@ output "http_console" { value = module.ddn_exascaler.http_console } - output "network_storage" { description = "Describes a EXAScaler system to be mounted by other systems." value = { From 31a2c6637bcc02ce6f4ab7853b9015b5fe1bbdb0 Mon Sep 17 00:00:00 2001 From: Alex Heye Date: Tue, 13 Sep 2022 17:44:29 +0000 Subject: [PATCH 27/51] Add all gcp hybrid slurm demo instructions --- .../blueprints/create-networks.yaml | 49 ++ .../blueprints/hybrid-configuration.yaml | 37 +- .../blueprints/static-cluster.yaml | 65 +++ .../inter-gcp-project-hybrid-slurm.md | 536 ++++++++++++++++++ 4 files changed, 667 insertions(+), 20 deletions(-) create mode 100644 docs/hybrid-slurm-cluster/blueprints/create-networks.yaml rename tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml => docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml (58%) create mode 100644 docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml create mode 100644 docs/hybrid-slurm-cluster/inter-gcp-project-hybrid-slurm.md diff --git a/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml b/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml new file mode 100644 index 0000000000..db59df5cfd --- /dev/null +++ b/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml @@ -0,0 +1,49 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: peering-networks + +vars: + project_id: # The project ID for the static cluster + project_id_compute: # The project ID for the burst compute VMs + deployment_name: peering-networks-demo + region: us-central1 + zone: us-central1-c + +deployment_groups: +- group: primary + modules: + # Source is an embedded module, denoted by "modules/*" without ./, ../, / + # as a prefix. 
To refer to a local or community module, prefix with ./, ../ or / + # Example - ./modules/network/vpc + - source: modules/network/vpc + kind: terraform + id: network0 + settings: + network_name: static-cluster-network + subnetwork_name: primary-subnet + network_address_range: 10.0.0.0/16 + subnetwork_size: 0 + + - source: modules/network/vpc + kind: terraform + id: network1 + settings: + network_name: compute-vpc-network + subnetwork_name: primary-subnet + project_id: $(vars.project_id_compute) + network_address_range: 10.1.0.0/16 + subnetwork_size: 0 diff --git a/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml similarity index 58% rename from tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml rename to docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml index d7b1bed830..b7b477db30 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-hybrid-v5.yaml +++ b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml @@ -1,4 +1,4 @@ -# Copyright 2021 Google LLC +# Copyright 2022 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,12 +17,11 @@ blueprint_name: hpc-cluster-hybrid-v5 vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hybrid-controller + project_id: ## <> + deployment_name: hybrid-config region: us-central1 zone: us-central1-c - on_prem_controller_host_name: static-controller.c.PROJECT_NAME.internal ## .c..internal - on_prem_network_storage_ip: storage-ip-placeholder ## internal ip address for nfs to be mounted + static_controller_hostname: ## <>.c.<>.internal deployment_groups: - group: primary @@ -30,29 +29,24 @@ deployment_groups: - id: network1 source: modules/network/pre-existing-vpc settings: - network_name: cloud-vpc-network + network_name: compute-vpc-network subnetwork_name: primary-subnet - - id: pre-existing-storage - source: modules/file-system/pre-existing-network-storage - outputs: - - network_storage + - id: scratchfs + source: modules/file-system/filestore + use: [network1] settings: - server_ip: $(vars.on_prem_network_storage_ip) - remote_mount: /exports/home - local_mount: /home - fs_type: nfs + local_mount: /scratch - id: debug-partition - source: ./community/modules/compute/schedmd-slurm-gcp-v5-partition + source: community/modules/compute/schedmd-slurm-gcp-v5-partition use: [network1] settings: partition_name: debug node_count_dynamic_max: 10 exclusive: false machine_type: n2-standard-2 - partition_conf: - Default: NO + is_default: false - id: compute-partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition @@ -63,10 +57,13 @@ deployment_groups: node_count_dynamic_max: 20 - id: slurm-controller - source: ./community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid - use: [debug-partition, compute-partition, pre-existing-storage] + source: community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid + use: + - debug-partition + - compute-partition + - scratchfs settings: output_dir: ./hybrid slurm_bin_dir: /usr/local/bin - slurm_control_host: $(vars.on_prem_controller_host_name) + slurm_control_host: $(vars.static_controller_hostname) install_dir: /etc/slurm/hybrid diff --git a/docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml b/docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml new file mode 100644 index 0000000000..9cc3f35a73 --- /dev/null +++ b/docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml @@ 
-0,0 +1,65 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: static-slurm-cluster + +vars: + project_id: ## <> + deployment_name: static-slurm-cluster + region: us-central1 + zone: us-central1-c + +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/pre-existing-vpc + settings: + network_name: static-cluster-network + subnetwork_name: primary-subnet + + - id: scratchfs + source: modules/file-system/filestore + use: [network1] + settings: + local_mount: /scratch + + - id: static_partition + source: community/modules/compute/schedmd-slurm-gcp-v5-partition + use: + - network1 + settings: + partition_name: static + node_count_static: 4 + node_count_dynamic_max: 0 + enable_placement: false + machine_type: n2-standard-2 + is_default: true + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller + use: + - network1 + - static_partition + - scratchfs + + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v5-login + use: + - network1 + - slurm_controller + settings: + machine_type: n2-standard-4 diff --git a/docs/hybrid-slurm-cluster/inter-gcp-project-hybrid-slurm.md b/docs/hybrid-slurm-cluster/inter-gcp-project-hybrid-slurm.md new file mode 100644 index 0000000000..858404b61b --- /dev/null +++ b/docs/hybrid-slurm-cluster/inter-gcp-project-hybrid-slurm.md @@ -0,0 +1,536 @@ +# Hybrid Slurm Cluster Demonstration With GCP Static Cluster + +## Description +These instructions step through the setup and execution of a demo of the HPC +Toolkit hybrid module. In this process you will: + +* Setup networking and internal DNS peering between 2 GCP projects +* Deploy a [static cluster](#deploy-a-static-cluster) that will simulate an + on-premise cluster using the HPC Toolkit and + [SchedMD's Slurm on GCP][slurm-gcp] terraform modules. +* Create and deploy a hybrid deployment directory using the HPC Toolkit +* Run through a few manual steps of integrating the hybrid configurations + created with the hybrid deployment directory. +* Test the new hybrid controller. + +These instructions are provided for demonstration purposes only. This process +may serve as a first step in evaluating the HPC Toolkit's hybrid slurm module +for use with an on-premise slurm-cluster. + +> **Warning:** The [hybrid module][hybridmodule] is in active development and +> the interface is not guaranteed to be static. As the module matures and +> further testing is done, documentation on applying the hybrid module to +> on-premise slurm clusters will be added and expanded. + +[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/v5.1.0 + +## Definitions + +**_static cluster:_** The static cluster will simulate an on-premise slurm cluster +for the purposes of this all-GCP demo. The static cluster will be deployed with +slurm-gcp and optionally have a set of statically created VMs populating it's +local partition. 
+
+**hybrid deployment:** A deployment using the [schedmd-slurm-gcp-v5-hybrid]
+module. The deployment itself includes the hybrid configuration directory as
+well as metadata in the cloud bursting project.
+
+**hybrid configuration directory:** The directory created locally by the
+[hybrid module][hybridmodule]. This directory contains the required
+configuration files and scripts needed to convert a static cluster to a cloud
+hybrid cluster.
+
+[hybridmodule]: ../../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md
+
+**cloud bursting:** Cloud bursting refers to elastically creating new compute
+VM instances in the cloud that can be used to complete slurm jobs.
+
+**compute node:** In this document, a compute node refers specifically to
+a compute VM instance created by the hybrid configuration.
+
+## More Information
+To learn more about the underlying terraform modules that support this demo, you
+can visit the [slurm-gcp] repo. Specifically, the hybrid documentation can be
+found at [docs/hybrid.md][slurm-gcp-hybrid].
+
+## Blueprints
+
+* [create-networks.yaml] creates VPC networks in 2 projects with IP ranges that
+  are suitable for setting up bidirectional network peering. These networks will
+  be used by subsequent blueprints.
+* [static-cluster.yaml] defines a slurm cluster with 4 static nodes that will be
+  used to simulate an on-premise slurm cluster.
+* [hybrid-configuration.yaml] sets up the hybrid project and creates a hybrid
+  configuration directory with all required configurations and scripts.
+
+[create-networks.yaml]: ./blueprints/create-networks.yaml
+[static-cluster.yaml]: ./blueprints/static-cluster.yaml
+[hybrid-configuration.yaml]: ./blueprints/hybrid-configuration.yaml
+
+## Debugging Suggestions
+
+### Logging
+The logs from VMs created by the hybrid configuration will be populated under
+`/var/log/slurm/*.log`; a selection of pertinent logs is described below:
+
+* `slurmctld.log`: The logging information for the slurm controller daemon. Any
+  issues with the config or permissions will be logged here.
+* `slurmd.log`: The logging information for the slurm daemon on the compute
+  nodes. Any issues with the config or permissions on the compute node can be
+  found here. Note: These logs require SSH'ing to the compute nodes and viewing
+  them directly.
+* `resume.log`: Output from the resume.py script that is used by hybrid
+  partitions to create the burst VM instances. Any issues creating new compute
+  VM nodes will be logged here.
+
+In addition, any startup failures can be tracked through the logs at
+`/var/log/messages` for centos/rhel based images and `/var/log/syslog` for
+debian/ubuntu based images. Instructions for viewing these logs can be found in
+[Google Cloud docs][view-ss-output].
+
+[view-ss-output]: https://cloud.google.com/compute/docs/instances/startup-scripts/linux#viewing-output
+
+### Connectivity Issues
+To verify the network and DNS peering setup was successful, you can create a VM
+in each project attached to the networks created in these instructions. You can
+run ping `.c..internal` to verify the settings are
+correct. This should succeed in both directions.
+
+If the ping test doesn’t work, the DNS may not be configured correctly, or the
+networks may not be able to peer correctly. If it’s the former, you should be
+able to ping the internal IP of the other VM. If you cannot, the firewall rule
+or network peering setting are likely not correct.
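+
+As a concrete sanity check, a sketch follows; the VM names and bracketed
+project IDs are placeholders for test VMs you create yourself, not resources
+created by these blueprints:
+
+```shell
+# On a test VM in project A: resolve and reach the test VM in project B.
+ping -c 3 test-vm-b.c.<project-b-id>.internal
+
+# On the test VM in project B: repeat in the opposite direction.
+ping -c 3 test-vm-a.c.<project-a-id>.internal
+```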
+ +## Instructions + +### Before you begin +* Build ghpc + +#### Select or Create 2 GCP Projects + +This process will require 2 projects: + +* Project A: Where the simulated “On-premise” static slurm cluster will be + deployed. +* Project B: Where the cloud partitions will create new compute VM instances to + complete slurm jobs. + +Identify the 2 projects you intend to use. "Project A" and "Project B" will be +referred to in the rest of this document based on these definitions. + +#### Enable Required APIs + +The following APIs are required to complete this demo: + +* [Compute Engine API][computeapi] +* [Cloud DNS API][clouddnsapi] + +[computeapi]: https://cloud.google.com/compute/docs/reference/rest/v1 +[clouddnsapi]: https://cloud.google.com/dns/docs/reference/v1 + +#### Set IAM Roles +The service account attaches to the slurm controller in Project A +([see above](#select-or-create-2-gcp-projects)) +must have the Editor role in +Project A and Project B. If not specified, this will be the +[default compute engine service account][computesa]. + +[computesa]:https://cloud.google.com/compute/docs/access/service-accounts#default_service_account + +#### Dependencies +This demo has the same baseline dependencies as the HPC Toolkit that are +outlined in the main [README.md](../../README.md#dependencies). + +In addition, some pip packages need to be installed locally. Run the following +command to install the pip packages outlined in +[requirements.txt](./requirements.txt): + +```shell +pip install -r requirements.txt +``` + +#### Build ghpc + +Before you begin, ensure that you have built the `ghpc` tool in the HPC Toolkit. +For more information see the [README.md](../../README.md#quickstart) Quickstart. + +### Create VPC Networks +A blueprint for creating VPC networks in each project that can support network +and DNS peering can be found at [create-networks.yaml]. This +blueprint will do the following: + +* Create a network named `static-cluster-network` in project A. +* Create a subnetwork of `static-cluster-network` named `primary-subnet` with + an internal IP range of 10.0.0.0/16. +* Create a network named `compute-vpc-network` in project B. +* Create a subnetwork of `compute-vpc-network` named `primary-subnet` with an + internal IP range of 10.1.0.0/16 + +Create a deployment directory for the networks using `ghpc`: + +```shell +ghpc create blueprints/create-networks.yaml --vars project_id="<>",project_id_compute="<>" +``` + +If successful, this command will provide 3 terraform operations that can be +performed to deploy the deployment directory. They should look similar to the +following: + +```shell +Terraform group was successfully created in directory peering-networks-demo/primary +To deploy, run the following commands: + terraform -chdir=peering-networks-demo/primary init + terraform -chdir=peering-networks-demo/primary validate + terraform -chdir=peering-networks-demo/primary apply +``` + +Execute the terraform commands to deploy the two networks. + +### Allow Peering Between VPC Networks +Bidirectional VPC and DNS peering is needed between both networks created +in the last step. [VPC peering][netpeering] allows internal IP address +connectivity between the projects. [DNS peering][dnspeering] allows resolution +of the fully qualified hostname of instances in the other project in the current +project. + +These instructions will step you through how to set up both of these peering +connections via the [cloud console][console]. 
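+
+If you prefer the gcloud CLI, the VPC peering portion of the console steps
+below can be approximated as follows. This is an illustrative sketch, not part
+of the instructions: the peering name is arbitrary, the bracketed project IDs
+are placeholders, and the firewall and DNS peering steps must still be
+performed separately. Run it once from each project, swapping the `--network`
+and `--peer-network` values:
+
+```shell
+gcloud compute networks peerings create hybrid-demo-network-peering \
+    --project="<project-a-id>" \
+    --network=static-cluster-network \
+    --peer-project="<project-b-id>" \
+    --peer-network=compute-vpc-network
+```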
+ +[netpeering]: https://cloud.google.com/vpc/docs/vpc-peering +[dnspeering]: https://cloud.google.com/dns/docs/overview +[console]: https://cloud.google.com/cloud-console + +#### Setup VPC Peering +First, set up VPC peering from Project A to Project B: + +* Navigate to the [VPC Network Peering][netpeeringconsole] page in the GCP + console. +* Click on [Create Peering Connection][createpeering]. +* Click "CONTINUE" if prompted to gather additional information (project ID, IP + ranges, etc) +* Provide the following information: + * **_Name:_** The name of the peering connection, for example + "hybrid-demo-network-peering". + * **_Your VPC Network:_** The name of the VPC network in this project created + in the last step, by default "static-cluster-network" for project A and + "compute-vpc-network" for project B. + * **_Peered VPC Network_** Select "In another project" + * **_Project ID:_** The name of the other project. + * **_VPC network name:_** The name of the VPC network in the other project, + "compute-vpc-network" if creating from project A or + "static-cluster-network" if creating from project B. + * All other fields can be left alone. +* Click "CREATE". + +Repeat these same steps in Project B. + +When complete, both [network peering connections][netpeeringconsole] should show +a green check icon and be listed as "Active". + +Next, set up firewall rules in each project that allow data to pass between the +peered networks. Starting in project A, do the following: + +* Navigate to the [VPC Networks][vpcnetworks] page in the GCP console. +* Click on the network created in the prior step, "static-cluster-network" for + project A and "compute-vpc-network" for project B. +* Click on the tab titled "FIREWALLS". +* Click on "ADD FIREWALL RULE". +* Provide the following information: + * **_Name:_** The name of the firewall rule, for example + "allow-peering-connection". + * **_Network:_** The name of the network, this should already be filled in. + * **_Direction of traffic:_** Ingress + * **_Action on match:_** Allow + * **_Targets:_** All instances in the network + * **_Source filter:_** IPv4 ranges + * **_Source IPv4 ranges:_** 10.0.0.0/8 + * **_Protocols and Ports:_** Specified protocols and ports + * TCP: 0-65532 + * UDP: 0-65532 + * Other: icmp +* Click "CREATE" + +Repeat these same steps in Project B. + +[netpeeringconsole]: https://console.cloud.google.com/networking/peering/list +[createpeering]: https://console.cloud.google.com/networking/peering/add +[vpcnetworks]: https://console.cloud.google.com/networking/networks/list + +#### Setup DNS Peering +First, set up private DNS peering from Project A to Project B: + +* Navigate to the [Cloud DNS][dnszones] page in the GCP console. +* Click on "CREATE ZONE". +* Provide the following information: + * **_Zone Type:_** Private + * **_Zone name:_** The name of the DNS zone, for example + "hybrid-demo-dns-zone". + * **_DNS name:_** `c.<>.internal` replacing `<>` + with the project ID of project B. When adding the zone in project B, the + DNS name will be `c.<>.internal`. + * **_Options:_** DNS Peering + * **_Networks:_** The network created in the prior step in this project, + "static-cluster-network" for project A and "compute-vpc-network" for + project B. + * **_Peer Project:_** The project ID of the other project. + * **_Peer Network:_** The network name created in the last step in the peer + project, "compute-vpc-network" if creating from project A or + "static-cluster-network" if creating from project B. 
+* Click "CREATE" + +Repeat these same steps in Project B. + +[dnszones]: https://console.cloud.google.com/net-services/dns/zones + +### Deploy a Static Cluster + +The blueprint defined by [static-cluster.yaml] in the blueprints directory will +create a new slurm cluster with the following: + +* A pointer to the network created in [Create VPC Networks](#create-vpc-networks) + in project A, "static-cluster-network". +* A new filestore instance that will serve as the local scratch network + filesystem. +* One partition with 4 static nodes (compute VMs that are always up) of machine + type n2-standard-2. This will be the default partition. +* A Slurm controller and login node. + +First, use the HPC Toolkit to create the deployment directory, replacing +"<>" with the ID of your project A: + +```shell +ghpc create blueprints/static-cluster.yaml --vars project_id="<>" +``` + +If successful, this command will provide 3 terraform operations that can be +performed to deploy the deployment directory. They should look similar to the +following: + +```shell +Terraform group was successfully created in directory peering-networks-demo/primary +To deploy, run the following commands: + terraform -chdir=static-slurm-cluster/primary init + terraform -chdir=static-slurm-cluster/primary validate + terraform -chdir=static-slurm-cluster/primary apply +``` + +Execute the terraform commands to deploy the static Slurm cluster in project A. + +### Use the Cloud HPC Toolkit to Create the Hybrid Deployment Directory +The blueprint for creating a deploying the hybrid configuration can be found in +the blueprints directory as [hybrid-configuration.yaml]. This blueprint defines +a deployment that does the following: + +* Create a pointer to the network in project B created in + [Create VPC Networks](#create-vpc-networks). +* Create a filestore for a cloud scratch network filesystem. +* Create a single partition named "cloud" with a dynamic maximum size of 10 + nodes of machine type n2-standard-2. +* Creates a hybrid configuration using the + [`schedmd-slurm-gcp-v5-hybrid`][hybridmodule] module. This module will do the + following: + * Create a directory at `output_dir` locally containing the hybrid + configuration files and execution scripts. + * Set metadata in project B that inform the burst compute nodes how to + configure themselves. + * Create pubsub actions triggered by changes to the hybrid configuration. + +Either in the blueprint directly or on the command line, update the following +deployment variables in the [hybrid-configuration.yaml] blueprint: + +* **_project\_id:_** The ID of project B. +* **_static\_controller\_hostname:_** The fully qualified internal hostname of + the static cluster's controller in project A. The format is + `<>.c.<>.internal`. + +If the deployment vars have been added directly to the blueprint, the following +command will create the deployment directory: + +```shell +ghpc create blueprints/hybrid-configuration.yaml +``` + +To create the deployment directory with deployment variables passed through the +command line, run the following command with the updated values of +`<>`, `<>` and `<>` instead: + +```shell +ghpc create blueprints/hybrid-configuration.yaml --vars project_id="<>",static_controller_hostname="<>.c.<>.internal" +``` + +If successful, this command will provide 3 terraform operations that can be +performed to deploy the deployment directory. 
They should look similar to the +following: + +```shell +Terraform group was successfully created in directory peering-networks-demo/primary +To deploy, run the following commands: + terraform -chdir=hybrid-config/primary init + terraform -chdir=hybrid-config/primary validate + terraform -chdir=hybrid-config/primary apply +``` + +Execute the terraform commands to create the hybrid configuration. A directory +in `hybrid-configuration/primary` named `hyrid/` should be created which +contains a `cloud.conf` file, `cloud_gres.conf` file and a set of support +scripts. + +[hybridmodule]: ../../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md + +### Install and Configure Hybrid on the Controller Instance + +> **_NOTE:_** Many of the manual steps in this section have been adapted from the +> hybrid documentation in [Slurm on GCP][slurm-gcp]. The source document can be +> found at [docs/hybrid.md][slurm-gcp-hybrid] + +Now that the hybrid configuration directory has been created, it needs to be +installed on the controller VM instance. First, tar the directory: + +```shell +cd hybrid-config/primary +tar czvf hybrid.tar.gz hybrid +``` + +Copy the `hybrid.tar.gz` file to the controller VM instance. This can be done +in whichever way is easiest for you, `gcloud compute scp` is used here. + +```shell +gcloud compute scp --project="<>" --zone=us-central1-c ./hybrid.tar.gz "<>:~" +``` + +Now SSH to the controller VM either using the console or the following gcloud +command: + +```shell +gcloud compute ssh --project="<>" --zone=us-central1-c "<>" +``` + +Decompress the `hybrid.tar.gz` file: + +```shell +sudo tar xzvf hybrid.tar.gz --directory /etc/slurm +rm hybrid.tar.gz +``` + +Set the correct permissions for the hybrid directory and the files contained in +it: + +```shell +sudo chown -R slurm: /etc/slurm/hybrid +sudo chmod 644 /etc/slurm/hybrid/cloud.conf +sudo chmod 755 /etc/slurm/hybrid +``` + +Because the static cluster was also created by [Slurm on GCP][slurm-gcp] +terraform modules, the partition information must be copied from the file +`/etc/slurm/cloud.conf` to the slurm config file at `/etc/slurm/slurm.conf`. The +lines that need to be copied will look similar to the following block: + +```text +NodeName=DEFAULT State=UNKNOWN RealMemory=7552 Boards=1 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 CPUs=1 +NodeName=staticslur-static-ghpc-[0-3] State=CLOUD +NodeSet=staticslur-static-ghpc Nodes=staticslur-static-ghpc-[0-3] +PartitionName=static Nodes=staticslur-static-ghpc State=UP DefMemPerCPU=7552 SuspendTime=300 Oversubscribe=Exclusive Default=YES + +SuspendExcNodes=staticslur-static-ghpc-[0-3] +``` + +Depending on the configuration of the static partitions, the `SuspendExcNodes` +may not be included. + +These lines can be copied to the bottom of the `slurm.conf` file. + +Make the following changes to the `/etc/slurm/slurm.conf` file: + +* replace `include cloud.conf` with `include hybrid/cloud.conf` +* Add the fully qualified hostname in parentheses after the controller hostname + in the parameter `SlurmctldHost`. + +```text +# slurm.conf +... +SlurmctldHost=<>(<>.c.<>.internal) +... +include hybrid/cloud.conf +... +``` + +Make the following changes to the `/etc/slurm/hybrid/cloud.conf` file: + +* `SlurmctldParameters` + * Remove `cloud_dns` + * Add `cloud_reg_addrs` +* Add `TreeWidth=65533` + +```text +# cloud.conf +... +SlurmctldParameters=idle_on_node_suspend,cloud_reg_addrs +... +TreeWidth=65533 +... 
+``` + +These changes will inform the controller to use the IP of compute nodes to +communicate rather than the hostnames. + +Next, create a new cronjob as the slurm user that will periodically call the +`/etc/slurm/hybrid/slurmsync.py` file. + +```shell +sudo su slurm +crontab -e +``` + +Since the controller was deployed using [Slurm on GCP][slurm-gcp], there will +already be a cronjob pointing to the `slurmsync.py` script in `/etc/slurm/`, +simply update it to the following: + +```text +*/1 * * * * /etc/slurm/hybrid/slurmsync.py +``` + +Exit the editor and the slurm user when complete. + +Finally, restart the slurmctld service to enable the changes made: + +```shell +sudo systemctl restart slurmctld +``` + +If the restart did not succeed, the logs at `/var/log/slurm/slurmctld.log` +should point you in the right direction. + +[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/v5.1.0 +[slurm-gcp-hybrid]: https://github.com/SchedMD/slurm-gcp/blob/v5.1.0/docs/hybrid.md + +### Validate the Hybrid Cluster + +Now that the hybrid configuration has been installed, you can test your new +cloud partition. First off, run `sinfo` to see your partitions listed side by +side: + +```shell +$ sinfo +PARTITION AVAIL TIMELIMIT NODES STATE NODELIST +static* up infinite 4 idle staticslur-static-ghpc-[0-3] +cloud up infinite 10 idle~ hybridconf-cloud-ghpc-[0-9] +``` + +To verify that your local partitions are still active, run a simple test with +`srun`: + +```shell +$ srun -N 1 hostname +staticslur-static-ghpc-0 +``` + +Now verify the cloud partition is running with a similar test. Note that since a +node is being created, the same command will take much longer the first time. +Subsequent uses of the cloud nodes before being suspended will be near +instantaneous after the initial startup cost. + +```shell +$ srun -N 1 -p cloud hostname +hybridconf-cloud-ghpc-0 +``` From b5478d49da2cf5c86632719f453ad4fc26aac15e Mon Sep 17 00:00:00 2001 From: Alex Heye Date: Fri, 16 Sep 2022 00:11:15 +0000 Subject: [PATCH 28/51] Add requirements file for pip dependencies --- docs/hybrid-slurm-cluster/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 docs/hybrid-slurm-cluster/requirements.txt diff --git a/docs/hybrid-slurm-cluster/requirements.txt b/docs/hybrid-slurm-cluster/requirements.txt new file mode 100644 index 0000000000..ae1919a138 --- /dev/null +++ b/docs/hybrid-slurm-cluster/requirements.txt @@ -0,0 +1,2 @@ +addict~=2.0 +google-cloud-pubsub~=2.0 \ No newline at end of file From 8676bcf03272acf8bb59da74b1d6d497b0721f8f Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 16 Sep 2022 14:45:28 -0500 Subject: [PATCH 29/51] Address an idempotency in Spack install script Spack installation script adds environment setup command each time the machine boots. This change resolves that by using overwrite stdout redirection rather than append redirection and only creating the appropriate /etc/profile.d script if it does not already exist. 
--- .../scripts/spack-install/templates/install_spack.tpl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/community/modules/scripts/spack-install/templates/install_spack.tpl b/community/modules/scripts/spack-install/templates/install_spack.tpl index 620ca3e7ff..991f318d12 100755 --- a/community/modules/scripts/spack-install/templates/install_spack.tpl +++ b/community/modules/scripts/spack-install/templates/install_spack.tpl @@ -152,7 +152,9 @@ echo "$PREFIX Populating defined buildcaches" %{endif ~} %{endfor ~} -echo "source ${INSTALL_DIR}/share/spack/setup-env.sh" >> /etc/profile.d/spack.sh -chmod a+rx /etc/profile.d/spack.sh +if [ ! -f /etc/profile.d/spack.sh ]; then + echo "source ${INSTALL_DIR}/share/spack/setup-env.sh" > /etc/profile.d/spack.sh + chmod a+rx /etc/profile.d/spack.sh +fi echo "$PREFIX Setup complete..." From f23dbf11556c8d6bc3010dd978cb74b25469f190 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 16 Sep 2022 17:21:54 -0500 Subject: [PATCH 30/51] Eliminate 1 git checkout during Spack install Empirically, git checkout can take several minutes when performed over NFS. The automatic checkout performed during git clone is undesirable because we are going to immediately switch to another git reference. This commit eliminates that checkout. --- .../modules/scripts/spack-install/templates/install_spack.tpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/scripts/spack-install/templates/install_spack.tpl b/community/modules/scripts/spack-install/templates/install_spack.tpl index 991f318d12..b8ddb7975b 100755 --- a/community/modules/scripts/spack-install/templates/install_spack.tpl +++ b/community/modules/scripts/spack-install/templates/install_spack.tpl @@ -24,7 +24,7 @@ if [ ! -d ${INSTALL_DIR} ]; then chmod a+rwx ${INSTALL_DIR}; chmod a+s ${INSTALL_DIR}; cd ${INSTALL_DIR}; - git clone ${SPACK_URL} . + git clone --no-checkout ${SPACK_URL} . } &>> ${LOG_FILE} echo "$PREFIX Checking out ${SPACK_REF}..." git checkout ${SPACK_REF} >> ${LOG_FILE} 2>&1 From 75bd872f4539f269e6c0fa43ecefb4d11d1ac4df Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 16 Sep 2022 14:43:30 -0500 Subject: [PATCH 31/51] Replace Spack installation in AMD example with a builder VM --- community/examples/AMD/README.md | 9 ++---- .../examples/AMD/hpc-cluster-amd-slurmv5.yaml | 28 ++++++++++++++++++- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/community/examples/AMD/README.md b/community/examples/AMD/README.md index 1e600c6d3c..05bd740a3f 100644 --- a/community/examples/AMD/README.md +++ b/community/examples/AMD/README.md @@ -75,7 +75,7 @@ remounted and that you should logout and login. Follow its instructions. Once configuration is complete, install AOCC by running: ```shell -sudo -i bash /var/tmp/install_aocc.sh +sudo bash /var/tmp/install_aocc.sh ``` Spack will prompt you to accept the AOCC End User License Agreement by opening a @@ -83,12 +83,7 @@ text file containing information about the license. Leave the file unmodified and write it to disk by typing `:q` as two characters in sequence ([VI help][vihelp]). -Installation of AOCC and OpenMPI will take approximately 15 minutes. Once they -are installed, you can install additional packages such as `amdblis`: - -```shell -sudo -i spack -d install -v amdblis %aocc@3.2.0 -``` +Installation of AOCC and OpenMPI will take approximately 15 minutes. 
Configure SSH user keys for access between cluster nodes: diff --git a/community/examples/AMD/hpc-cluster-amd-slurmv5.yaml b/community/examples/AMD/hpc-cluster-amd-slurmv5.yaml index e6e06ed895..0c8164151c 100644 --- a/community/examples/AMD/hpc-cluster-amd-slurmv5.yaml +++ b/community/examples/AMD/hpc-cluster-amd-slurmv5.yaml @@ -65,8 +65,25 @@ deployment_groups: - type: shell source: modules/startup-script/examples/install_ansible.sh destination: install_ansible.sh + - $(swfs.install_nfs_client_runner) + - $(swfs.mount_runner) - $(spack.install_spack_deps_runner) - $(spack.install_spack_runner) + - type: shell + content: "shutdown -h +1" + destination: shutdown.sh + + - id: slurm_startup + source: modules/scripts/startup-script + settings: + runners: + - type: data + destination: /etc/profile.d/spack.sh + content: | + #!/bin/sh + if [ -f /sw/spack/share/spack/setup-env.sh ]; then + . /sw/spack/share/spack/setup-env.sh + fi # the following installation of AOCC may be automated in the future # with a clear direction to the user to read the EULA at # https://developer.amd.com/aocc-compiler-eula/ @@ -74,11 +91,20 @@ deployment_groups: destination: /var/tmp/install_aocc.sh content: | #!/bin/bash + source /sw/spack/share/spack/setup-env.sh spack install aocc@3.2.0 +license-agreed spack load aocc@3.2.0 spack compiler find --scope site spack -d install -v openmpi@4.1.3 %aocc@3.2.0 +legacylaunchers +pmi schedulers=slurm + # must restart vm to re-initiate subsequent installs + - id: spack_builder + source: modules/compute/vm-instance + use: [network1, swfs, spack-startup] + settings: + name_prefix: spack-builder + machine_type: c2d-standard-16 + - id: low_cost_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition use: @@ -118,6 +144,6 @@ deployment_groups: use: - network1 - slurm_controller - - spack-startup + - slurm_startup settings: machine_type: c2d-standard-4 From dc01b1dc49e0cc2d84623a44b849d58a66a4b8fe Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Fri, 16 Sep 2022 13:30:26 -0700 Subject: [PATCH 32/51] Enable ddn lustre client install with pre-existing-network-storage --- .../pre-existing-network-storage/README.md | 2 + .../pre-existing-network-storage/outputs.tf | 52 +++++++++++++++++++ .../ddn_exascaler_luster_client_install.tftpl | 44 ++++++++++++++++ 3 files changed, 98 insertions(+) create mode 100644 modules/file-system/pre-existing-network-storage/templates/ddn_exascaler_luster_client_install.tftpl diff --git a/modules/file-system/pre-existing-network-storage/README.md b/modules/file-system/pre-existing-network-storage/README.md index 3cf6faaab7..c68ba5590d 100644 --- a/modules/file-system/pre-existing-network-storage/README.md +++ b/modules/file-system/pre-existing-network-storage/README.md @@ -59,5 +59,7 @@ No resources. | Name | Description | |------|-------------| +| [client\_install\_runner](#output\_client\_install\_runner) | Runner that performs client installation needed to use file system. | +| [mount\_runner](#output\_mount\_runner) | Runner that mounts the file system. | | [network\_storage](#output\_network\_storage) | Describes a remote network storage to be mounted by fs-tab. 
| diff --git a/modules/file-system/pre-existing-network-storage/outputs.tf b/modules/file-system/pre-existing-network-storage/outputs.tf index 32b00ba317..c0aca68864 100644 --- a/modules/file-system/pre-existing-network-storage/outputs.tf +++ b/modules/file-system/pre-existing-network-storage/outputs.tf @@ -24,3 +24,55 @@ output "network_storage" { mount_options = var.mount_options } } + +locals { + # Client Install + ddn_lustre_client_install_script = templatefile( + "${path.module}/templates/ddn_exascaler_luster_client_install.tftpl", + { + server_ip = split("@", var.server_ip)[0] + remote_mount = var.remote_mount + local_mount = var.local_mount + } + ) + + install_scripts = { + "lustre" = local.ddn_lustre_client_install_script + } + + # Mounting + ddn_lustre_mount_cmd = "mount -t ${var.fs_type} ${var.server_ip}:/${var.remote_mount} ${var.local_mount}" + mount_commands = { + "lustre" = local.ddn_lustre_mount_cmd + } + + mount_script = <<-EOT + #!/bin/bash + findmnt --source ${var.server_ip}:/${var.remote_mount} --target ${var.local_mount} &> /dev/null + if [[ $? != 0 ]]; then + echo "Mounting --source ${var.server_ip}:/${var.remote_mount} --target ${var.local_mount}" + mkdir -p ${var.local_mount} + ${lookup(local.mount_commands, var.fs_type, "exit 1")} + else + echo "Skipping mounting source: ${var.server_ip}:/${var.remote_mount}, already mounted to target:${var.local_mount}" + fi + EOT +} + +output "client_install_runner" { + description = "Runner that performs client installation needed to use file system." + value = lookup(local.install_scripts, var.fs_type, null) == null ? null : { + "type" = "shell" + "content" = lookup(local.install_scripts, var.fs_type, "") + "destination" = "install_filesystem_client${replace(var.local_mount, "/", "_")}.sh" + } +} + +output "mount_runner" { + description = "Runner that mounts the file system." + value = lookup(local.mount_commands, var.fs_type, null) == null ? null : { + "type" = "shell" + "content" = local.mount_script + "destination" = "mount_filesystem${replace(var.local_mount, "/", "_")}.sh" + } +} diff --git a/modules/file-system/pre-existing-network-storage/templates/ddn_exascaler_luster_client_install.tftpl b/modules/file-system/pre-existing-network-storage/templates/ddn_exascaler_luster_client_install.tftpl new file mode 100644 index 0000000000..ad25590d58 --- /dev/null +++ b/modules/file-system/pre-existing-network-storage/templates/ddn_exascaler_luster_client_install.tftpl @@ -0,0 +1,44 @@ +#!/bin/sh + +# Copyright 2022 DataDirect Networks +# Modifications Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Prior Art: https://github.com/DDNStorage/exascaler-cloud-terraform/blob/78deadbb2c1fa7e4603cf9605b0f7d1782117954/gcp/templates/client-script.tftpl + +# install new EXAScaler Cloud clients: +# all instances must be in the same zone +# and connected to the same network and subnet +# to set up EXAScaler Cloud filesystem on a new client instance, +# run the folowing commands on the client with root privileges: + +if [[ ! -z $(cat /proc/filesystems | grep lustre) ]]; then + echo "Skipping lustre client install as it is already supported" + exit 0 +fi + +cat >/etc/esc-client.conf< Date: Mon, 19 Sep 2022 09:47:08 -0500 Subject: [PATCH 33/51] Address dependency checker timeout failure A weekly run of the dependency checker test failed on the last stage of the test because it hit the 1800s timeout. There were no other failures. This commit increases the timeout to ensure failures are meaningful. --- tools/cloud-build/dependency-checks/hpc-toolkit-go-builder.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud-build/dependency-checks/hpc-toolkit-go-builder.yaml b/tools/cloud-build/dependency-checks/hpc-toolkit-go-builder.yaml index e92512fdc5..b2b5f1bef5 100644 --- a/tools/cloud-build/dependency-checks/hpc-toolkit-go-builder.yaml +++ b/tools/cloud-build/dependency-checks/hpc-toolkit-go-builder.yaml @@ -14,7 +14,7 @@ --- -timeout: 1800s +timeout: 3600s steps: - name: golang:bullseye entrypoint: /bin/bash From 95680b1e6ff5f8d5dabdfa966d06d81e7b5c481e Mon Sep 17 00:00:00 2001 From: Alex Heye Date: Mon, 19 Sep 2022 20:59:49 +0000 Subject: [PATCH 34/51] Add directory README to hybrid docs, update networks blueprint Addresses reviewer feedback (first pass) * Adds a new README in the hybrid-slurm-cluster directory that serves as a table of contents essentially * Updates create-networks.yaml to use the `subnetworks` field rather than setting the size and range based on the default primary subnet. * Add full path from the root of the repo for commands in the inter-gcp-project-hybrid-slurm document. * Various minor fixes --- docs/hybrid-slurm-cluster/README.md | 14 ++++++++++++ .../blueprints/create-networks.yaml | 12 ++++++---- .../inter-gcp-project-hybrid-slurm.md | 22 +++++++++++-------- 3 files changed, 35 insertions(+), 13 deletions(-) create mode 100644 docs/hybrid-slurm-cluster/README.md diff --git a/docs/hybrid-slurm-cluster/README.md b/docs/hybrid-slurm-cluster/README.md new file mode 100644 index 0000000000..a1c8067147 --- /dev/null +++ b/docs/hybrid-slurm-cluster/README.md @@ -0,0 +1,14 @@ +# Hybrid Slurm Clusters + +## [inter-gcp-project-hybrid-slurm.md](./inter-gcp-project-hybrid-slurm.md) +This document describes how to deploy a simulated hybrid slurm cluster entirely +in GCP. These instructions can be used as a way of trying the +[schedmd-slurm-gcp-v5-hybrid](../../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md) +in GCP before bringing the configuration changes to a physical on-premise slurm +cluster. + +## Blueprints +The [blueprints directory](./blueprints/) contains a set of support blueprints +for the documentation in this directory. These blueprints are intended to be +used as is with minimal tweaking of deployment variables either in place or on +the command line. 
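+
+For example, rather than editing project IDs into a blueprint, they can
+usually be supplied at create time. A sketch, mirroring the commands in
+[inter-gcp-project-hybrid-slurm.md](./inter-gcp-project-hybrid-slurm.md) and
+run from the root of the repo (replace the bracketed IDs with your own):
+
+```shell
+ghpc create docs/hybrid-slurm-cluster/blueprints/create-networks.yaml --vars project_id="<project-a-id>",project_id_compute="<project-b-id>"
+```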
diff --git a/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml b/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml index db59df5cfd..b8f77a213f 100644 --- a/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml +++ b/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml @@ -34,16 +34,20 @@ deployment_groups: id: network0 settings: network_name: static-cluster-network - subnetwork_name: primary-subnet network_address_range: 10.0.0.0/16 - subnetwork_size: 0 + subnetworks: + - subnet_name: primary-subnet + subnet_region: $(vars.region) + new_bits: 8 - source: modules/network/vpc kind: terraform id: network1 settings: network_name: compute-vpc-network - subnetwork_name: primary-subnet project_id: $(vars.project_id_compute) network_address_range: 10.1.0.0/16 - subnetwork_size: 0 + subnetworks: + - subnet_name: primary-subnet + subnet_region: $(vars.region) + new_bits: 8 diff --git a/docs/hybrid-slurm-cluster/inter-gcp-project-hybrid-slurm.md b/docs/hybrid-slurm-cluster/inter-gcp-project-hybrid-slurm.md index 858404b61b..5033204238 100644 --- a/docs/hybrid-slurm-cluster/inter-gcp-project-hybrid-slurm.md +++ b/docs/hybrid-slurm-cluster/inter-gcp-project-hybrid-slurm.md @@ -31,7 +31,7 @@ for the purposes of this all-GCP demo. The static cluster will be deployed with slurm-gcp and optionally have a set of statically created VMs populating it's local partition. -**hybrid deployment:** A deployment using the [schedmd-slurm-gcp-v5-hybrid] +**hybrid deployment:** A deployment using the [schedmd-slurm-gcp-v5-hybrid][hybridmodule] module. The deployment itself includes the hybrid configuration directory as well as metadata in the cloud bursting project. @@ -93,8 +93,13 @@ debian/ubuntu based images. Instructions for viewing these logs can be found in ### Connectivity Issues To verify the network and DNS peering setup was successful, you can create a VM in each project attached to the networks created in these instructions. You can -run ping `.c..internal` to verify the settings are -correct. This should succeed in both directions. +run ping to verify the settings are correct: + +```shell +.c..internal +``` + +This should succeed in both directions. If the ping test doesn’t work, the DNS may not be configured correctly, or the networks may not be able to peer correctly. If it’s the former, you should be @@ -104,7 +109,6 @@ or network peering setting are likely not correct. 
## Instructions ### Before you begin -* Build ghpc #### Select or Create 2 GCP Projects @@ -146,7 +150,7 @@ command to install the pip packages outlined in [requirements.txt](./requirements.txt): ```shell -pip install -r requirements.txt +pip install -r docs/hybrid-slurm-cluster/requirements.txt ``` #### Build ghpc @@ -169,7 +173,7 @@ blueprint will do the following: Create a deployment directory for the networks using `ghpc`: ```shell -ghpc create blueprints/create-networks.yaml --vars project_id="<>",project_id_compute="<>" +ghpc create docs/hybrid-slurm-cluster/blueprints/create-networks.yaml --vars project_id="<>",project_id_compute="<>" ``` If successful, this command will provide 3 terraform operations that can be @@ -299,7 +303,7 @@ First, use the HPC Toolkit to create the deployment directory, replacing "<>" with the ID of your project A: ```shell -ghpc create blueprints/static-cluster.yaml --vars project_id="<>" +ghpc create docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml --vars project_id="<>" ``` If successful, this command will provide 3 terraform operations that can be @@ -347,7 +351,7 @@ If the deployment vars have been added directly to the blueprint, the following command will create the deployment directory: ```shell -ghpc create blueprints/hybrid-configuration.yaml +ghpc create docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml ``` To create the deployment directory with deployment variables passed through the @@ -355,7 +359,7 @@ command line, run the following command with the updated values of `<>`, `<>` and `<>` instead: ```shell -ghpc create blueprints/hybrid-configuration.yaml --vars project_id="<>",static_controller_hostname="<>.c.<>.internal" +ghpc create docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml --vars project_id="<>",static_controller_hostname="<>.c.<>.internal" ``` If successful, this command will provide 3 terraform operations that can be From c7b2bfa22d5a318205c77b218289affea74884a8 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Mon, 19 Sep 2022 14:07:05 -0700 Subject: [PATCH 35/51] Add install lustre from pre-existing-network-storage to integration tests --- .../blueprints/lustre-with-new-vpc.yaml | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml index 322b4d659b..c2b4a710b0 100644 --- a/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml +++ b/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml @@ -60,7 +60,7 @@ deployment_groups: - homefs - mount-exascaler settings: - name_prefix: test-workstation + name_prefix: test-workstation1 machine_type: c2-standard-4 - id: wait0 @@ -68,6 +68,36 @@ deployment_groups: settings: instance_name: ((module.workstation.name[0])) + # test installing luster from pre-existing-network-storage + - id: pre-fs + source: modules/file-system/pre-existing-network-storage + settings: + server_ip: ((module.scratchfs.network_storage.server_ip)) + remote_mount: ((module.scratchfs.network_storage.remote_mount)) + local_mount: ((module.scratchfs.network_storage.local_mount)) + fs_type: ((module.scratchfs.network_storage.fs_type)) + + - id: mount-exascaler-from-pre-existing + source: modules/scripts/startup-script + settings: + runners: + - $(pre-fs.client_install_runner) + - $(pre-fs.mount_runner) + + - id: install-luster-from-pre-existing + source: modules/compute/vm-instance + use: + - network1 + - 
mount-exascaler-from-pre-existing + settings: + name_prefix: test-workstation2 + machine_type: n2-standard-4 + + - id: wait1 + source: ./community/modules/scripts/wait-for-startup + settings: + instance_name: ((module.install-luster-from-pre-existing.name[0])) + - id: compute_partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition use: From 99bcc8f5fdaaf6716b3557f771c7dbb415897b51 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Mon, 19 Sep 2022 15:45:20 -0700 Subject: [PATCH 36/51] Add retry loop for installing DDN client setup tool --- .../templates/ddn_exascaler_luster_client_install.tftpl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modules/file-system/pre-existing-network-storage/templates/ddn_exascaler_luster_client_install.tftpl b/modules/file-system/pre-existing-network-storage/templates/ddn_exascaler_luster_client_install.tftpl index ad25590d58..649abc4c4a 100644 --- a/modules/file-system/pre-existing-network-storage/templates/ddn_exascaler_luster_client_install.tftpl +++ b/modules/file-system/pre-existing-network-storage/templates/ddn_exascaler_luster_client_install.tftpl @@ -39,6 +39,10 @@ cat >/etc/esc-client.conf< Date: Mon, 19 Sep 2022 15:49:34 -0700 Subject: [PATCH 37/51] Update pre-existing-network-storage output runners to have deterministic destination --- .../file-system/pre-existing-network-storage/outputs.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/file-system/pre-existing-network-storage/outputs.tf b/modules/file-system/pre-existing-network-storage/outputs.tf index c0aca68864..c38ce6b892 100644 --- a/modules/file-system/pre-existing-network-storage/outputs.tf +++ b/modules/file-system/pre-existing-network-storage/outputs.tf @@ -61,18 +61,18 @@ locals { output "client_install_runner" { description = "Runner that performs client installation needed to use file system." - value = lookup(local.install_scripts, var.fs_type, null) == null ? null : { + value = { "type" = "shell" - "content" = lookup(local.install_scripts, var.fs_type, "") + "content" = lookup(local.install_scripts, var.fs_type, "echo 'skipping: client_install_runner not yet supported for ${var.fs_type}'") "destination" = "install_filesystem_client${replace(var.local_mount, "/", "_")}.sh" } } output "mount_runner" { description = "Runner that mounts the file system." - value = lookup(local.mount_commands, var.fs_type, null) == null ? null : { + value = { "type" = "shell" - "content" = local.mount_script + "content" = (lookup(local.mount_commands, var.fs_type, null) == null ? 
"echo 'skipping: mount_runner not yet supported for ${var.fs_type}'" : local.mount_script) "destination" = "mount_filesystem${replace(var.local_mount, "/", "_")}.sh" } } From 905a2c1dabf28bb427978054143296a1c8aa4a73 Mon Sep 17 00:00:00 2001 From: Alex Heye Date: Thu, 15 Sep 2022 21:24:51 +0000 Subject: [PATCH 38/51] Allow git:: as a valid source prefix --- pkg/sourcereader/sourcereader.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/sourcereader/sourcereader.go b/pkg/sourcereader/sourcereader.go index f5b796c207..e25c2eedac 100644 --- a/pkg/sourcereader/sourcereader.go +++ b/pkg/sourcereader/sourcereader.go @@ -58,7 +58,9 @@ func IsEmbeddedPath(source string) bool { // IsGitHubPath checks if a source path points to GitHub func IsGitHubPath(source string) bool { - return strings.HasPrefix(source, "github.com") || strings.HasPrefix(source, "git@github.com") + return strings.HasPrefix(source, "github.com") || + strings.HasPrefix(source, "git@github.com") || + strings.HasPrefix(source, "git::") } // Factory returns a SourceReader of module path From 443149e9680b58fecf28494e141651aff30a94d2 Mon Sep 17 00:00:00 2001 From: Alex Heye Date: Fri, 16 Sep 2022 21:07:57 +0000 Subject: [PATCH 39/51] Add documentation and tests for git:: source --- modules/README.md | 8 ++++++++ pkg/sourcereader/sourcereader_test.go | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/modules/README.md b/modules/README.md index 2cd9f0263b..ace4baa580 100644 --- a/modules/README.md +++ b/modules/README.md @@ -259,6 +259,14 @@ Toolkit vpc module, use: [tfsubdir]: https://www.terraform.io/language/modules/sources#modules-in-package-sub-directories [daos-cluster.yaml]: ../community/examples/intel/daos-cluster.yaml +#### Generic Git Modules +To use a Terraform module available in a non-GitHub git repository such as +gitlab, set the source to a path starting `git::`. Two Standard git protocols +are supported, `git::https://` for HTTPS or `git::git@github.com` for SSH. + +Additional formatting and features after `git::` are identical to that of the +[GitHub Modules](#github-modules) described above. + ### Kind (May be Required) `kind` refers to the way in which a module is deployed. 
Currently, `kind` can be diff --git a/pkg/sourcereader/sourcereader_test.go b/pkg/sourcereader/sourcereader_test.go index d3d2603cfd..9bf8bd01bd 100644 --- a/pkg/sourcereader/sourcereader_test.go +++ b/pkg/sourcereader/sourcereader_test.go @@ -127,6 +127,10 @@ func (s *MySuite) TestIsGitHubRepository(c *C) { // True, other ret = IsGitHubPath("github.com/modules") c.Assert(ret, Equals, true) + + // True, genetic git repository + ret = IsGitHubPath("git::https://gitlab.com/modules") + c.Assert(ret, Equals, true) } func (s *MySuite) TestFactory(c *C) { @@ -141,6 +145,10 @@ func (s *MySuite) TestFactory(c *C) { // GitHub modules ghSrcString := Factory("github.com/modules") c.Assert(reflect.TypeOf(ghSrcString), Equals, reflect.TypeOf(GitHubSourceReader{})) + + // Git modules + gitSrcString := Factory("git::https://gitlab.com/modules") + c.Assert(reflect.TypeOf(gitSrcString), Equals, reflect.TypeOf(GitHubSourceReader{})) } func (s *MySuite) TestCopyFromPath(c *C) { From a8020857d1bd6436fdbd7af93f979b36e50c7000 Mon Sep 17 00:00:00 2001 From: Alex Heye Date: Mon, 19 Sep 2022 21:58:04 +0000 Subject: [PATCH 40/51] Change github source to general git --- pkg/modulewriter/modulewriter.go | 2 +- pkg/modulewriter/tfwriter.go | 2 +- pkg/sourcereader/{github.go => git.go} | 26 +++++++++---------- .../{github_test.go => git_test.go} | 24 ++++++++--------- pkg/sourcereader/sourcereader.go | 8 +++--- pkg/sourcereader/sourcereader_test.go | 18 ++++++------- 6 files changed, 40 insertions(+), 40 deletions(-) rename pkg/sourcereader/{github.go => git.go} (73%) rename pkg/sourcereader/{github_test.go => git_test.go} (75%) diff --git a/pkg/modulewriter/modulewriter.go b/pkg/modulewriter/modulewriter.go index 12fe382017..f8019c3505 100644 --- a/pkg/modulewriter/modulewriter.go +++ b/pkg/modulewriter/modulewriter.go @@ -115,7 +115,7 @@ func WriteDeployment(blueprint *config.Blueprint, outputDir string, overwriteFla func copySource(deploymentPath string, deploymentGroups *[]config.DeploymentGroup) { for iGrp, grp := range *deploymentGroups { for iMod, module := range grp.Modules { - if sourcereader.IsGitHubPath(module.Source) { + if sourcereader.IsGitPath(module.Source) { continue } diff --git a/pkg/modulewriter/tfwriter.go b/pkg/modulewriter/tfwriter.go index 29fc5d5032..700fb89d32 100644 --- a/pkg/modulewriter/tfwriter.go +++ b/pkg/modulewriter/tfwriter.go @@ -236,7 +236,7 @@ func writeMain( // Add source attribute var moduleSource cty.Value - if sourcereader.IsGitHubPath(mod.Source) { + if sourcereader.IsGitPath(mod.Source) { moduleSource = cty.StringVal(mod.Source) } else { moduleSource = cty.StringVal(fmt.Sprintf("./modules/%s", mod.ModuleName)) diff --git a/pkg/sourcereader/github.go b/pkg/sourcereader/git.go similarity index 73% rename from pkg/sourcereader/github.go rename to pkg/sourcereader/git.go index cc5588c013..7da369ca5a 100644 --- a/pkg/sourcereader/github.go +++ b/pkg/sourcereader/git.go @@ -37,10 +37,10 @@ var goGetterGetters = map[string]getter.Getter{ var goGetterDecompressors = map[string]getter.Decompressor{} -// GitHubSourceReader reads modules from a GitHub repository -type GitHubSourceReader struct{} +// GitSourceReader reads modules from a git repository +type GitSourceReader struct{} -func copyGitHubModules(srcPath string, destPath string) error { +func copyGitModules(srcPath string, destPath string) error { client := getter.Client{ Src: srcPath, Dst: destPath, @@ -57,9 +57,9 @@ func copyGitHubModules(srcPath string, destPath string) error { return err } -// GetModuleInfo gets 
modulereader.ModuleInfo for the given kind from the GitHub source -func (r GitHubSourceReader) GetModuleInfo(modPath string, kind string) (modulereader.ModuleInfo, error) { - if !IsGitHubPath(modPath) { +// GetModuleInfo gets modulereader.ModuleInfo for the given kind from the git source +func (r GitSourceReader) GetModuleInfo(modPath string, kind string) (modulereader.ModuleInfo, error) { + if !IsGitPath(modPath) { return modulereader.ModuleInfo{}, fmt.Errorf("Source is not valid: %s", modPath) } @@ -70,8 +70,8 @@ func (r GitHubSourceReader) GetModuleInfo(modPath string, kind string) (modulere return modulereader.ModuleInfo{}, err } - if err := copyGitHubModules(modPath, writeDir); err != nil { - return modulereader.ModuleInfo{}, fmt.Errorf("failed to clone GitHub module at %s to tmp dir %s: %v", + if err := copyGitModules(modPath, writeDir); err != nil { + return modulereader.ModuleInfo{}, fmt.Errorf("failed to clone git module at %s to tmp dir %s: %v", modPath, writeDir, err) } @@ -79,9 +79,9 @@ func (r GitHubSourceReader) GetModuleInfo(modPath string, kind string) (modulere return reader.GetInfo(writeDir) } -// GetModule copies the GitHub source to a provided destination (the deployment directory) -func (r GitHubSourceReader) GetModule(modPath string, copyPath string) error { - if !IsGitHubPath(modPath) { +// GetModule copies the git source to a provided destination (the deployment directory) +func (r GitSourceReader) GetModule(modPath string, copyPath string) error { + if !IsGitPath(modPath) { return fmt.Errorf("Source is not valid: %s", modPath) } @@ -92,8 +92,8 @@ func (r GitHubSourceReader) GetModule(modPath string, copyPath string) error { return err } - if err := copyGitHubModules(modPath, writeDir); err != nil { - return fmt.Errorf("failed to clone GitHub module at %s to tmp dir %s: %v", + if err := copyGitModules(modPath, writeDir); err != nil { + return fmt.Errorf("failed to clone git module at %s to tmp dir %s: %v", modPath, writeDir, err) } diff --git a/pkg/sourcereader/github_test.go b/pkg/sourcereader/git_test.go similarity index 75% rename from pkg/sourcereader/github_test.go rename to pkg/sourcereader/git_test.go index 7e99773c20..97446268ea 100644 --- a/pkg/sourcereader/github_test.go +++ b/pkg/sourcereader/git_test.go @@ -22,16 +22,16 @@ import ( . 
"gopkg.in/check.v1" ) -func (s *MySuite) TestCopyGitHubModules(c *C) { +func (s *MySuite) TestCopyGitModules(c *C) { // Setup - destDir := filepath.Join(testDir, "TestCopyGitHubRepository") + destDir := filepath.Join(testDir, "TestCopyGitRepository") if err := os.Mkdir(destDir, 0755); err != nil { log.Fatal(err) } // Success via HTTPS destDirForHTTPS := filepath.Join(destDir, "https") - err := copyGitHubModules("github.com/terraform-google-modules/terraform-google-project-factory//helpers", destDirForHTTPS) + err := copyGitModules("github.com/terraform-google-modules/terraform-google-project-factory//helpers", destDirForHTTPS) c.Assert(err, IsNil) fInfo, err := os.Stat(filepath.Join(destDirForHTTPS, "terraform_validate")) c.Assert(err, IsNil) @@ -41,7 +41,7 @@ func (s *MySuite) TestCopyGitHubModules(c *C) { // Success via HTTPS (Root directory) destDirForHTTPSRootDir := filepath.Join(destDir, "https-rootdir") - err = copyGitHubModules("github.com/terraform-google-modules/terraform-google-service-accounts.git?ref=v4.1.1", destDirForHTTPSRootDir) + err = copyGitModules("github.com/terraform-google-modules/terraform-google-service-accounts.git?ref=v4.1.1", destDirForHTTPSRootDir) c.Assert(err, IsNil) fInfo, err = os.Stat(filepath.Join(destDirForHTTPSRootDir, "main.tf")) c.Assert(err, IsNil) @@ -50,13 +50,13 @@ func (s *MySuite) TestCopyGitHubModules(c *C) { c.Assert(fInfo.IsDir(), Equals, false) } -func (s *MySuite) TestGetModuleInfo_GitHub(c *C) { - reader := GitHubSourceReader{} +func (s *MySuite) TestGetModuleInfo_Git(c *C) { + reader := GitSourceReader{} - // Invalid GitHub repository - path does not exists + // Invalid git repository - path does not exists badGitRepo := "github.com:not/exist.git" _, err := reader.GetModuleInfo(badGitRepo, tfKindString) - expectedErr := "failed to clone GitHub module at .*" + expectedErr := "failed to clone git module at .*" c.Assert(err, ErrorMatches, expectedErr) // Invalid: Unsupported Module Source @@ -66,13 +66,13 @@ func (s *MySuite) TestGetModuleInfo_GitHub(c *C) { c.Assert(err, ErrorMatches, expectedErr) } -func (s *MySuite) TestGetModule_GitHub(c *C) { - reader := GitHubSourceReader{} +func (s *MySuite) TestGetModule_Git(c *C) { + reader := GitSourceReader{} - // Invalid GitHub repository - path does not exists + // Invalid git repository - path does not exists badGitRepo := "github.com:not/exist.git" err := reader.GetModule(badGitRepo, tfKindString) - expectedErr := "failed to clone GitHub module at .*" + expectedErr := "failed to clone git module at .*" c.Assert(err, ErrorMatches, expectedErr) // Invalid: Unsupported Module Source diff --git a/pkg/sourcereader/sourcereader.go b/pkg/sourcereader/sourcereader.go index e25c2eedac..880a378a8b 100644 --- a/pkg/sourcereader/sourcereader.go +++ b/pkg/sourcereader/sourcereader.go @@ -41,7 +41,7 @@ type SourceReader interface { var readers = map[int]SourceReader{ local: LocalSourceReader{}, embedded: EmbeddedSourceReader{}, - github: GitHubSourceReader{}, + github: GitSourceReader{}, } // IsLocalPath checks if a source path is a local FS path @@ -56,8 +56,8 @@ func IsEmbeddedPath(source string) bool { return strings.HasPrefix(source, "modules/") || strings.HasPrefix(source, "community/modules/") } -// IsGitHubPath checks if a source path points to GitHub -func IsGitHubPath(source string) bool { +// IsGitPath checks if a source path points to GitHub or has the git:: prefix +func IsGitPath(source string) bool { return strings.HasPrefix(source, "github.com") || strings.HasPrefix(source, "git@github.com") 
		strings.HasPrefix(source, "git::")
@@ -75,7 +75,7 @@ func Factory(modPath string) SourceReader {
 		return readers[local]
 	case IsEmbeddedPath(modPath):
 		return readers[embedded]
-	case IsGitHubPath(modPath):
+	case IsGitPath(modPath):
 		return readers[github]
 	default:
 		log.Fatalf(
diff --git a/pkg/sourcereader/sourcereader_test.go b/pkg/sourcereader/sourcereader_test.go
index 9bf8bd01bd..489a5df983 100644
--- a/pkg/sourcereader/sourcereader_test.go
+++ b/pkg/sourcereader/sourcereader_test.go
@@ -109,27 +109,27 @@ func (s *MySuite) TestIsLocalPath(c *C) {
 	c.Assert(ret, Equals, false)
 }
 
-func (s *MySuite) TestIsGitHubRepository(c *C) {
+func (s *MySuite) TestIsGitRepository(c *C) {
 	// False: Is an embedded path
-	ret := IsGitHubPath("modules/anything/else")
+	ret := IsGitPath("modules/anything/else")
 	c.Assert(ret, Equals, false)
 
 	// False: Local path
-	ret = IsGitHubPath("./anything/else")
+	ret = IsGitPath("./anything/else")
 	c.Assert(ret, Equals, false)
 
-	ret = IsGitHubPath("./modules")
+	ret = IsGitPath("./modules")
 	c.Assert(ret, Equals, false)
 
-	ret = IsGitHubPath("../modules/")
+	ret = IsGitPath("../modules/")
 	c.Assert(ret, Equals, false)
 
 	// True, other
-	ret = IsGitHubPath("github.com/modules")
+	ret = IsGitPath("github.com/modules")
 	c.Assert(ret, Equals, true)
 
 	// True, generic git repository
-	ret = IsGitHubPath("git::https://gitlab.com/modules")
+	ret = IsGitPath("git::https://gitlab.com/modules")
 	c.Assert(ret, Equals, true)
 }
 
@@ -144,11 +144,11 @@ func (s *MySuite) TestFactory(c *C) {
 
 	// GitHub modules
 	ghSrcString := Factory("github.com/modules")
-	c.Assert(reflect.TypeOf(ghSrcString), Equals, reflect.TypeOf(GitHubSourceReader{}))
+	c.Assert(reflect.TypeOf(ghSrcString), Equals, reflect.TypeOf(GitSourceReader{}))
 
 	// Git modules
 	gitSrcString := Factory("git::https://gitlab.com/modules")
-	c.Assert(reflect.TypeOf(gitSrcString), Equals, reflect.TypeOf(GitHubSourceReader{}))
+	c.Assert(reflect.TypeOf(gitSrcString), Equals, reflect.TypeOf(GitSourceReader{}))
 }
 
 func (s *MySuite) TestCopyFromPath(c *C) {

From 043b3237f2378c6e406f7d45c327e4f24b9781bb Mon Sep 17 00:00:00 2001
From: Alex Heye
Date: Mon, 19 Sep 2022 22:58:01 +0000
Subject: [PATCH 41/51] Ensure deployment group directory is created

Creation of the group-level directory was skipped previously, as it was
handled automatically for non-git sources. With git sources, it needs to be
created, so this commit adds that logic before handling modules directly in
copySource.
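
As an illustration, consider a (hypothetical) blueprint group whose only
module comes from a remote git source, so nothing else would create the
group directory:

    deployment_groups:
    - group: primary
      modules:
      - source: git::https://gitlab.com/example-org/hpc-modules.git//network/vpc
        kind: terraform
        id: network1

With this change, the primary/ directory is created explicitly before any
modules are handled.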
--- pkg/modulewriter/modulewriter.go | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/pkg/modulewriter/modulewriter.go b/pkg/modulewriter/modulewriter.go index f8019c3505..c71dbe5ddd 100644 --- a/pkg/modulewriter/modulewriter.go +++ b/pkg/modulewriter/modulewriter.go @@ -19,6 +19,7 @@ package modulewriter import ( "embed" + "errors" "fmt" "hpc-toolkit/pkg/config" "hpc-toolkit/pkg/deploymentio" @@ -79,7 +80,9 @@ func WriteDeployment(blueprint *config.Blueprint, outputDir string, overwriteFla return err } - copySource(deploymentDir, &blueprint.DeploymentGroups) + if err := copySource(deploymentDir, &blueprint.DeploymentGroups); err != nil { + return err + } for _, grp := range blueprint.DeploymentGroups { @@ -112,8 +115,17 @@ func WriteDeployment(blueprint *config.Blueprint, outputDir string, overwriteFla return nil } -func copySource(deploymentPath string, deploymentGroups *[]config.DeploymentGroup) { +func copySource(deploymentPath string, deploymentGroups *[]config.DeploymentGroup) error { + for iGrp, grp := range *deploymentGroups { + basePath := filepath.Join(deploymentPath, grp.Name) + // Create the deployment group directory if not already created. + if _, err := os.Stat(basePath); errors.Is(err, os.ErrNotExist) { + deploymentio := deploymentio.GetDeploymentioLocal() + if err := deploymentio.CreateDirectory(basePath); err != nil { + return fmt.Errorf("failed to create directory at %s for deployment group %s: err=%w", basePath, grp.Name, err) + } + } for iMod, module := range grp.Modules { if sourcereader.IsGitPath(module.Source) { continue @@ -122,7 +134,6 @@ func copySource(deploymentPath string, deploymentGroups *[]config.DeploymentGrou /* Copy source files */ moduleName := filepath.Base(module.Source) (*deploymentGroups)[iGrp].Modules[iMod].ModuleName = moduleName - basePath := filepath.Join(deploymentPath, grp.Name) var destPath string switch module.Kind { case "terraform": @@ -137,7 +148,7 @@ func copySource(deploymentPath string, deploymentGroups *[]config.DeploymentGrou reader := sourcereader.Factory(module.Source) if err := reader.GetModule(module.Source, destPath); err != nil { - log.Fatalf("failed to get module from %s to %s: %v", module.Source, destPath, err) + return fmt.Errorf("failed to get module from %s to %s: %v", module.Source, destPath, err) } /* Create module level files */ @@ -145,6 +156,7 @@ func copySource(deploymentPath string, deploymentGroups *[]config.DeploymentGrou writer.addNumModules(1) } } + return nil } func printInstructionsPreamble(kind string, path string) { From bd73407a9e38c0d532321481d004e4079a64b198 Mon Sep 17 00:00:00 2001 From: Alex Heye Date: Tue, 20 Sep 2022 21:14:03 +0000 Subject: [PATCH 42/51] Various fixes, change in cluster name Changes the deployment_name and therefore slurm_cluster_name so that the hostname of the controller will be cleaner looking and allow us to include the name directly in the instructions. In addition, other reviewer feedback is addressed by this commit. 
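
For example, with the deployment variable below, slurm-gcp derives the
controller VM name and internal hostname shown in the comments (assuming
slurm-gcp's default naming, as used in the updated instructions):

    vars:
      deployment_name: cluster
    # controller VM name: cluster-controller
    # internal hostname:  cluster-controller.c.<>.internal

where <> stands for the project ID placeholder used in the instructions.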
--- .../blueprints/static-cluster.yaml | 2 +- .../inter-gcp-project-hybrid-slurm.md | 61 ++++++++++--------- docs/hybrid-slurm-cluster/requirements.txt | 5 +- 3 files changed, 38 insertions(+), 30 deletions(-) diff --git a/docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml b/docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml index 9cc3f35a73..162850527f 100644 --- a/docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml +++ b/docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml @@ -18,7 +18,7 @@ blueprint_name: static-slurm-cluster vars: project_id: ## <> - deployment_name: static-slurm-cluster + deployment_name: cluster region: us-central1 zone: us-central1-c diff --git a/docs/hybrid-slurm-cluster/inter-gcp-project-hybrid-slurm.md b/docs/hybrid-slurm-cluster/inter-gcp-project-hybrid-slurm.md index 5033204238..1460c42167 100644 --- a/docs/hybrid-slurm-cluster/inter-gcp-project-hybrid-slurm.md +++ b/docs/hybrid-slurm-cluster/inter-gcp-project-hybrid-slurm.md @@ -128,9 +128,11 @@ The following APIs are required to complete this demo: * [Compute Engine API][computeapi] * [Cloud DNS API][clouddnsapi] +* [Filestore API][fileapi] [computeapi]: https://cloud.google.com/compute/docs/reference/rest/v1 [clouddnsapi]: https://cloud.google.com/dns/docs/reference/v1 +[fileapi]: https://cloud.google.com/filestore/docs/reference/rest #### Set IAM Roles The service account attaches to the slurm controller in Project A @@ -158,6 +160,15 @@ pip install -r docs/hybrid-slurm-cluster/requirements.txt Before you begin, ensure that you have built the `ghpc` tool in the HPC Toolkit. For more information see the [README.md](../../README.md#quickstart) Quickstart. +The commands in these instructions assume the ghpc binary is installed in a +directory represented in the PATH environment variable. To ensure this is the +case, run `make install` after building `ghpc`: + +```shell +make +make install +``` + ### Create VPC Networks A blueprint for creating VPC networks in each project that can support network and DNS peering can be found at [create-networks.yaml]. This @@ -313,9 +324,9 @@ following: ```shell Terraform group was successfully created in directory peering-networks-demo/primary To deploy, run the following commands: - terraform -chdir=static-slurm-cluster/primary init - terraform -chdir=static-slurm-cluster/primary validate - terraform -chdir=static-slurm-cluster/primary apply + terraform -chdir=cluster/primary init + terraform -chdir=cluster/primary validate + terraform -chdir=cluster/primary apply ``` Execute the terraform commands to deploy the static Slurm cluster in project A. @@ -339,27 +350,20 @@ a deployment that does the following: configure themselves. * Create pubsub actions triggered by changes to the hybrid configuration. -Either in the blueprint directly or on the command line, update the following -deployment variables in the [hybrid-configuration.yaml] blueprint: +The following deployment variables in the [hybrid-configuration.yaml] blueprint +will be set based on your configuration via the command line: * **_project\_id:_** The ID of project B. * **_static\_controller\_hostname:_** The fully qualified internal hostname of the static cluster's controller in project A. The format is - `<>.c.<>.internal`. - -If the deployment vars have been added directly to the blueprint, the following -command will create the deployment directory: - -```shell -ghpc create docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml -``` + `cluster-controller.c.<>.internal`. 
To create the deployment directory with deployment variables passed through the -command line, run the following command with the updated values of -`<>`, `<>` and `<>` instead: +command line, run the following command with the updated values for +`<>` and `<>`: ```shell -ghpc create docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml --vars project_id="<>",static_controller_hostname="<>.c.<>.internal" +ghpc create docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml --vars project_id="<>",static_controller_hostname="cluster-controller.c.<>.internal" ``` If successful, this command will provide 3 terraform operations that can be @@ -399,14 +403,14 @@ Copy the `hybrid.tar.gz` file to the controller VM instance. This can be done in whichever way is easiest for you, `gcloud compute scp` is used here. ```shell -gcloud compute scp --project="<>" --zone=us-central1-c ./hybrid.tar.gz "<>:~" +gcloud compute scp --project="<>" --zone=us-central1-c ./hybrid.tar.gz "cluster-controller:~" ``` Now SSH to the controller VM either using the console or the following gcloud command: ```shell -gcloud compute ssh --project="<>" --zone=us-central1-c "<>" +gcloud compute ssh --project="<>" --zone=us-central1-c "cluster-controller" ``` Decompress the `hybrid.tar.gz` file: @@ -432,11 +436,11 @@ lines that need to be copied will look similar to the following block: ```text NodeName=DEFAULT State=UNKNOWN RealMemory=7552 Boards=1 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 CPUs=1 -NodeName=staticslur-static-ghpc-[0-3] State=CLOUD -NodeSet=staticslur-static-ghpc Nodes=staticslur-static-ghpc-[0-3] -PartitionName=static Nodes=staticslur-static-ghpc State=UP DefMemPerCPU=7552 SuspendTime=300 Oversubscribe=Exclusive Default=YES +NodeName=cluster-static-ghpc-[0-3] State=CLOUD +NodeSet=cluster-static-ghpc Nodes=cluster-static-ghpc-[0-3] +PartitionName=static Nodes=cluster-static-ghpc State=UP DefMemPerCPU=7552 SuspendTime=300 Oversubscribe=Exclusive Default=YES -SuspendExcNodes=staticslur-static-ghpc-[0-3] +SuspendExcNodes=cluster-static-ghpc-[0-3] ``` Depending on the configuration of the static partitions, the `SuspendExcNodes` @@ -453,7 +457,7 @@ Make the following changes to the `/etc/slurm/slurm.conf` file: ```text # slurm.conf ... -SlurmctldHost=<>(<>.c.<>.internal) +SlurmctldHost=cluster-controller(cluster-controller.c.<>.internal) ... include hybrid/cloud.conf ... @@ -517,8 +521,9 @@ side: ```shell $ sinfo PARTITION AVAIL TIMELIMIT NODES STATE NODELIST -static* up infinite 4 idle staticslur-static-ghpc-[0-3] -cloud up infinite 10 idle~ hybridconf-cloud-ghpc-[0-9] +static* up infinite 4 idle cluster-static-ghpc-[0-3] +compute up infinite 20 idle~ hybridconf-compute-ghpc-[0-19] +debug up infinite 10 idle~ hybridconf-debug-ghpc-[0-9] ``` To verify that your local partitions are still active, run a simple test with @@ -526,7 +531,7 @@ To verify that your local partitions are still active, run a simple test with ```shell $ srun -N 1 hostname -staticslur-static-ghpc-0 +cluster-static-ghpc-0 ``` Now verify the cloud partition is running with a similar test. Note that since a @@ -535,6 +540,6 @@ Subsequent uses of the cloud nodes before being suspended will be near instantaneous after the initial startup cost. 
```shell
-$ srun -N 1 -p cloud hostname
-hybridconf-cloud-ghpc-0
+$ srun -N 1 -p debug hostname
+hybridconf-debug-ghpc-0
 ```
diff --git a/docs/hybrid-slurm-cluster/requirements.txt b/docs/hybrid-slurm-cluster/requirements.txt
index ae1919a138..a99bcfa21e 100644
--- a/docs/hybrid-slurm-cluster/requirements.txt
+++ b/docs/hybrid-slurm-cluster/requirements.txt
@@ -1,2 +1,5 @@
 addict~=2.0
-google-cloud-pubsub~=2.0
\ No newline at end of file
+google-cloud-pubsub~=2.0
+google-api-python-client==2.61.0
+httplib2==0.20.4
+PyYAML==6.0
\ No newline at end of file

From 9d4712ceb6f0318b81c56a2b3b77c69a4927aed2 Mon Sep 17 00:00:00 2001
From: Alex Heye
Date: Tue, 20 Sep 2022 18:53:51 +0000
Subject: [PATCH 43/51] Break out group directory creation into function

Breaks the group directory creation into a separate function from
copySource. This allows for better testing and better separation of tasks.
Tests have been written to ensure directories are created correctly.

---
 pkg/modulewriter/modulewriter.go      | 25 ++++++++----
 pkg/modulewriter/modulewriter_test.go | 58 +++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 7 deletions(-)

diff --git a/pkg/modulewriter/modulewriter.go b/pkg/modulewriter/modulewriter.go
index c71dbe5ddd..9f6e629e36 100644
--- a/pkg/modulewriter/modulewriter.go
+++ b/pkg/modulewriter/modulewriter.go
@@ -84,6 +84,10 @@ func WriteDeployment(blueprint *config.Blueprint, outputDir string, overwriteFla
 		return err
 	}
 
+	if err := createGroupDirs(deploymentDir, &blueprint.DeploymentGroups); err != nil {
+		return err
+	}
+
 	for _, grp := range blueprint.DeploymentGroups {
 
 		deploymentName, err := blueprint.DeploymentName()
@@ -115,17 +119,24 @@ func WriteDeployment(blueprint *config.Blueprint, outputDir string, overwriteFla
 	return nil
 }
 
+func createGroupDirs(deploymentPath string, deploymentGroups *[]config.DeploymentGroup) error {
+	for _, grp := range *deploymentGroups {
+		groupPath := filepath.Join(deploymentPath, grp.Name)
+		// Create the deployment group directory if not already created.
+		if _, err := os.Stat(groupPath); errors.Is(err, os.ErrNotExist) {
+			if err := os.Mkdir(groupPath, 0755); err != nil {
+				return fmt.Errorf("failed to create directory at %s for deployment group %s: err=%w",
+					groupPath, grp.Name, err)
+			}
+		}
+	}
+	return nil
+}
+
 func copySource(deploymentPath string, deploymentGroups *[]config.DeploymentGroup) error {
 	for iGrp, grp := range *deploymentGroups {
 		basePath := filepath.Join(deploymentPath, grp.Name)
-		// Create the deployment group directory if not already created.
- if _, err := os.Stat(basePath); errors.Is(err, os.ErrNotExist) { - deploymentio := deploymentio.GetDeploymentioLocal() - if err := deploymentio.CreateDirectory(basePath); err != nil { - return fmt.Errorf("failed to create directory at %s for deployment group %s: err=%w", basePath, grp.Name, err) - } - } for iMod, module := range grp.Modules { if sourcereader.IsGitPath(module.Source) { continue diff --git a/pkg/modulewriter/modulewriter_test.go b/pkg/modulewriter/modulewriter_test.go index d202b8fb70..75c7e65294 100644 --- a/pkg/modulewriter/modulewriter_test.go +++ b/pkg/modulewriter/modulewriter_test.go @@ -225,6 +225,64 @@ func (s *MySuite) TestWriteDeployment(c *C) { c.Check(err, IsNil) } +func (s *MySuite) TestCreateGroupDirs(c *C) { + // Setup + testDeployDir := filepath.Join(testDir, "test_createGroupDirs") + if err := os.Mkdir(testDeployDir, 0755); err != nil { + log.Fatal("Failed to create test deployment directory for createGroupDirs") + } + groupNames := []string{"group0", "group1", "group2"} + + // No deployment groups + testDepGroups := []config.DeploymentGroup{} + err := createGroupDirs(testDeployDir, &testDepGroups) + c.Check(err, IsNil) + + // Single deployment group + testDepGroups = []config.DeploymentGroup{{Name: groupNames[0]}} + err = createGroupDirs(testDeployDir, &testDepGroups) + c.Check(err, IsNil) + grp0Path := filepath.Join(testDeployDir, groupNames[0]) + _, err = os.Stat(grp0Path) + c.Check(errors.Is(err, os.ErrNotExist), Equals, false) + c.Check(err, IsNil) + err = os.Remove(grp0Path) + c.Check(err, IsNil) + + // Multiple deployment groups + testDepGroups = []config.DeploymentGroup{ + {Name: groupNames[0]}, + {Name: groupNames[1]}, + {Name: groupNames[2]}, + } + err = createGroupDirs(testDeployDir, &testDepGroups) + c.Check(err, IsNil) + // Check for group 0 + _, err = os.Stat(grp0Path) + c.Check(errors.Is(err, os.ErrNotExist), Equals, false) + c.Check(err, IsNil) + err = os.Remove(grp0Path) + c.Check(err, IsNil) + // Check for group 1 + grp1Path := filepath.Join(testDeployDir, groupNames[1]) + _, err = os.Stat(grp1Path) + c.Check(errors.Is(err, os.ErrNotExist), Equals, false) + c.Check(err, IsNil) + err = os.Remove(grp1Path) + c.Check(err, IsNil) + // Check for group 2 + grp2Path := filepath.Join(testDeployDir, groupNames[2]) + _, err = os.Stat(grp2Path) + c.Check(errors.Is(err, os.ErrNotExist), Equals, false) + c.Check(err, IsNil) + err = os.Remove(grp2Path) + c.Check(err, IsNil) + + // deployment group(s) already exists + err = createGroupDirs(testDeployDir, &testDepGroups) + c.Check(err, IsNil) +} + func (s *MySuite) TestWriteDeployment_BadDeploymentName(c *C) { testBlueprint := getBlueprintForTest() var e *config.InputValueError From 15d7f0220d4804ba08807ef5866312f92eceb522 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 20 Sep 2022 15:57:32 -0500 Subject: [PATCH 44/51] Use fully qualified Ansible resource names --- community/modules/file-system/nfs-server/scripts/mount.yaml | 4 ++-- modules/file-system/filestore/scripts/mount.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/community/modules/file-system/nfs-server/scripts/mount.yaml b/community/modules/file-system/nfs-server/scripts/mount.yaml index b39a2f4adb..f7fbe58d5e 100644 --- a/community/modules/file-system/nfs-server/scripts/mount.yaml +++ b/community/modules/file-system/nfs-server/scripts/mount.yaml @@ -22,14 +22,14 @@ url: "http://metadata.google.internal/computeMetadata/v1/instance/attributes" tasks: - name: Read metadata network_storage information - uri: + 
ansible.builtin.uri: url: "{{ url }}/{{ meta_key }}" method: GET headers: Metadata-Flavor: "Google" register: storage - name: Mount file systems - mount: + ansible.posix.mount: src: "{{ item.server_ip }}:/{{ item.remote_mount }}" path: "{{ item.local_mount }}" opts: "{{ item.mount_options }}" diff --git a/modules/file-system/filestore/scripts/mount.yaml b/modules/file-system/filestore/scripts/mount.yaml index b39a2f4adb..f7fbe58d5e 100644 --- a/modules/file-system/filestore/scripts/mount.yaml +++ b/modules/file-system/filestore/scripts/mount.yaml @@ -22,14 +22,14 @@ url: "http://metadata.google.internal/computeMetadata/v1/instance/attributes" tasks: - name: Read metadata network_storage information - uri: + ansible.builtin.uri: url: "{{ url }}/{{ meta_key }}" method: GET headers: Metadata-Flavor: "Google" register: storage - name: Mount file systems - mount: + ansible.posix.mount: src: "{{ item.server_ip }}:/{{ item.remote_mount }}" path: "{{ item.local_mount }}" opts: "{{ item.mount_options }}" From 4b5874f208f92fa3eeec11c45ba81bc24a77ce13 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 21 Sep 2022 09:51:19 -0500 Subject: [PATCH 45/51] Fix fully qualified name for Ansible resource --- .../modules/scripts/omnia-install/templates/install_omnia.tpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/scripts/omnia-install/templates/install_omnia.tpl b/community/modules/scripts/omnia-install/templates/install_omnia.tpl index 5989e8f9b1..6164c95ebf 100644 --- a/community/modules/scripts/omnia-install/templates/install_omnia.tpl +++ b/community/modules/scripts/omnia-install/templates/install_omnia.tpl @@ -29,7 +29,7 @@ mode: 0700 owner: "{{ username }}" - name: Create keys - ansible.builtin.openssh_keypair: + community.crypto.openssh_keypair: path: "{{ pub_key_file }}" owner: "{{ username }}" - name: Copy public key to authorized keys From 0e4d36c757345ad4e45fbf65e69bd10077d648e5 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 22 Sep 2022 15:07:21 -0500 Subject: [PATCH 46/51] Perform regular cleanup of Filestore VPC network peerings --- tools/clean-filestore-limit.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/clean-filestore-limit.sh b/tools/clean-filestore-limit.sh index 988e1271cf..b8cb03d48c 100755 --- a/tools/clean-filestore-limit.sh +++ b/tools/clean-filestore-limit.sh @@ -27,6 +27,22 @@ ACTIVE_FILESTORE=$(gcloud filestore instances list --project "${PROJECT_ID}" --f if [[ -z "$ACTIVE_BUILDS" && -z "$ACTIVE_FILESTORE" ]]; then echo "Disabling Filestore API..." gcloud services disable file.googleapis.com --force --project "${PROJECT_ID}" + + echo "Deleting all Filestore peering networks" + peerings=$(gcloud compute networks peerings list --project "${PROJECT_ID}" --format="value(peerings.name,name)") + while read -r peering; do + parr=("$peering") + IFS=";" read -ra peers <<<"${parr[0]}" + network=${parr[1]} + + for peer in "${peers[@]}"; do + if [[ "$peer" =~ ^filestore-peer-[0-9]+$ ]]; then + echo "Deleting $peer from $network" + gcloud --project "${PROJECT_ID}" compute networks peerings delete --network "$network" "$peer" + fi + done + done <<<"$peerings" + echo "Re-enabling Filestore API..." gcloud services enable file.googleapis.com --project "${PROJECT_ID}" echo "Re-enabled Filestore API..." 
From 31ae677e8f810927bc70ca3d3cc88f63bedde221 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Sun, 25 Sep 2022 22:25:07 -0500 Subject: [PATCH 47/51] Fix filestore peering network cleanup script --- tools/clean-filestore-limit.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/clean-filestore-limit.sh b/tools/clean-filestore-limit.sh index b8cb03d48c..8b8df56d5b 100755 --- a/tools/clean-filestore-limit.sh +++ b/tools/clean-filestore-limit.sh @@ -29,10 +29,17 @@ if [[ -z "$ACTIVE_BUILDS" && -z "$ACTIVE_FILESTORE" ]]; then gcloud services disable file.googleapis.com --force --project "${PROJECT_ID}" echo "Deleting all Filestore peering networks" + # the output of this command matches + # filestore-peer-426414172628;filestore-peer-646290499454 default peerings=$(gcloud compute networks peerings list --project "${PROJECT_ID}" --format="value(peerings.name,name)") while read -r peering; do - parr=("$peering") + # split the output into: + # 0: a semi-colon separated list of peerings + # 1: the name of a VPC network + read -ra parr <<<"$peering" + # split the list of peerings into an array IFS=";" read -ra peers <<<"${parr[0]}" + # capture the VPC network network=${parr[1]} for peer in "${peers[@]}"; do From 2538146bcfd2899bed0b27dab51e0e8d19b2f6ce Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Sun, 25 Sep 2022 22:36:17 -0500 Subject: [PATCH 48/51] Remove default URLs from Spack tutorials --- docs/tutorials/gromacs/spack-gromacs.yaml | 1 - docs/tutorials/openfoam/spack-openfoam.yaml | 1 - docs/tutorials/wrfv3/spack-wrfv3.yaml | 1 - 3 files changed, 3 deletions(-) diff --git a/docs/tutorials/gromacs/spack-gromacs.yaml b/docs/tutorials/gromacs/spack-gromacs.yaml index 56da1f3bc1..cbbba9f9d0 100644 --- a/docs/tutorials/gromacs/spack-gromacs.yaml +++ b/docs/tutorials/gromacs/spack-gromacs.yaml @@ -37,7 +37,6 @@ deployment_groups: source: community/modules/scripts/spack-install settings: install_dir: /apps/spack - spack_url: https://github.com/spack/spack spack_ref: v0.18.0 log_file: /var/log/spack.log configs: diff --git a/docs/tutorials/openfoam/spack-openfoam.yaml b/docs/tutorials/openfoam/spack-openfoam.yaml index ed1de2ce09..aa908cbd89 100644 --- a/docs/tutorials/openfoam/spack-openfoam.yaml +++ b/docs/tutorials/openfoam/spack-openfoam.yaml @@ -37,7 +37,6 @@ deployment_groups: source: community/modules/scripts/spack-install settings: install_dir: /apps/spack - spack_url: https://github.com/spack/spack spack_ref: v0.18.0 log_file: /var/log/spack.log configs: diff --git a/docs/tutorials/wrfv3/spack-wrfv3.yaml b/docs/tutorials/wrfv3/spack-wrfv3.yaml index d37cfd80a0..96b5ee7eb6 100644 --- a/docs/tutorials/wrfv3/spack-wrfv3.yaml +++ b/docs/tutorials/wrfv3/spack-wrfv3.yaml @@ -37,7 +37,6 @@ deployment_groups: source: community/modules/scripts/spack-install settings: install_dir: /apps/spack - spack_url: https://github.com/spack/spack spack_ref: v0.18.0 log_file: /var/log/spack.log configs: From 60265f1b01c15d1bc6784f8c5c362bbedb58cbb0 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 26 Sep 2022 09:07:37 -0500 Subject: [PATCH 49/51] Avoid spurious errors in Spack log - do not attempt to re-create a Spack environment that already exists --- .../modules/scripts/spack-install/templates/install_spack.tpl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/community/modules/scripts/spack-install/templates/install_spack.tpl b/community/modules/scripts/spack-install/templates/install_spack.tpl index b8ddb7975b..085a23e76e 100755 --- 
a/community/modules/scripts/spack-install/templates/install_spack.tpl +++ b/community/modules/scripts/spack-install/templates/install_spack.tpl @@ -99,6 +99,7 @@ echo "$PREFIX Installing root spack specs..." echo "$PREFIX Configuring spack environments" %{if ENVIRONMENTS != null ~} %{for e in ENVIRONMENTS ~} +if [ ! -d ${INSTALL_DIR}/var/spack/environments/${e.name} ]; then %{if e.content != null} { cat << 'EOF' > ${INSTALL_DIR}/spack_env.yaml @@ -129,6 +130,7 @@ EOF spack env deactivate >> ${LOG_FILE} 2>&1 spack clean -s >> ${LOG_FILE} 2>&1 +fi %{endfor ~} %{endif ~} From 58b3dd5aee6e5402c5959f167938a99e20aef28b Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 29 Sep 2022 10:32:04 -0500 Subject: [PATCH 50/51] Fix Ansible module for upgrading setuptools in HTCondor autoscaler --- .../files/install-htcondor-autoscaler-deps.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scripts/htcondor-install/files/install-htcondor-autoscaler-deps.yml b/community/modules/scripts/htcondor-install/files/install-htcondor-autoscaler-deps.yml index 0351dfd75f..db989f9d40 100644 --- a/community/modules/scripts/htcondor-install/files/install-htcondor-autoscaler-deps.yml +++ b/community/modules/scripts/htcondor-install/files/install-htcondor-autoscaler-deps.yml @@ -24,13 +24,13 @@ - name: Create virtual environment for HTCondor autoscaler ansible.builtin.pip: name: pip - version: 21.3.1 # last Python 2.7-compatible release + version: 21.3.1 # last Python 3.6-compatible release virtualenv: /usr/local/htcondor virtualenv_command: /usr/bin/python3 -m venv - name: Install latest setuptools ansible.builtin.pip: name: setuptools - state: 44.1.1 # last Python 2.7-compatible release + version: 59.6.0 # last Python 3.6-compatible release virtualenv: /usr/local/htcondor virtualenv_command: /usr/bin/python3 -m venv - name: Install HTCondor autoscaler dependencies From 75181313bacf3eddd211b37759d1ddb780afb434 Mon Sep 17 00:00:00 2001 From: Alex Heye Date: Tue, 4 Oct 2022 21:10:34 +0000 Subject: [PATCH 51/51] Update version to 1.6.0 --- cmd/root.go | 2 +- .../compute/SchedMD-slurm-on-gcp-partition/versions.tf | 2 +- .../modules/database/slurm-cloudsql-federation/versions.tf | 4 ++-- community/modules/file-system/nfs-server/versions.tf | 2 +- community/modules/project/service-enablement/versions.tf | 2 +- .../scheduler/SchedMD-slurm-on-gcp-controller/versions.tf | 2 +- .../scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf | 2 +- .../modules/scheduler/cloud-batch-login-node/versions.tf | 2 +- community/modules/scheduler/htcondor-configure/versions.tf | 2 +- community/modules/scripts/wait-for-startup/versions.tf | 2 +- modules/compute/vm-instance/versions.tf | 4 ++-- modules/file-system/filestore/versions.tf | 4 ++-- modules/monitoring/dashboard/versions.tf | 2 +- modules/network/pre-existing-vpc/versions.tf | 2 +- modules/scripts/startup-script/versions.tf | 2 +- 15 files changed, 18 insertions(+), 18 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index 38d02a9f68..1a99a4f59d 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -42,7 +42,7 @@ HPC deployments on the Google Cloud Platform.`, log.Fatalf("cmd.Help function failed: %s", err) } }, - Version: "v1.5.0", + Version: "v1.6.0", Annotations: annotation, } ) diff --git a/community/modules/compute/SchedMD-slurm-on-gcp-partition/versions.tf b/community/modules/compute/SchedMD-slurm-on-gcp-partition/versions.tf index 056ac4edc4..7688479f49 100644 --- a/community/modules/compute/SchedMD-slurm-on-gcp-partition/versions.tf +++ 
b/community/modules/compute/SchedMD-slurm-on-gcp-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-partition/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-partition/v1.6.0" } required_version = ">= 0.14.0" diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index cf1f5fe1cb..83fc3bcb62 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -30,10 +30,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.6.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.6.0" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index 43c741ea6b..dd04c20cf8 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.6.0" } required_version = ">= 0.14.0" diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index 04b4813a6b..c0c446cfa0 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.6.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf index 3e4689a994..a06a6a442f 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-controller/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-controller/v1.6.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf index 5608f15aac..ed4fa6ad84 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-login-node/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-login-node/v1.6.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scheduler/cloud-batch-login-node/versions.tf b/community/modules/scheduler/cloud-batch-login-node/versions.tf 
index bf49cb9e0d..9bf593bd90 100644 --- a/community/modules/scheduler/cloud-batch-login-node/versions.tf +++ b/community/modules/scheduler/cloud-batch-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-batch-login-node/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-batch-login-node/v1.6.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scheduler/htcondor-configure/versions.tf b/community/modules/scheduler/htcondor-configure/versions.tf index 0a4c1c40fa..d62fbd8d09 100644 --- a/community/modules/scheduler/htcondor-configure/versions.tf +++ b/community/modules/scheduler/htcondor-configure/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-configure/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-configure/v1.6.0" } required_version = ">= 0.13.0" diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index 0a14e4cca1..bc175d5e04 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.6.0" } required_version = ">= 0.14.0" diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 6a3d002173..1cf5dad238 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -27,10 +27,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.6.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.6.0" } required_version = ">= 0.14.0" diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index 00469f5359..a4fe24b1fc 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.6.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.6.0" } required_version = ">= 0.14.0" diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index c1af75d433..4aa824c1a3 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.6.0" } required_version = ">= 0.14.0" diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf index 2122ca966e..ba9e4e34d7 100644 --- a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = 
"blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.6.0" } required_version = ">= 0.14.0" diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf index 277f664325..fa9846ca54 100644 --- a/modules/scripts/startup-script/versions.tf +++ b/modules/scripts/startup-script/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.6.0" } required_version = ">= 0.14.0"