From 17399f0b40fd17a914ed126a2307412b9ba04ea4 Mon Sep 17 00:00:00 2001
From: Harsh Thakkar
Date: Wed, 17 Jan 2024 05:07:35 +0000
Subject: [PATCH] Update spack gromacs example tutorial and reference to use Slurm V6

---
 docs/tutorials/gromacs/spack-gromacs.md   | 67 +++++++++--------------
 docs/tutorials/gromacs/spack-gromacs.yaml | 52 ++++++++++--------
 2 files changed, 54 insertions(+), 65 deletions(-)

diff --git a/docs/tutorials/gromacs/spack-gromacs.md b/docs/tutorials/gromacs/spack-gromacs.md
index c8719aaba7..ce8400e1e5 100644
--- a/docs/tutorials/gromacs/spack-gromacs.md
+++ b/docs/tutorials/gromacs/spack-gromacs.md
@@ -5,7 +5,7 @@ easy for customers to deploy HPC environments on Google Cloud.
 
 In this tutorial you will use the HPC Toolkit to:
 
-* Deploy a [Slurm](https://github.com/SchedMD/slurm-gcp#readme) HPC cluster on
+* Deploy a [Slurm](https://github.com/GoogleCloudPlatform/slurm-gcp#readme) HPC cluster on
   Google Cloud
 * Use [Spack](https://spack.io/) to install the Gromacs application and all of
   its dependencies
@@ -13,10 +13,10 @@ In this tutorial you will use the HPC Toolkit to:
   cluster
 * Tear down the cluster
 
-Estimated time to complete:
-The tutorial takes 2 hr. to complete,
-of which 1.5 hr is for installing software
-(without cache).
+Estimated time to complete:
+The tutorial takes 2 hr. to complete,
+of which 1.5 hr is for installing software
+(without cache).
 
 > **_NOTE:_** With a complete Spack cache, the tutorial takes 30 min.
 
@@ -75,7 +75,7 @@ which should be open in the Cloud Shell Editor (on the left).
 
 This file describes the cluster you will deploy. It defines:
 
-* the existing default network from your project
+* a VPC network
 * a monitoring dashboard with metrics on your cluster
 * a definition of a custom Spack installation
 * a startup script that
@@ -84,7 +84,6 @@ This file describes the cluster you will deploy. It defines:
   * sets up a Spack environment including downloading an example input deck
   * places a submission script on a shared drive
 * a Slurm cluster
-  * a Slurm login node
   * a Slurm controller
   * An auto-scaling Slurm partition
 
@@ -106,27 +105,13 @@ contains the terraform needed to deploy your cluster.
 
 ## Deploy the Cluster
 
-Use the following commands to run terraform and deploy your cluster.
+Use the command below to deploy your cluster.
 
 ```bash
-terraform -chdir=spack-gromacs/primary init
-terraform -chdir=spack-gromacs/primary apply
+./ghpc deploy spack-gromacs
 ```
 
-The `terraform apply` command will generate a _plan_ that describes the Google
-Cloud resources that will be deployed.
-
-You can review the plan and then start the deployment by typing
-**`yes [enter]`**.
-
-The deployment will take about 30 seconds. There should be regular status updates
-in the terminal.
-
-If the `apply` is successful, a message similar to the following will be
-displayed:
-
-
-
+After the deployment is finished, you should see a message similar to the one below.
 
 ```shell
 Apply complete! Resources: xx added, 0 changed, 0 destroyed.
@@ -144,30 +129,30 @@ controller.
This command can be used to view progress and check for completion of the startup script: ```bash -gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project slurm-spack-gromacs-controller | grep google_metadata_script_runner +gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project spackgroma-controller | grep google_metadata_script_runner ``` When the startup script has finished running you will see the following line as the final output from the above command: -> _`slurm-spack-gromacs-controller google_metadata_script_runner: Finished running startup scripts.`_ +> _`spackgroma-controller google_metadata_script_runner: Finished running startup scripts.`_ Optionally while you wait, you can see your deployed VMs on Google Cloud Console. Open the link below in a new window. Look for -`slurm-spack-gromacs-controller` and `slurm-spack-gromacs-login0`. If you don't +`spackgroma-controller`. If you don't see your VMs make sure you have the correct project selected (top left). ```text https://console.cloud.google.com/compute?project= ``` -## Connecting to the login node +## Connecting to the controller node -Once the startup script has completed, connect to the login node. +Once the startup script has completed, connect to the controller node. -Use the following command to ssh into the login node from cloud shell: +Use the following command to ssh into the controller node from cloud shell: ```bash -gcloud compute ssh slurm-spack-gromacs-login0 --zone us-central1-c --project +gcloud compute ssh spackgroma-controller --zone us-central1-c --project ``` You may be prompted to set up SSH. If so follow the prompts and if asked for a @@ -191,15 +176,15 @@ following instructions: https://console.cloud.google.com/compute?project= ``` -1. Click on the `SSH` button associated with the `slurm-spack-gromacs-login0` +1. Click on the `SSH` button associated with the `spackgroma-controller` instance. This will open a separate pop up window with a terminal into our newly - created Slurm login VM. + created Slurm controller VM. ## Run a Job on the Cluster - **The commands below should be run on the Slurm login node.** + **The commands below should be run on the Slurm controller node.** We will use the submission script (see line 122 of the blueprint) to submit a Gromacs job. @@ -213,7 +198,7 @@ Gromacs job. 2. Submit the job to Slurm to be scheduled: ```bash - sbatch /apps/gromacs/submit_gromacs.sh + sbatch /opt/apps/gromacs/submit_gromacs.sh ``` 3. Once submitted, you can watch the job progress by repeatedly calling the @@ -227,7 +212,7 @@ The `sbatch` command trigger Slurm to auto-scale up several nodes to run the job You can refresh the `Compute Engine` > `VM instances` page and see that additional VMs are being/have been created. These will be named something like -`slurm-spack-gromacs-compute-0-0`. +`spackgroma-comput-0`. When running `squeue`, observe the job status start as `CF` (configuring), change to `R` (running) once the compute VMs have been created, and finally `CG` @@ -247,8 +232,8 @@ about 5 minutes to run. Several files will have been generated in the `test_run/` folder you created. The `md.log` and `slurm-1.out` files have information on the run such as -performance. You can view these files by running the following commandsq on the -login node: +performance. 
You can view these files by running the following commands on the +controller node: ```bash cat slurm-*.out @@ -273,9 +258,9 @@ https://console.cloud.google.com/monitoring/dashboards?project= **_NOTE:_** If you are accessing the login node terminal via a separate pop-up +> **_NOTE:_** If you are accessing the controller node terminal via a separate pop-up > then make sure to call `exit` in the pop-up window. ```bash @@ -285,7 +270,7 @@ exit Run the following command in the cloud shell terminal to destroy the cluster: ```bash -terraform -chdir=spack-gromacs/primary destroy -auto-approve +./ghpc destroy spack-gromacs ``` When complete you should see something like: diff --git a/docs/tutorials/gromacs/spack-gromacs.yaml b/docs/tutorials/gromacs/spack-gromacs.yaml index fe5bf475b1..285443c0b8 100644 --- a/docs/tutorials/gromacs/spack-gromacs.yaml +++ b/docs/tutorials/gromacs/spack-gromacs.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: modules/network/pre-existing-vpc + source: modules/network/vpc - id: hpc_dash source: modules/monitoring/dashboard @@ -35,8 +35,8 @@ deployment_groups: - id: spack-setup source: community/modules/scripts/spack-setup settings: - install_dir: /apps/spack - spack_ref: v0.19.0 + install_dir: /opt/apps/spack + spack_ref: v0.20.0 - id: spack-execute source: community/modules/scripts/spack-execute @@ -88,7 +88,7 @@ deployment_groups: # fi # spack buildcache keys --install --trust - spack config --scope defaults add config:build_stage:/apps/spack/spack-stage + spack config --scope defaults add config:build_stage:/opt/apps/spack/spack-stage spack config --scope defaults add -f /tmp/projections-config.yaml spack config --scope site add -f /tmp/slurm-external-config.yaml @@ -107,22 +107,26 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: + # remove lustre client temporary to avoid startup failure due to known + # issue. + - type: shell + destination: remove_lustre_client.sh + content: | + #!/bin/bash + rm /etc/yum.repos.d/lustre-client.repo - $(spack-execute.spack_runner) - type: shell destination: setup_gromacs.sh content: | #!/bin/bash - source /apps/spack/share/spack/setup-env.sh + source /opt/apps/spack/share/spack/setup-env.sh spack env activate gromacs - chmod -R a+rwX /apps/spack/var/spack/environments/gromacs - mkdir -p /apps/gromacs - chmod a+rwx /apps/gromacs - cd /apps/gromacs + cd /opt/apps/gromacs wget --no-verbose https://ftp.gromacs.org/pub/benchmarks/water_GMX50_bare.tar.gz tar xzf water_GMX50_bare.tar.gz - type: data - destination: /apps/gromacs/submit_gromacs.sh + destination: /opt/apps/gromacs/submit_gromacs.sh content: | #!/bin/bash #SBATCH -N 2 @@ -131,36 +135,36 @@ deployment_groups: # Size can be 0000.65 0000.96 0001.5 0003 0006 0012 0024 0048 0096 0192 0384 0768 1536 3072 # Type can be 'pme' or 'rf' - source /apps/spack/share/spack/setup-env.sh + source /opt/apps/spack/share/spack/setup-env.sh spack env activate gromacs # Check that gmx_mpi exists which gmx_mpi cd $SLURM_SUBMIT_DIR - cp /apps/gromacs/water-cut1.0_GMX50_bare/1536/* . + cp /opt/apps/gromacs/water-cut1.0_GMX50_bare/1536/* . 
scontrol show hostnames ${SLURM_JOB_NODELIST} > hostfile gmx_mpi grompp -f pme.mdp -c conf.gro -p topol.top -o input.tpr mpirun -n 60 -hostfile hostfile -ppn 30 gmx_mpi mdrun -notunepme -dlb yes -v -resethway -noconfout -nsteps 4000 -s input.tpr + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] + settings: + node_count_dynamic_max: 20 + bandwidth_tier: gvnic_enabled + - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [compute_nodeset] settings: partition_name: compute - max_node_count: 20 - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - network1 - compute_partition settings: + disable_controller_public_ips: false + controller_startup_scripts_timeout: 21600 controller_startup_script: $(controller-setup.startup_script) - login_node_count: 1 - - - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - slurm_controller
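
For quick reference, the core of this change is the switch from the `SchedMD-slurm-on-gcp-*` modules to the Slurm-GCP v6 module family, in which a nodeset describes the compute VMs, a partition groups nodesets, and the controller consumes partitions (the separate login-node module is dropped, since the tutorial now connects to the controller). The sketch below condenses the wiring added above; the module IDs and values mirror the blueprint changes in this patch, and anything not shown is assumed to keep its defaults.

```yaml
  # Condensed view of the Slurm-GCP v6 wiring introduced by this patch
  # (IDs and values mirror the blueprint above; other settings use defaults).
  - id: compute_nodeset
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network1]
    settings:
      node_count_dynamic_max: 20        # auto-scaling compute nodes
      bandwidth_tier: gvnic_enabled

  - id: compute_partition
    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
    use: [compute_nodeset]              # partition is built from the nodeset
    settings:
      partition_name: compute

  - id: slurm_controller
    source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
    use: [network1, compute_partition]  # controller consumes the partition
    settings:
      disable_controller_public_ips: false
      controller_startup_scripts_timeout: 21600
      controller_startup_script: $(controller-setup.startup_script)
```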