
Commit

Merge pull request #19 from oracle-quickstart/2.10.1
2.10.1
arnaudfroidmont authored Mar 24, 2023
2 parents 10a42e0 + 87caac8 commit 25c5704
Showing 129 changed files with 2,612 additions and 735 deletions.
46 changes: 43 additions & 3 deletions README.md
@@ -265,7 +265,7 @@ Example:
The name of the cluster must be
queueName-clusterNumber-instanceType_keyword

The keyword will need to match the one from /opt/oci-hpc/conf/queues.conf to be regirstered in Slurm
The keyword will need to match the one from /opt/oci-hpc/conf/queues.conf to be registered in Slurm
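For illustration, here is a minimal sketch of how the pieces fit together; the queue name, cluster number, and keyword below are assumptions, not values shipped with this stack:
```
# Hypothetical values: a queue named "compute" whose entry in
# /opt/oci-hpc/conf/queues.conf uses the keyword "hpc-default"
queue_name="compute"
cluster_number=1
keyword="hpc-default"

# The resulting cluster name that Slurm will register
cluster_name="${queue_name}-${cluster_number}-${keyword}"
echo "${cluster_name}"   # prints compute-1-hpc-default
```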

### Cluster Deletion:
```
@@ -293,8 +293,8 @@ Example of cluster command to add a new user:
```cluster user add name```
By default, a `privilege` group is created that has access to the NFS and can have sudo access on all nodes (defined at stack creation; this group has ID 9876). The group name can be modified.
```cluster user add name --gid 9876```
To generate a user-specific key for passwordless ssh between nodes, use --ssh.
```cluster user add name --ssh --gid 9876```
To avoid generating a user-specific key for passwordless ssh between nodes, use --nossh.
```cluster user add name --nossh --gid 9876```

# Shared home folder

@@ -318,3 +318,43 @@ $ max_nodes --> Information about all the partitions and their respective cluste

$ max_nodes --include_cluster_names xxx yyy zzz --> where xxx, yyy, zzz are cluster names. Provide a space-separated list of cluster names; information will be displayed about those clusters and the maximum number of nodes distributed evenly per partition


## validation.py usage

Use the alias `validate` to run the Python script validation.py. This script can be run only from the bastion.

The script performs the following checks:
- The number of nodes is consistent across resize, /etc/hosts, Slurm, topology.conf, the OCI console, and the inventory files
- PCIe bandwidth check
- GPU throttle check
- The md5 sum of the /etc/hosts file on each node matches the one on the bastion

Provide at least one argument: [-n NUM_NODES] [-p PCIE] [-g GPU_THROTTLE] [-e ETC_HOSTS]

The optional argument [-cn CLUSTER_NAMES] can be combined with any of the above. It takes a file listing, one per line, the clusters for which you want to run the number-of-nodes, PCIe, GPU throttle, and/or /etc/hosts md5 sum checks.

For the PCIe, GPU throttle, and /etc/hosts md5 sum checks, you can either pass y or Y together with -cn, or pass a host file path (one host per line) directly to the corresponding argument. For the number-of-nodes check, pass y either on its own or together with -cn.

Below are some examples for running this script.

validate -n y --> This will validate that the number of nodes is consistent across resize, /etc/hosts, Slurm, topology.conf, the OCI console, and the inventory files. The clusters considered will be the default cluster (if any) and the cluster(s) found in the /opt/oci-hpc/autoscaling/clusters directory. The number of nodes will be taken from the resize script for those clusters.

validate -n y -cn <cluster name file> --> This will validate that the number of nodes is consistent across resize, /etc/hosts, Slurm, topology.conf, the OCI console, and the inventory files. It will also check whether the md5 sum of the /etc/hosts file on all nodes matches the one on the bastion. The clusters considered will be those listed in the file specified by the -cn option. The number of nodes will be taken from the resize script for those clusters.

validate -p y -cn <cluster name file> --> This will run the PCIe bandwidth check. The clusters considered will be those listed in the file specified by the -cn option. The nodes considered will be taken from the resize script for those clusters.

validate -p <pcie host file> --> This will run the PCIe bandwidth check on the hosts listed in the given file. The PCIe host file should have one host name per line.

validate -g y -cn <cluster name file> --> This will run the GPU throttle check. The clusters considered will be those listed in the file specified by the -cn option. The nodes considered will be taken from the resize script for those clusters.

validate -g <gpu check host file> --> This will run the GPU throttle check on the hosts listed in the given file. The GPU check host file should have one host name per line.

validate -e y -cn <cluster name file> --> This will run the /etc/hosts md5 sum check. The clusters considered will be those listed in the file specified by the -cn option. The nodes considered will be taken from the resize script for those clusters.

validate -e <md5 sum check host file> --> This will run the /etc/hosts md5 sum check on the hosts listed in the given file. The md5 sum check host file should have one host name per line.

You can also combine all of the options, for example:
validate -n y -p y -g y -e y -cn <cluster name file>
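
As an end-to-end illustration, below is a minimal sketch of the input files these options expect; the cluster and host names are hypothetical:
```
# One cluster name per line, passed to -cn
cat > cluster_names.txt <<'EOF'
compute-1-hpc-default
compute-2-hpc-default
EOF

# One host name per line, passed directly to -p, -g, or -e
cat > hosts.txt <<'EOF'
compute-1-hpc-default-node-1
compute-1-hpc-default-node-2
EOF

# Run every check against the clusters listed in cluster_names.txt
validate -n y -p y -g y -e y -cn cluster_names.txt

# Or run only the PCIe bandwidth check against an explicit host list
validate -p hosts.txt
```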


8 changes: 6 additions & 2 deletions autoscaling/crontab/autoscale_slurm.sh
@@ -169,6 +169,11 @@ def getClusterName(node):
    for output in stdout.split('\n')[:-1]:
        if "Switches=" in output:
            clusterName=output.split()[0].split('SwitchName=')[1]
            break
        elif "SwitchName=inactive-" in output:
            continue
        else:
            clusterName=output.split()[0].split('SwitchName=')[1]
elif len(stdout.split('\n')) == 2:
    clusterName=stdout.split('\n')[0].split()[0].split('SwitchName=')[1]
if clusterName.startswith("inactive-"):
@@ -352,7 +357,7 @@ try:
cluster_name=cluster[0]
print ("Deleting cluster "+cluster_name)
subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name])
time.sleep(1)
time.sleep(5)

for cluster_name in nodes_to_destroy.keys():
print ("Resizing cluster "+cluster_name)
@@ -374,7 +379,6 @@ try:
subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes)
if len(unreachable_nodes) > 0:
subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes)

time.sleep(1)

for index,cluster in enumerate(cluster_to_build):
12 changes: 9 additions & 3 deletions autoscaling/tf_init/bastion_update.tf
@@ -22,10 +22,14 @@ resource "local_file" "inventory" {
bastion_ip = var.bastion_ip,
backup_name = var.backup_name,
backup_ip = var.backup_ip,
login_name = var.login_name,
login_ip = var.login_ip,
compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[])
public_subnet = var.public_subnet,
private_subnet = var.private_subnet,
nfs = local.cluster_instances_names[0],
rdma_network = cidrhost(var.rdma_subnet, 0),
rdma_netmask = cidrnetmask(var.rdma_subnet),
nfs = var.use_scratch_nfs ? local.cluster_instances_names[0] : "",
scratch_nfs = var.use_scratch_nfs,
cluster_nfs = var.use_cluster_nfs,
home_nfs = var.home_nfs,
@@ -53,7 +57,7 @@ resource "local_file" "inventory" {
cluster_mount_ip = local.mount_ip,
cluster_name = local.cluster_name,
shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape,
instance_pool_ocpus=var.instance_pool_ocpus,
instance_pool_ocpus=local.instance_pool_ocpus,
queue=var.queue,
instance_type=var.instance_type,
autoscaling_monitoring = var.autoscaling_monitoring,
@@ -63,7 +67,9 @@
privilege_group_name = var.privilege_group_name,
latency_check = var.latency_check
bastion_username = var.bastion_username,
compute_username = var.compute_username
compute_username = var.compute_username,
pam = var.pam,
sacct_limits = var.sacct_limits
})
filename = "${local.bastion_path}/inventory"
}
10 changes: 7 additions & 3 deletions autoscaling/tf_init/inventory.tpl
@@ -2,6 +2,8 @@
${bastion_name} ansible_host=${bastion_ip} ansible_user=${bastion_username} role=bastion
[slurm_backup]
%{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${bastion_username} role=bastion%{ endif }
[login]
%{ if login_name != "" }${login_name} ansible_host=${login_ip} ansible_user=${compute_username} role=login%{ endif }
[compute_to_add]
[compute_configured]
%{ for host, ip in compute ~}
@@ -12,15 +12,15 @@ ${host} ansible_host=${ip} ansible_user=${compute_username} role=compute
compute_to_add
compute_configured
[nfs]
${nfs}
%{ if nfs != "" }${nfs} ansible_user=${compute_username} role=nfs%{ endif }
[all:children]
bastion
compute
[all:vars]
ansible_connection=ssh
ansible_user=${compute_username}
rdma_network=192.168.128.0
rdma_netmask=255.255.240.0
rdma_network=${rdma_network}
rdma_netmask=${rdma_netmask}
public_subnet=${public_subnet}
private_subnet=${private_subnet}
nvme_path=/mnt/localdisk/
@@ -62,3 +64,5 @@ privilege_group_name=${privilege_group_name}
latency_check=${latency_check}
compute_username=${compute_username}
bastion_username=${bastion_username}
pam = ${pam}
sacct_limits=${sacct_limits}
5 changes: 4 additions & 1 deletion autoscaling/tf_init/locals.tf
@@ -3,6 +3,9 @@ locals {
cluster_instances_ids = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.id : data.oci_core_instance.instance_pool_instances.*.id
cluster_instances_names = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.display_name : data.oci_core_instance.instance_pool_instances.*.display_name
image_ocid = var.unsupported ? var.image_ocid : var.image

shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape
instance_pool_ocpus = local.shape == "VM.DenseIO.E4.Flex" ? var.instance_pool_ocpus_denseIO_flex : var.instance_pool_ocpus
// ips of the instances
cluster_instances_ips = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip

@@ -20,7 +23,7 @@ locals {
// image = (var.cluster_network && var.use_marketplace_image == true) || (var.cluster_network == false && var.use_marketplace_image == false) ? var.image : data.oci_core_images.linux.images.0.id

// is_bastion_flex_shape = length(regexall(".*VM.*.*Flex$", var.bastion_shape)) > 0 ? [var.bastion_ocpus]:[]
is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [var.instance_pool_ocpus]:[]
is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [local.instance_pool_ocpus]:[]

// bastion_mount_ip = var.bastion_block ? element(concat(oci_core_volume_attachment.bastion_volume_attachment.*.ipv4, [""]), 0) : "none"

2 changes: 1 addition & 1 deletion autoscaling/tf_init/versions.tf
@@ -3,7 +3,7 @@ terraform {
required_providers {
oci = {
source = "oracle/oci"
version = "4.99.0"
version = "4.112.0"
}
}
}
38 changes: 27 additions & 11 deletions bastion.tf
@@ -74,6 +74,7 @@ resource "null_resource" "bastion" {

provisioner "remote-exec" {
inline = [
"#!/bin/bash",
"sudo mkdir -p /opt/oci-hpc",
"sudo chown ${var.bastion_username}:${var.bastion_username} /opt/oci-hpc/",
"mkdir -p /opt/oci-hpc/bin",
@@ -176,6 +177,7 @@ resource "null_resource" "bastion" {

provisioner "remote-exec" {
inline = [
"#!/bin/bash",
"chmod 600 /home/${var.bastion_username}/.ssh/cluster.key",
"cp /home/${var.bastion_username}/.ssh/cluster.key /home/${var.bastion_username}/.ssh/id_rsa",
"chmod a+x /opt/oci-hpc/bin/*.sh",
@@ -201,12 +203,14 @@ resource "null_resource" "cluster" {
bastion_ip = oci_core_instance.bastion.private_ip,
backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "",
backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "",
login_name = var.login_node ? oci_core_instance.login[0].display_name : "",
login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "",
compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[])
public_subnet = data.oci_core_subnet.public_subnet.cidr_block,
private_subnet = data.oci_core_subnet.private_subnet.cidr_block,
rdma_network = cidrhost(var.rdma_subnet, 0),
rdma_netmask = cidrnetmask(var.rdma_subnet),
nfs = var.node_count > 0 ? local.cluster_instances_names[0] : "",
nfs = var.node_count > 0 && var.use_scratch_nfs ? local.cluster_instances_names[0] : "",
home_nfs = var.home_nfs,
create_fss = var.create_fss,
home_fss = var.home_fss,
@@ -232,8 +236,8 @@
cluster_mount_ip = local.mount_ip,
autoscaling = var.autoscaling,
cluster_name = local.cluster_name,
shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape,
instance_pool_ocpus = var.instance_pool_ocpus,
shape = local.shape,
instance_pool_ocpus = local.instance_pool_ocpus,
queue=var.queue,
monitoring = var.monitoring,
hyperthreading = var.hyperthreading,
@@ -248,7 +252,14 @@
pyxis = var.pyxis,
privilege_sudo = var.privilege_sudo,
privilege_group_name = var.privilege_group_name,
latency_check = var.latency_check
latency_check = var.latency_check,
pam = var.pam,
sacct_limits = var.sacct_limits,
inst_prin = var.inst_prin,
region = var.region,
tenancy_ocid = var.tenancy_ocid,
api_fingerprint = var.api_fingerprint,
api_user_ocid = var.api_user_ocid
})

destination = "/opt/oci-hpc/playbooks/inventory"
@@ -303,7 +314,7 @@ resource "null_resource" "cluster" {
private_subnet = data.oci_core_subnet.private_subnet.cidr_block,
private_subnet_id = local.subnet_id,
targetCompartment = var.targetCompartment,
instance_pool_ocpus = var.instance_pool_ocpus,
instance_pool_ocpus = local.instance_pool_ocpus,
instance_pool_memory = var.instance_pool_memory,
instance_pool_custom_memory = var.instance_pool_custom_memory,
queue=var.queue,
@@ -325,14 +336,18 @@
bastion_ip = oci_core_instance.bastion.private_ip,
backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "",
backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "",
login_name = var.login_node ? oci_core_instance.login[0].display_name : "",
login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "",
compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[])
public_subnet = data.oci_core_subnet.public_subnet.cidr_block,
public_subnet_id = local.bastion_subnet_id,
private_subnet = data.oci_core_subnet.private_subnet.cidr_block,
private_subnet_id = local.subnet_id,
rdma_subnet = var.rdma_subnet,
nfs = var.node_count > 0 ? local.cluster_instances_names[0] : "",
scratch_nfs = var.use_scratch_nfs && var.node_count > 0,
scratch_nfs_path = var.scratch_nfs_path,
use_scratch_nfs = var.use_scratch_nfs,
slurm = var.slurm,
rack_aware = var.rack_aware,
slurm_nfs_path = var.add_nfs ? var.nfs_source_path : var.cluster_nfs_path
@@ -376,7 +391,9 @@ resource "null_resource" "cluster" {
private_deployment = var.private_deployment,
use_multiple_ads = var.use_multiple_ads,
bastion_username = var.bastion_username,
compute_username = var.compute_username
compute_username = var.compute_username,
pam = var.pam,
sacct_limits = var.sacct_limits
})

destination = "/opt/oci-hpc/conf/variables.tf"
@@ -409,7 +426,7 @@ provisioner "file" {
}
provisioner "file" {
content = base64decode(var.api_user_key)
destination = "/opt/oci-hpc/autoscaling/credentials/key.initial"
destination = "/opt/oci-hpc/autoscaling/credentials/key.pem"
connection {
host = local.host
type = "ssh"
Expand All @@ -420,13 +437,12 @@ provisioner "file" {

provisioner "remote-exec" {
inline = [
"#!/bin/bash",
"chmod 755 /opt/oci-hpc/autoscaling/crontab/*.sh",
"chmod 755 /opt/oci-hpc/autoscaling/credentials/key.sh",
"/opt/oci-hpc/autoscaling/credentials/key.sh /opt/oci-hpc/autoscaling/credentials/key.initial /opt/oci-hpc/autoscaling/credentials/key.pem > /opt/oci-hpc/autoscaling/credentials/key.log",
"chmod 600 /opt/oci-hpc/autoscaling/credentials/key.pem",
"echo ${var.configure} > /tmp/configure.conf",
"timeout 2h /opt/oci-hpc/bin/configure.sh",
"exit_code=$?",
"timeout 2h /opt/oci-hpc/bin/configure.sh | tee /opt/oci-hpc/logs/initial_configure.log",
"exit_code=$${PIPESTATUS[0]}",
"/opt/oci-hpc/bin/initial_monitoring.sh",
"exit $exit_code" ]
connection {