From 4177158c87bc52fca47ffc3ff7b5b185ffef1742 Mon Sep 17 00:00:00 2001
From: Dhvani Sheth
Date: Wed, 4 Jan 2023 19:23:33 -0800
Subject: [PATCH 001/133] added repo.mysql.com as yum repo and then installed
 mysql80-community-release-el7-7

---
 playbooks/roles/autoscaling_mon/tasks/el.yml | 29 ++++++++++++++++----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/playbooks/roles/autoscaling_mon/tasks/el.yml b/playbooks/roles/autoscaling_mon/tasks/el.yml
index 8ecfeba6..c66d6031 100755
--- a/playbooks/roles/autoscaling_mon/tasks/el.yml
+++ b/playbooks/roles/autoscaling_mon/tasks/el.yml
@@ -81,16 +81,35 @@
     no_log: false
   register: existing_api_keys

-- name: Import mysql-2022 key
+# - name: Import mysql-2022 key
+#   become: true
+#   rpm_key:
+#     state: present
+#     key: https://repo.mysql.com/RPM-GPG-KEY-mysql-2022
+
+# - name: install mysql
+#   vars:
+#     package_name:
+#       - https://dev.mysql.com/get/mysql80-community-release-el7-5.noarch.rpm
+#   include_role:
+#     name: safe_yum
+
+- name: Add repo.mysql.com repo
   become: true
-  rpm_key:
-    state: present
-    key: https://repo.mysql.com/RPM-GPG-KEY-mysql-2022
+  yum_repository:
+    name: repo_mysql
+    description: RPM MySQL YUM repo
+    file: external_repos
+    baseurl: https://repo.mysql.com/yum/mysql-8.0-community/el/7/$basearch/
+    gpgkey: https://repo.mysql.com/RPM-GPG-KEY-mysql-2022
+    gpgcheck: 1
+    enabled: 1

 - name: install mysql
   vars:
     package_name:
-      - https://dev.mysql.com/get/mysql80-community-release-el7-5.noarch.rpm
+      - mysql80-community-release-el7-7
+    state: present
   include_role:
     name: safe_yum

From 414676af4054c50e2703940cc1375672e39ca51c Mon Sep 17 00:00:00 2001
From: Dhvani Sheth
Date: Thu, 5 Jan 2023 10:33:12 -0800
Subject: [PATCH 002/133] removing the installation of mysql here as it is
 already being installed using the mysql role

---
 playbooks/roles/autoscaling_mon/tasks/el.yml | 32 --------------------
 1 file changed, 32 deletions(-)

diff --git a/playbooks/roles/autoscaling_mon/tasks/el.yml b/playbooks/roles/autoscaling_mon/tasks/el.yml
index c66d6031..9d9946d4 100755
--- a/playbooks/roles/autoscaling_mon/tasks/el.yml
+++ b/playbooks/roles/autoscaling_mon/tasks/el.yml
@@ -81,38 +81,6 @@
     no_log: false
   register: existing_api_keys

-# - name: Import mysql-2022 key
-#   become: true
-#   rpm_key:
-#     state: present
-#     key: https://repo.mysql.com/RPM-GPG-KEY-mysql-2022
-
-# - name: install mysql
-#   vars:
-#     package_name:
-#       - https://dev.mysql.com/get/mysql80-community-release-el7-5.noarch.rpm
-#   include_role:
-#     name: safe_yum
-
-- name: Add repo.mysql.com repo
-  become: true
-  yum_repository:
-    name: repo_mysql
-    description: RPM MySQL YUM repo
-    file: external_repos
-    baseurl: https://repo.mysql.com/yum/mysql-8.0-community/el/7/$basearch/
-    gpgkey: https://repo.mysql.com/RPM-GPG-KEY-mysql-2022
-    gpgcheck: 1
-    enabled: 1
-
-- name: install mysql
-  vars:
-    package_name:
-      - mysql80-community-release-el7-7
-    state: present
-  include_role:
-    name: safe_yum
-
 - name: install mysql-shell and connector
   vars:
     package_name:

From 92e5004eaa9ea64376f6fa236c5254f2296ae2e2 Mon Sep 17 00:00:00 2001
From: bruno-garbaccio
Date: Fri, 6 Jan 2023 11:48:50 +0100
Subject: [PATCH 003/133] fix no instance principal auth

In case of no instance principal, allow autoscaling with api key and create config file for oci-cli.
Get rid of --auth instance_principal options in create_cluster.sh
---
 bastion.tf                                      | 11 +++++---
 inventory.tpl                                   |  8 +++++-
 .../no_instance_principal/defaults/main.yml     |  0
 .../roles/no_instance_principal/meta/main.yml   |  0
 .../roles/no_instance_principal/tasks/el.yml    | 25 +++++++++++++++++++
 .../no_instance_principal/tasks/main.yml        |  3 +++
 .../no_instance_principal/templates/config.j2   |  6 +++++
 .../roles/no_instance_principal/vars/main.yml   |  0
 playbooks/site.yml                              | 10 ++++++++
 slurm_ha.tf                                     |  9 +++++--
 10 files changed, 65 insertions(+), 7 deletions(-)
 create mode 100755 playbooks/roles/no_instance_principal/defaults/main.yml
 create mode 100755 playbooks/roles/no_instance_principal/meta/main.yml
 create mode 100755 playbooks/roles/no_instance_principal/tasks/el.yml
 create mode 100755 playbooks/roles/no_instance_principal/tasks/main.yml
 create mode 100644 playbooks/roles/no_instance_principal/templates/config.j2
 create mode 100755 playbooks/roles/no_instance_principal/vars/main.yml

diff --git a/bastion.tf b/bastion.tf
index 7ee4ede6..5c31e0a9 100644
--- a/bastion.tf
+++ b/bastion.tf
@@ -248,7 +248,12 @@ resource "null_resource" "cluster" {
       pyxis = var.pyxis,
       privilege_sudo = var.privilege_sudo,
       privilege_group_name = var.privilege_group_name,
-      latency_check = var.latency_check
+      latency_check = var.latency_check,
+      inst_prin = var.inst_prin,
+      region = var.region,
+      tenancy_ocid = var.tenancy_ocid,
+      api_fingerprint = var.api_fingerprint,
+      api_user_ocid = var.api_user_ocid
       })

     destination = "/opt/oci-hpc/playbooks/inventory"
@@ -409,7 +414,7 @@ provisioner "file" {
 }
 provisioner "file" {
     content = base64decode(var.api_user_key)
-    destination = "/opt/oci-hpc/autoscaling/credentials/key.initial"
+    destination = "/opt/oci-hpc/autoscaling/credentials/key.pem"
     connection {
       host = local.host
       type = "ssh"
@@ -421,8 +426,6 @@
 provisioner "remote-exec" {
     inline = [
       "chmod 755 /opt/oci-hpc/autoscaling/crontab/*.sh",
-      "chmod 755 /opt/oci-hpc/autoscaling/credentials/key.sh",
-      "/opt/oci-hpc/autoscaling/credentials/key.sh /opt/oci-hpc/autoscaling/credentials/key.initial /opt/oci-hpc/autoscaling/credentials/key.pem > /opt/oci-hpc/autoscaling/credentials/key.log",
       "chmod 600 /opt/oci-hpc/autoscaling/credentials/key.pem",
       "echo ${var.configure} > /tmp/configure.conf",
       "timeout 2h /opt/oci-hpc/bin/configure.sh",

diff --git a/inventory.tpl b/inventory.tpl
index 8b941154..5ddb3c32 100755
--- a/inventory.tpl
+++ b/inventory.tpl
@@ -62,4 +62,10 @@ privilege_sudo=${privilege_sudo}
 privilege_group_name=${privilege_group_name}
 latency_check=${latency_check}
 compute_username=${compute_username}
-bastion_username=${bastion_username}
\ No newline at end of file
+bastion_username=${bastion_username}
+region= ${region}
+tenancy_ocid = ${tenancy_ocid}
+inst_prin = ${inst_prin}
+api_fingerprint = ${api_fingerprint}
+api_user_ocid = ${api_user_ocid}
+

diff --git a/playbooks/roles/no_instance_principal/defaults/main.yml b/playbooks/roles/no_instance_principal/defaults/main.yml
new file mode 100755
index 00000000..e69de29b

diff --git a/playbooks/roles/no_instance_principal/meta/main.yml b/playbooks/roles/no_instance_principal/meta/main.yml
new file mode 100755
index 00000000..e69de29b

diff --git a/playbooks/roles/no_instance_principal/tasks/el.yml b/playbooks/roles/no_instance_principal/tasks/el.yml
new file mode 100755
index 00000000..33ede7b3
--- /dev/null
+++ b/playbooks/roles/no_instance_principal/tasks/el.yml
@@ -0,0 +1,25 @@
+---
+- name: create .oci directory
+  become: true
+  file:
+    path: /home/opc/.oci
+    state: directory
+    owner: opc
+    group: "{{privilege_group_name}}"
+    mode: 0775
+
+- name: Generate config file
+  become: true
+  template:
+    src: 'config.j2'
+    dest: '/home/opc/.oci/config'
+    mode: 0600
+    owner: opc
+    group: "{{privilege_group_name}}"
+
+- name: delete --auth in create_cluster.sh
+  become: true
+  replace:
+    path: /opt/oci-hpc/bin/create_cluster.sh
+    regexp: '--auth instance_principal'
+    replace: ''
\ No newline at end of file

diff --git a/playbooks/roles/no_instance_principal/tasks/main.yml b/playbooks/roles/no_instance_principal/tasks/main.yml
new file mode 100755
index 00000000..01de0d59
--- /dev/null
+++ b/playbooks/roles/no_instance_principal/tasks/main.yml
@@ -0,0 +1,3 @@
+- include: el.yml
+  when: ansible_os_family == 'RedHat'
+

diff --git a/playbooks/roles/no_instance_principal/templates/config.j2 b/playbooks/roles/no_instance_principal/templates/config.j2
new file mode 100644
index 00000000..31a1d924
--- /dev/null
+++ b/playbooks/roles/no_instance_principal/templates/config.j2
@@ -0,0 +1,6 @@
+[DEFAULT]
+user={{ api_user_ocid }}
+fingerprint={{ api_fingerprint }}
+tenancy={{ tenancy_ocid}}
+region={{ region }}
+key_file=/opt/oci-hpc/autoscaling/credentials/key.pem
\ No newline at end of file

diff --git a/playbooks/roles/no_instance_principal/vars/main.yml b/playbooks/roles/no_instance_principal/vars/main.yml
new file mode 100755
index 00000000..e69de29b

diff --git a/playbooks/site.yml b/playbooks/site.yml
index 746adf8f..0129b1f9 100755
--- a/playbooks/site.yml
+++ b/playbooks/site.yml
@@ -110,6 +110,16 @@
         name: cluster-cli
       when: ldap|default(true)|bool

+# configure if instance_principal is False
+- hosts: bastion
+  become: true
+  tasks:
+    - include_role:
+        name: no_instance_principal
+      when: not inst_prin|bool
+
+
+
 - hosts: compute
   become: true
   tasks:

diff --git a/slurm_ha.tf b/slurm_ha.tf
index 2a63c274..5d74d7ac 100644
--- a/slurm_ha.tf
+++ b/slurm_ha.tf
@@ -242,7 +242,12 @@ resource "null_resource" "cluster_backup" {
       pyxis = var.pyxis,
       privilege_sudo = var.privilege_sudo,
       privilege_group_name = var.privilege_group_name,
-      latency_check = var.latency_check
+      latency_check = var.latency_check,
+      inst_prin = var.inst_prin,
+      region = var.region,
+      tenancy_ocid = var.tenancy_ocid,
+      api_fingerprint = var.api_fingerprint,
+      api_user_ocid = var.api_user_ocid
       })

     destination = "/opt/oci-hpc/playbooks/inventory"
@@ -370,7 +375,7 @@
       private_deployment = var.private_deployment,
       bastion_username = var.bastion_username,
       compute_username = var.compute_username,
-      use_multiple_ads = var.use_multiple_ads
+      use_multiple_ads = var.use_multiple_ads
       })

     destination = "/opt/oci-hpc/conf/variables.tf"

From e9a43ca6134d552b0dedf5a4a3a2e6d6174d754a Mon Sep 17 00:00:00 2001
From: Dhvani Sheth
Date: Mon, 9 Jan 2023 12:08:09 -0800
Subject: [PATCH 004/133] enroot squash and rootfs options added to enroot.conf

---
 .../roles/nvidia-enroot/tasks/oraclelinux-7.yml | 16 ++++++++++++++++
 playbooks/roles/nvidia-enroot/tasks/ubuntu.yml  | 16 ++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml b/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml
index 7b531689..bb584fdd 100644
--- a/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml
+++ b/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml
@@ -69,6 +69,22 @@
       line: 'ENROOT_TEMP_PATH {{enroot_top_path}}/enroot_tmp'
       backup: yes

+  - name: update ENROOT_SQUASH_OPTIONS
+    lineinfile:
+      dest: /etc/enroot/enroot.conf
+      state:
present + regexp: '^#ENROOT_SQUASH_OPTIONS.*' + line: 'ENROOT_SQUASH_OPTIONS -b 262144' + backup: yes + + - name: update ENROOT_ROOTFS_WRITABLE + lineinfile: + dest: /etc/enroot/enroot.conf + state: present + regexp: '^#ENROOT_ROOTFS_WRITABLE.*' + line: 'ENROOT_ROOTFS_WRITABLE yes' + backup: yes + - name: set permissions on {{enroot_top_path}} become: true diff --git a/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml b/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml index d8c9619a..00828be1 100644 --- a/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml +++ b/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml @@ -68,6 +68,22 @@ line: 'ENROOT_TEMP_PATH {{enroot_top_path}}/enroot_tmp' backup: yes + - name: update ENROOT_SQUASH_OPTIONS + lineinfile: + dest: /etc/enroot/enroot.conf + state: present + regexp: '^#ENROOT_SQUASH_OPTIONS.*' + line: 'ENROOT_SQUASH_OPTIONS -b 262144' + backup: yes + + - name: update ENROOT_ROOTFS_WRITABLE + lineinfile: + dest: /etc/enroot/enroot.conf + state: present + regexp: '^#ENROOT_ROOTFS_WRITABLE.*' + line: 'ENROOT_ROOTFS_WRITABLE yes' + backup: yes + - name: set permissions on {{enroot_top_path}} become: true From 49f88426dc9dc976c89f06655ffc0433c11c5d63 Mon Sep 17 00:00:00 2001 From: Oguz Pastirmaci Date: Mon, 9 Jan 2023 14:45:34 -0800 Subject: [PATCH 005/133] Create tuned.conf --- files/tuned.conf | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 files/tuned.conf diff --git a/files/tuned.conf b/files/tuned.conf new file mode 100644 index 00000000..f4e0db7a --- /dev/null +++ b/files/tuned.conf @@ -0,0 +1,39 @@ +[main] +summary=gpfs perf tuning for common GPU workloads + +[cpu] +force_latency=1 +governor=performance +energy_perf_bias=performance +min_perf_pct=100 + +[vm] +transparent_huge_pages=never + +[sysctl] +net.ipv4.tcp_timestamps=1 +net.ipv4.tcp_sack=1 +net.ipv4.tcp_dsack=1 +net.ipv4.tcp_low_latency=1 +net.ipv4.tcp_adv_win_scale=2 +net.ipv4.tcp_window_scaling=1 +net.ipv4.tcp_slow_start_after_idle=0 +net.ipv4.tcp_syn_retries=8 +net.ipv4.tcp_rmem=4096 87380 16777216 +net.ipv4.tcp_wmem=4096 65536 16777216 +net.core.rmem_max=16777216 +net.core.wmem_max=16777216 +net.core.rmem_default=16777216 +net.core.wmem_default=16777216 +net.core.optmem_max=16777216 +net.core.somaxconn = 8192 +net.core.netdev_max_backlog=250000 +sunrpc.udp_slot_table_entries=128 +sunrpc.tcp_slot_table_entries=128 +kernel.sysrq = 1 +kernel.sched_min_granularity_ns = 10000000 +kernel.sched_wakeup_granularity_ns = 15000000 +vm.min_free_kbytes = 16777216 +vm.dirty_ratio = 30 +vm.dirty_background_ratio = 10 +vm.swappiness=30 From a3889484dcdec36ec95c08e0a3afa3526934fb43 Mon Sep 17 00:00:00 2001 From: Oguz Pastirmaci Date: Mon, 9 Jan 2023 14:46:23 -0800 Subject: [PATCH 006/133] Delete tuned.conf --- files/tuned.conf | 39 --------------------------------------- 1 file changed, 39 deletions(-) delete mode 100644 files/tuned.conf diff --git a/files/tuned.conf b/files/tuned.conf deleted file mode 100644 index f4e0db7a..00000000 --- a/files/tuned.conf +++ /dev/null @@ -1,39 +0,0 @@ -[main] -summary=gpfs perf tuning for common GPU workloads - -[cpu] -force_latency=1 -governor=performance -energy_perf_bias=performance -min_perf_pct=100 - -[vm] -transparent_huge_pages=never - -[sysctl] -net.ipv4.tcp_timestamps=1 -net.ipv4.tcp_sack=1 -net.ipv4.tcp_dsack=1 -net.ipv4.tcp_low_latency=1 -net.ipv4.tcp_adv_win_scale=2 -net.ipv4.tcp_window_scaling=1 -net.ipv4.tcp_slow_start_after_idle=0 -net.ipv4.tcp_syn_retries=8 -net.ipv4.tcp_rmem=4096 87380 16777216 
-net.ipv4.tcp_wmem=4096 65536 16777216 -net.core.rmem_max=16777216 -net.core.wmem_max=16777216 -net.core.rmem_default=16777216 -net.core.wmem_default=16777216 -net.core.optmem_max=16777216 -net.core.somaxconn = 8192 -net.core.netdev_max_backlog=250000 -sunrpc.udp_slot_table_entries=128 -sunrpc.tcp_slot_table_entries=128 -kernel.sysrq = 1 -kernel.sched_min_granularity_ns = 10000000 -kernel.sched_wakeup_granularity_ns = 15000000 -vm.min_free_kbytes = 16777216 -vm.dirty_ratio = 30 -vm.dirty_background_ratio = 10 -vm.swappiness=30 From 4ae409192cc701bfc6a1d3aca9577f4d909b4748 Mon Sep 17 00:00:00 2001 From: Oguz Pastirmaci Date: Mon, 9 Jan 2023 14:46:45 -0800 Subject: [PATCH 007/133] Create tuned.conf --- playbooks/roles/tuned/files/tuned.conf | 39 ++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 playbooks/roles/tuned/files/tuned.conf diff --git a/playbooks/roles/tuned/files/tuned.conf b/playbooks/roles/tuned/files/tuned.conf new file mode 100644 index 00000000..f4e0db7a --- /dev/null +++ b/playbooks/roles/tuned/files/tuned.conf @@ -0,0 +1,39 @@ +[main] +summary=gpfs perf tuning for common GPU workloads + +[cpu] +force_latency=1 +governor=performance +energy_perf_bias=performance +min_perf_pct=100 + +[vm] +transparent_huge_pages=never + +[sysctl] +net.ipv4.tcp_timestamps=1 +net.ipv4.tcp_sack=1 +net.ipv4.tcp_dsack=1 +net.ipv4.tcp_low_latency=1 +net.ipv4.tcp_adv_win_scale=2 +net.ipv4.tcp_window_scaling=1 +net.ipv4.tcp_slow_start_after_idle=0 +net.ipv4.tcp_syn_retries=8 +net.ipv4.tcp_rmem=4096 87380 16777216 +net.ipv4.tcp_wmem=4096 65536 16777216 +net.core.rmem_max=16777216 +net.core.wmem_max=16777216 +net.core.rmem_default=16777216 +net.core.wmem_default=16777216 +net.core.optmem_max=16777216 +net.core.somaxconn = 8192 +net.core.netdev_max_backlog=250000 +sunrpc.udp_slot_table_entries=128 +sunrpc.tcp_slot_table_entries=128 +kernel.sysrq = 1 +kernel.sched_min_granularity_ns = 10000000 +kernel.sched_wakeup_granularity_ns = 15000000 +vm.min_free_kbytes = 16777216 +vm.dirty_ratio = 30 +vm.dirty_background_ratio = 10 +vm.swappiness=30 From 03980a4f720194e3669879b75c0c999e425ffbf1 Mon Sep 17 00:00:00 2001 From: Oguz Pastirmaci Date: Mon, 9 Jan 2023 14:47:34 -0800 Subject: [PATCH 008/133] Create main.yml --- playbooks/roles/tuned/tasks/main.yml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 playbooks/roles/tuned/tasks/main.yml diff --git a/playbooks/roles/tuned/tasks/main.yml b/playbooks/roles/tuned/tasks/main.yml new file mode 100644 index 00000000..b0f30aa2 --- /dev/null +++ b/playbooks/roles/tuned/tasks/main.yml @@ -0,0 +1,2 @@ + - include: el-7.yml + when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '7' From 25e22405cba879133c5a13370bd49649d967cf8f Mon Sep 17 00:00:00 2001 From: Oguz Pastirmaci Date: Mon, 9 Jan 2023 14:48:02 -0800 Subject: [PATCH 009/133] Create el-7.yml --- playbooks/roles/tuned/tasks/el-7.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 playbooks/roles/tuned/tasks/el-7.yml diff --git a/playbooks/roles/tuned/tasks/el-7.yml b/playbooks/roles/tuned/tasks/el-7.yml new file mode 100644 index 00000000..2dc0f0a6 --- /dev/null +++ b/playbooks/roles/tuned/tasks/el-7.yml @@ -0,0 +1,17 @@ +--- + +- name: Ensure tuned profile directory exists + become: true + file: + path='/usr/lib/tuned/oci-network-performance' + state=directory + +- name: Copy profile file + become: true + copy: + src: tuned.conf + dest: "/usr/lib/tuned/oci-network-performance/tuned.conf" + +- name: Start profile + become: 
true + shell: tuned-adm profile oci-network-performance From 789d3802efbd8d1d04288f2dffc6912f0023d2e7 Mon Sep 17 00:00:00 2001 From: Oguz Pastirmaci Date: Mon, 9 Jan 2023 16:39:14 -0800 Subject: [PATCH 010/133] Update tuned.conf --- playbooks/roles/tuned/files/tuned.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/tuned/files/tuned.conf b/playbooks/roles/tuned/files/tuned.conf index f4e0db7a..c1bff86a 100644 --- a/playbooks/roles/tuned/files/tuned.conf +++ b/playbooks/roles/tuned/files/tuned.conf @@ -1,5 +1,5 @@ [main] -summary=gpfs perf tuning for common GPU workloads +summary=Perf tuning for common GPU workloads [cpu] force_latency=1 From e9630dcb5548dbaf06cabb768207d9fc530e7c10 Mon Sep 17 00:00:00 2001 From: Oguz Pastirmaci Date: Mon, 9 Jan 2023 16:39:53 -0800 Subject: [PATCH 011/133] Update site.yml --- playbooks/site.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/playbooks/site.yml b/playbooks/site.yml index 746adf8f..8c6c29a8 100755 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -292,4 +292,10 @@ name: slurm when: slurm|default(false)|bool +- hosts: compute + become: true + gather_facts: true + tasks: + - include_role: + name: tuned From 95de51ee53ceeb5a9542fed838f7700b16725b44 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 10 Jan 2023 10:46:06 -0700 Subject: [PATCH 012/133] Make sure no NFS is created if it is not needed. --- autoscaling/tf_init/bastion_update.tf | 2 +- autoscaling/tf_init/inventory.tpl | 2 +- bastion.tf | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/autoscaling/tf_init/bastion_update.tf b/autoscaling/tf_init/bastion_update.tf index 18fc5889..e03a1cc7 100755 --- a/autoscaling/tf_init/bastion_update.tf +++ b/autoscaling/tf_init/bastion_update.tf @@ -25,7 +25,7 @@ resource "local_file" "inventory" { compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = var.public_subnet, private_subnet = var.private_subnet, - nfs = local.cluster_instances_names[0], + nfs = scratch_nfs ? local.cluster_instances_names[0] : "", scratch_nfs = var.use_scratch_nfs, cluster_nfs = var.use_cluster_nfs, home_nfs = var.home_nfs, diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index 11d848f0..fcc97be2 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -12,7 +12,7 @@ ${host} ansible_host=${ip} ansible_user=${compute_username} role=compute compute_to_add compute_configured [nfs] -${nfs} +%{ if nfs != "" }${nfs} ansible_user=${compute_username} role=nfs%{ endif } [all:children] bastion compute diff --git a/bastion.tf b/bastion.tf index 7ee4ede6..2560c2ef 100644 --- a/bastion.tf +++ b/bastion.tf @@ -206,7 +206,7 @@ resource "null_resource" "cluster" { private_subnet = data.oci_core_subnet.private_subnet.cidr_block, rdma_network = cidrhost(var.rdma_subnet, 0), rdma_netmask = cidrnetmask(var.rdma_subnet), - nfs = var.node_count > 0 ? local.cluster_instances_names[0] : "", + nfs = var.node_count > 0 && scratch_nfs ? 
local.cluster_instances_names[0] : "", home_nfs = var.home_nfs, create_fss = var.create_fss, home_fss = var.home_fss, From 5748849b5c5b631f2ab714378947ebaa203c539e Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 10 Jan 2023 10:47:29 -0700 Subject: [PATCH 013/133] Add E5-hpc to list of supported shapes --- playbooks/roles/slurm/templates/slurm.conf.j2 | 2 ++ schema.yaml | 1 + 2 files changed, 3 insertions(+) diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index 23999c59..9b32b137 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -91,6 +91,8 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boar NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.HPC2.36" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=18 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} +{% elif instance.shape == "BM.HPC.E5.128" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.Optimized3.36" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=18 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif "VM.Standard2." 
in instance.shape %} diff --git a/schema.yaml b/schema.yaml index 7ad5400b..3c207916 100755 --- a/schema.yaml +++ b/schema.yaml @@ -462,6 +462,7 @@ variables: - "BM.GPU.B4.8" - "BM.GPU.A100-v2.8" - "BM.Optimized3.36" + - "BM.HPC.E5.128" default: "BM.HPC2.36" title: "Shape of the Compute Nodes" description: "Shape of compute nodes used in permanent/initial cluster" From 0cad514c6e29d6acc4a8f7395cc63c5e1e569730 Mon Sep 17 00:00:00 2001 From: Oguz Pastirmaci Date: Tue, 10 Jan 2023 12:26:54 -0800 Subject: [PATCH 014/133] Update main.yml --- playbooks/roles/tuned/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/tuned/tasks/main.yml b/playbooks/roles/tuned/tasks/main.yml index b0f30aa2..637e8bae 100644 --- a/playbooks/roles/tuned/tasks/main.yml +++ b/playbooks/roles/tuned/tasks/main.yml @@ -1,2 +1,2 @@ - include: el-7.yml - when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '7' + when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '7' and (shape == 'BM.GPU.B4.8' or shape == 'BM.GPU4.8' or shape == 'BM.GPU.A100-v2.8') From f24943d2e6dfe69a02e3ce409f472187807b4b4a Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 11 Jan 2023 13:51:18 -0700 Subject: [PATCH 015/133] Fix issue with NFS --- autoscaling/tf_init/bastion_update.tf | 2 +- bastion.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/autoscaling/tf_init/bastion_update.tf b/autoscaling/tf_init/bastion_update.tf index e03a1cc7..4f775e06 100755 --- a/autoscaling/tf_init/bastion_update.tf +++ b/autoscaling/tf_init/bastion_update.tf @@ -25,7 +25,7 @@ resource "local_file" "inventory" { compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = var.public_subnet, private_subnet = var.private_subnet, - nfs = scratch_nfs ? local.cluster_instances_names[0] : "", + nfs = var.use_scratch_nfs ? local.cluster_instances_names[0] : "", scratch_nfs = var.use_scratch_nfs, cluster_nfs = var.use_cluster_nfs, home_nfs = var.home_nfs, diff --git a/bastion.tf b/bastion.tf index 2560c2ef..6cc9379e 100644 --- a/bastion.tf +++ b/bastion.tf @@ -206,7 +206,7 @@ resource "null_resource" "cluster" { private_subnet = data.oci_core_subnet.private_subnet.cidr_block, rdma_network = cidrhost(var.rdma_subnet, 0), rdma_netmask = cidrnetmask(var.rdma_subnet), - nfs = var.node_count > 0 && scratch_nfs ? local.cluster_instances_names[0] : "", + nfs = var.node_count > 0 && var.use_scratch_nfs ? 
local.cluster_instances_names[0] : "", home_nfs = var.home_nfs, create_fss = var.create_fss, home_fss = var.home_fss, From d8da189f93c2918536f0e4c8bbbb5bf3725dc717 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 11 Jan 2023 14:36:43 -0700 Subject: [PATCH 016/133] Add adm-tuned to all playbooks --- playbooks/new_nodes.yml | 2 ++ playbooks/resize_add.yml | 2 ++ playbooks/site.yml | 9 ++------- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index d86ac463..f7f7e62a 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -163,6 +163,8 @@ - include_role: name: nvidia-enroot when: enroot|default(true)|bool + - include_role: + name: tuned - hosts: compute diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 0327973d..4e01b939 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -165,6 +165,8 @@ - include_role: name: nvidia-enroot when: enroot|default(true)|bool + - include_role: + name: tuned - hosts: compute_to_add diff --git a/playbooks/site.yml b/playbooks/site.yml index 8c6c29a8..b2eb5b0e 100755 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -268,6 +268,8 @@ - include_role: name: nvidia-enroot when: enroot|default(true)|bool + - include_role: + name: tuned - hosts: all vars: @@ -292,10 +294,3 @@ name: slurm when: slurm|default(false)|bool -- hosts: compute - become: true - gather_facts: true - tasks: - - include_role: - name: tuned - From c32232fdfb3115dfe92a0be2d1e07ba8a4b898d4 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 13 Jan 2023 13:26:06 -0800 Subject: [PATCH 017/133] add validation script for number of nodes same across resize, slurm, etc-hosts, oci cli, inventory, topology. add pcie speed and gpu throttle check as well --- bin/gpu_throttle.sh | 5 + bin/num_nodes_same.py | 484 ++++++++++++++++++++++++++++++++++++++++++ bin/pcie.sh | 8 + 3 files changed, 497 insertions(+) create mode 100644 bin/gpu_throttle.sh create mode 100644 bin/num_nodes_same.py create mode 100644 bin/pcie.sh diff --git a/bin/gpu_throttle.sh b/bin/gpu_throttle.sh new file mode 100644 index 00000000..72e19071 --- /dev/null +++ b/bin/gpu_throttle.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +/usr/bin/nvidia-smi --query-gpu=timestamp,pci.bus,utilization.gpu,utilization.memory,temperature.gpu,power.draw,clocks.mem,clocks.gr,clocks_throttle_reasons.sw_power_cap,clocks_throttle_reasons.hw_thermal_slowdown,clocks_throttle_reasons.hw_power_brake_slowdown,clocks_throttle_reasons.sw_thermal_slowdown,clocks_throttle_reasons.sync_boost,clocks_throttle_reasons.active --format=csv + diff --git a/bin/num_nodes_same.py b/bin/num_nodes_same.py new file mode 100644 index 00000000..907c9a1a --- /dev/null +++ b/bin/num_nodes_same.py @@ -0,0 +1,484 @@ +import subprocess +import re +import requests +import oci +from datetime import datetime +import argparse +import os +import shlex + + + +# change ownership of all files to opc so that the files can be copied +def changeOwner(path): + cmd = f'sudo chown -R opc:opc {path}' + run_cmd(cmd) + + +def getDateTime(): + # datetime object containing current date and time + now = datetime.now() + dt_string = now.strftime("%m%d%Y%H%M%S") + return dt_string + + +# create directory to hold results +def createDir(): + # Parent Directory path + parent_dir = "/tmp/" + directory = getDateTime() + # Path + path = os.path.join(parent_dir, directory) + try: + os.mkdir(path) + except OSError as error: + print(error) + return path + + +def run_cmd(cmd=None): + """ Run command on 
shell""" + cmd_split = shlex.split(cmd) + try: + results = subprocess.run(cmd_split, shell=False, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, check=True, encoding='utf8') + output = results.stdout.splitlines() + except subprocess.CalledProcessError as e_process_error: + # print(f"!!! Error in running command [ {cmd} ]. Fatal error exiting!!!") + # print(f"Error code: {e_process_error.returncode} Output: {e_process_error.output}") + return (9000, f"Error code: {e_process_error.returncode} Output: {e_process_error.output}") + return output + + +def get_metadata(): + """ Make a request to metadata endpoint """ + headers = { 'Authorization' : 'Bearer Oracle' } + metadata_url = "http://169.254.169.254/opc/" + metadata_ver = "2" + request_url = metadata_url + "v" + metadata_ver + "/instance/" + return requests.get(request_url, headers=headers).json() + + +def get_summary(comp_ocid,cluster_name): + print(cluster_name) + signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner() + computeManagementClient = oci.core.ComputeManagementClient(config={}, signer=signer) + cn_summaries = computeManagementClient.list_cluster_networks(comp_ocid,display_name=cluster_name).data + running_clusters = 0 + scaling_clusters = 0 + cn_summary=None + for cn_summary_tmp in cn_summaries: + if cn_summary_tmp.lifecycle_state == "RUNNING": + cn_summary = cn_summary_tmp + print(cn_summary) + running_clusters = running_clusters + 1 + elif cn_summary_tmp.lifecycle_state == "SCALING": + scaling_clusters = scaling_clusters + 1 + if running_clusters > 1: + print("There were multiple running clusters with this name, we selected the one with OCID:"+cn_summary.id) + if scaling_clusters > 0: + print("The cluster " +cluster_name+ " is scaling. Run this validation after it finishes scaling.") + print(cluster_name) + print(cn_summary) + return cn_summary + + +def get_instances(comp_ocid,cn_ocid): + signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner() + computeManagementClient = oci.core.ComputeManagementClient(config={}, signer=signer) + instance_summaries = oci.pagination.list_call_get_all_results(computeManagementClient.list_cluster_network_instances,comp_ocid,cn_ocid).data + node_list = [] + for instance_summary in instance_summaries: + node_list.append(instance_summary.display_name) + return node_list + + +def parse_inventory(inventory): + try: + inv = open(inventory,"r") + except: + return None + inventory_dict = {} + current_section = None + for line in inv: + if line.strip().startswith("[") and line.strip().endswith("]"): + current_section=line.split('[')[1].split(']')[0] + if not current_section in inventory_dict.keys(): + inventory_dict[current_section]=[] + else: + if not current_section is None: + inventory_dict[current_section].append(line) + inv.close() + return inventory_dict + + +# this is the source of truth for cluster names and total number of nodes +def getResizeClusterNames(filepath): + if filepath is None: + out = subprocess.Popen(["ls /opt/oci-hpc/autoscaling/clusters/"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + cluster_name_set = set() + for i in range(len(x)): + if x[i] == 'README': + continue + else: + cluster_name_set.add(x[i]) + return cluster_name_set + else: + out = subprocess.Popen(["cat "+filepath],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + 
cluster_name_set = set() + for i in range(len(x)): + cluster_name_set.add(x[i]) + return cluster_name_set + + +# this is the source of truth for total number of nodes in a cluster +def getResizeNodes(metadata, cluster_names): + # total_nodes = 0 + resize_cluster_node_dict = {} + str = "ocid1.instance." + for cluster in cluster_names: + out = subprocess.Popen(["/opt/oci-hpc/bin/resize.sh --cluster_name "+cluster],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + cluster_node_set = set() + for i in range(len(x)): + if str in x[i]: + split_str = x[i].split() + cluster_node_set.add(split_str[0].replace('"','')) + if len(cluster_node_set) > 0: + resize_cluster_node_dict.update({cluster: cluster_node_set}) + out = subprocess.Popen(["/opt/oci-hpc/bin/resize.sh list"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + permanent_cluster = '' + cluster_node_set = set() + for i in range(len(x)): + if str in x[i]: + permanent_cluster = metadata['displayName'].replace('-bastion','') + if permanent_cluster in cluster_names: + return cluster_names, resize_cluster_node_dict + else: + split_str = x[i].split() + cluster_node_set.add(split_str[0].replace('"','')) + if len(cluster_node_set) > 0: + resize_cluster_node_dict.update({permanent_cluster: cluster_node_set}) + cluster_names.add(permanent_cluster) + return cluster_names, resize_cluster_node_dict + + +# given a cluster name, return all the nodes in that cluster +def getNodesInClusters(cluster_name): + out = subprocess.Popen(["cat /etc/hosts | grep "+cluster_name+" | grep local.vcn | awk '{print $4}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + nodes = set() + x = stdout.split("\n") + for i in range(0,len(x)-1): + nodes.add(x[i]) + return nodes + + +# find out all available clusters +def getEtcClusterNames(): + out = subprocess.Popen(["cat /etc/hosts | grep \"END ANSIBLE MANAGED BLOCK\" | awk '{print $6}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + cluster_list = [] + for cluster in x: + if (cluster == "BASTION"): + continue + else: + cluster_list.append(cluster) + return cluster_list + + +def nodesFromEtcHosts(): + etc_cluster_list = getEtcClusterNames() + etc_node_cluster_dict = {} + etc_cluster_node_dict = {} + for etc_cluster in etc_cluster_list: + etc_nodes = getNodesInClusters(etc_cluster) + for n in etc_nodes: + etc_node_cluster_dict.update({n: etc_cluster}) + etc_cluster_node_dict.update({etc_cluster: etc_nodes}) + return etc_node_cluster_dict, etc_cluster_node_dict + + +def getConsoleNodeName(slurm_node_name): + out = subprocess.Popen(["cat /etc/hosts | grep "+slurm_node_name+" | grep local.vcn | awk '{print $4}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + node_name_output = stdout.split("\n") + del node_name_output[-1] + return node_name_output[0] + + +# get number of nodes and their state using slurm +def slurmGetNodes(etc_node_cluster_dict, path): + out = subprocess.run(['sinfo','-hNr','-o','\"%T %D %N\"'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + lines = out.stdout.decode("utf-8") + x = lines.split("\n") + del x[-1] + good_node_states = 
set() + good_node_states.add("allocated") + good_node_states.add("idle") + warning_node_dict = {} + slurm_node_cluster_dict = {} + for i in range(len(x)): + split_str = x[i].split() + node_state = split_str[0].replace('"','') + node_name = split_str[2].replace('"','') + proper_node_name = getConsoleNodeName(node_name) + if node_state not in good_node_states: + warning_node_dict.update({proper_node_name: node_state}) + slurm_node_cluster = etc_node_cluster_dict[proper_node_name] + slurm_node_cluster_dict.update({proper_node_name: slurm_node_cluster}) + return slurm_node_cluster_dict + + +def topologyGetNodes(etc_node_cluster_dict): + str1 = "SwitchName=inactive" + str2 = "Switches=" + out = subprocess.Popen(["cat /etc/slurm/topology.conf"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + topo_node_cluster_dict = {} + for i in range(len(x)): + if str1 in x[i] or str2 in x[i] or x[i].startswith("#"): + continue + else: + split_str = x[i].split() + node_name_str = split_str[1].rsplit("=") + node_name = node_name_str[1] + res = re.findall(r'\[([^]]*)\]', node_name) + if len(res) == 0: + # print(etc_node_cluster_dict) + topo_node_name = getConsoleNodeName(node_name) + topo_node_cluster = etc_node_cluster_dict[topo_node_name] + topo_node_cluster_dict.update({topo_node_name: topo_node_cluster}) + else: + out = subprocess.Popen(["scontrol show hostnames "+node_name],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + nodes = stdout.split("\n") + del nodes[-1] + for n in nodes: + oci_console_node_name = getConsoleNodeName(n) + topo_node_cluster = etc_node_cluster_dict[oci_console_node_name] + topo_node_cluster_dict.update({oci_console_node_name: topo_node_cluster}) + return topo_node_cluster_dict + + +def etcHostsSame(nodes, path): + out = subprocess.Popen(["linecount=`cat /etc/hosts | wc -l ` ; lines=$((linecount-3)) ; tail -n $lines /etc/hosts | md5sum"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + bastion_md5 = x[0].replace('"','') + md5_set = set() + md5_set.add(bastion_md5) + out = subprocess.Popen(["pdsh -w "+nodes+" 'linecount=`cat /etc/hosts | wc -l ` ; lines=$((linecount-3)) ; tail -n $lines /etc/hosts | md5sum'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + for i in range(len(x)): + split_str = x[i].split(':') + md5 = split_str[1].lstrip() + if md5 != bastion_md5: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/etcHosts.txt", "a") + f.write("/etc/hosts file does not match on " + split_str[0] + "\n") + f.close() + md5_set.add(md5) + if len(md5_set) > 1: + print("/etc/hosts on bastion and nodes is different") + else: + print("/etc/hosts is same on bastion and all nodes") + return path + + +def ociCommand(metadata, cluster_names): + comp_ocid=metadata['compartmentId'] + oci_node_cluster_dict = {} + node_list = [] + for cluster in cluster_names: + cn_summary = get_summary(comp_ocid,cluster) + cn_ocid = cn_summary.id + node_list = get_instances(comp_ocid, cn_ocid) + for node in node_list: + oci_node_cluster_dict.update({node: cluster}) + return oci_node_cluster_dict + + +def inventoryNodes(metadata, cluster_names): + # inventory_num_nodes = 0 + 
inventory_node_cluster_dict = {} + permanent_cluster = metadata['displayName'].replace('-bastion','') + for cluster in cluster_names: + if cluster == permanent_cluster: + inventory = "/etc/ansible/hosts" + inventory_dict = parse_inventory(inventory) + inv_list = inventory_dict["compute_configured"] + for i in inv_list: + split_str = i.split() + node_name = split_str[0] + inventory_node_cluster_dict.update({node_name: cluster}) + else: + inventory = "/opt/oci-hpc/autoscaling/clusters/"+cluster+"/inventory" + inventory_dict = parse_inventory(inventory) + inv_list = inventory_dict["compute_configured"] + for i in inv_list: + split_str = i.split() + node_name = split_str[0] + inventory_node_cluster_dict.update({node_name: cluster}) + return inventory_node_cluster_dict + + +def pcie_check(hostfile, path): + out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/pcie.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/pcie.sh\" ; done > "+path+"/pcie-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + +def gpu_throttle(hostfile, path): + out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/gpu_throttle.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/gpu_throttle.sh\" ; done > "+path+"/gpu-throttle-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + +############### + +parser = argparse.ArgumentParser(description = 'Perform these checks. \ + 1. /etc/hosts is same as bastion across all nodes. \ + 2. Check the number of nodes is consistent across slurm, topology.conf, OCI console, inventory files. \ + 3. PCIe bandwidth check. \ + 4. GPU Throttle check \ + Options: \ + --cluster_names : Give a file that contains all the cluster names for option 2 and this will be considered as source of truth. \ + If not given, then the cluster names in the directory /opt/oci-hpc/autoscaling/clusters/ along with any permanent cluster associated \ + with the bastion will be considered as source of truth. ') +parser.add_argument('-n', '--num_nodes', help = "Check the number of nodes is consistent across resize.sh, /etc/hosts, slurm, topology.conf, OCI console, inventory files. 
\ + Also check /etc/hosts is same as bastion across all hosts") +parser.add_argument('-cn', '--cluster_names', help = "Provide a file that contains list of all cluster names") +parser.add_argument('-p', '--pcie_file', help = "Provide a file that contains list of hosts on which to perform pcie check") +parser.add_argument('-g', '--gpu_throttle', help = "Provide a file that contains list of hosts on which to perform gpu throttle check") + +args = parser.parse_args() + +metadata=get_metadata() + +if args.num_nodes is not None: + cluster_names = getResizeClusterNames(args.cluster_names) + resize_cluster_names, resize_cluster_node_dict = getResizeNodes(metadata, cluster_names) + # print(resize_cluster_node_dict) + + +resize_node_cluster_dict = {} +for k, v in resize_cluster_node_dict.items(): + for v1 in v: + resize_node_cluster_dict[v1] = k +# print(resize_node_cluster_dict) + +etc_node_cluster_dict, etc_cluster_node_dict = nodesFromEtcHosts() + +path = createDir() +changeOwner(path) + +slurm_node_cluster_dict = slurmGetNodes(etc_node_cluster_dict, path) + +if resize_node_cluster_dict == etc_node_cluster_dict: + print("Number of nodes in /etc/hosts on bastion is same as resize") +else: + f = open(path+"/etcHostsNumNodes.txt", "a") + # Finding keys in resize_node_cluster_dict which are not in etc_node_cluster_dict + for key in resize_node_cluster_dict.keys(): + if not key in etc_node_cluster_dict: + f.write(key + " not in /etc/hosts" + "\n") + # Finding keys in etc_node_cluster_dict which are not in resize_node_cluster_dict + for key in etc_node_cluster_dict.keys(): + if not key in resize_node_cluster_dict: + f.write(key + " not in resize list" + "\n") + f.close() + +if resize_node_cluster_dict == slurm_node_cluster_dict: + print("Number of nodes from slurm is same as resize") +else: + f = open(path+"/slurmNumNodes.txt", "a") + # Finding keys in resize_node_cluster_dict which are not in slurm_node_cluster_dict + for key in resize_node_cluster_dict.keys(): + if not key in slurm_node_cluster_dict: + f.write(key + " not in slurm" + "\n") + # Finding keys in slurm_node_cluster_dict which are not in resize_node_cluster_dict + for key in slurm_node_cluster_dict.keys(): + if not key in resize_node_cluster_dict: + f.write(key + " not in resize list" + "\n") + f.close() + +topo_node_cluster_dict = topologyGetNodes(etc_node_cluster_dict) + +if resize_node_cluster_dict == topo_node_cluster_dict: + print("Number of nodes from topology is same as resize") +else: + f = open(path+"/topoNumNodes.txt", "a") + # Finding keys in resize_node_cluster_dict which are not in topo_node_cluster_dict + for key in resize_node_cluster_dict.keys(): + if not key in topo_node_cluster_dict: + f.write(key + " not in topology.conf" + "\n") + # Finding keys in topo_node_cluster_dict which are not in resize_node_cluster_dict + for key in topo_node_cluster_dict.keys(): + if not key in resize_node_cluster_dict: + f.write(key + " not in resize list" + "\n") + f.close() + +inventory_node_cluster_dict = inventoryNodes(metadata, resize_cluster_names) + +if resize_node_cluster_dict == inventory_node_cluster_dict: + print("Number of nodes from inventory is same as resize") +else: + f = open(path+"/inventoryNumNodes.txt", "a") + # Finding keys in resize_node_cluster_dict which are not in inventory_node_cluster_dict + for key in resize_node_cluster_dict.keys(): + if not key in inventory_node_cluster_dict: + f.write(key + " not in inventory file" + "\n") + # Finding keys in inventory_node_cluster_dict which are not in 
resize_node_cluster_dict + for key in inventory_node_cluster_dict.keys(): + if not key in resize_node_cluster_dict: + f.write(key + " not in resize list" + "\n") + f.close() + +oci_node_cluster_dict = ociCommand(metadata, resize_cluster_names) +if resize_node_cluster_dict == oci_node_cluster_dict: + print("Number of nodes from oci cli is same as resize") +else: + f = open(path+"/ociCliNumNodes.txt", "a") + # Finding keys in resize_node_cluster_dict which are not in oci_node_cluster_dict + for key in resize_node_cluster_dict.keys(): + if not key in oci_node_cluster_dict: + f.write(key + " not in oci cli" + "\n") + # Finding keys in oci_node_cluster_dict which are not in resize_node_cluster_dict + for key in oci_node_cluster_dict.keys(): + if not key in resize_node_cluster_dict: + f.write(key + " not in resize list" + "\n") + f.close() + + +if args.pcie_file is not None: + pcie_hostfile = args.pcie_file + pcie_check(pcie_hostfile, path) + +if args.gpu_throttle is not None: + gpu_hostfile = args.gpu_throttle + gpu_throttle(gpu_hostfile, path) + +if path is not None: + print(f"Output is in folder: {path}") + diff --git a/bin/pcie.sh b/bin/pcie.sh new file mode 100644 index 00000000..f15061ff --- /dev/null +++ b/bin/pcie.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +for dev in `/usr/sbin/lspci | grep ConnectX-5 | awk '{print $1}'` +do + echo ${dev} + sudo lspci -vvv -s ${dev} | grep LnkSta: +done + From 977a7a1f36ace0f92f40e2060c880a2d11df7fcc Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 13 Jan 2023 16:29:33 -0800 Subject: [PATCH 018/133] added error handling --- bin/num_nodes_same.py | 216 ++++++++++++++++++++++-------------------- 1 file changed, 113 insertions(+), 103 deletions(-) diff --git a/bin/num_nodes_same.py b/bin/num_nodes_same.py index 907c9a1a..c15e5a62 100644 --- a/bin/num_nodes_same.py +++ b/bin/num_nodes_same.py @@ -60,7 +60,7 @@ def get_metadata(): def get_summary(comp_ocid,cluster_name): - print(cluster_name) + # print(cluster_name) signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner() computeManagementClient = oci.core.ComputeManagementClient(config={}, signer=signer) cn_summaries = computeManagementClient.list_cluster_networks(comp_ocid,display_name=cluster_name).data @@ -70,7 +70,7 @@ def get_summary(comp_ocid,cluster_name): for cn_summary_tmp in cn_summaries: if cn_summary_tmp.lifecycle_state == "RUNNING": cn_summary = cn_summary_tmp - print(cn_summary) + # print(cn_summary) running_clusters = running_clusters + 1 elif cn_summary_tmp.lifecycle_state == "SCALING": scaling_clusters = scaling_clusters + 1 @@ -78,8 +78,8 @@ def get_summary(comp_ocid,cluster_name): print("There were multiple running clusters with this name, we selected the one with OCID:"+cn_summary.id) if scaling_clusters > 0: print("The cluster " +cluster_name+ " is scaling. 
Run this validation after it finishes scaling.") - print(cluster_name) - print(cn_summary) + # print(cluster_name) + # print(cn_summary) return cn_summary @@ -221,7 +221,7 @@ def getConsoleNodeName(slurm_node_name): # get number of nodes and their state using slurm -def slurmGetNodes(etc_node_cluster_dict, path): +def slurmGetNodes(etc_node_cluster_dict): out = subprocess.run(['sinfo','-hNr','-o','\"%T %D %N\"'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) lines = out.stdout.decode("utf-8") x = lines.split("\n") @@ -238,9 +238,12 @@ def slurmGetNodes(etc_node_cluster_dict, path): proper_node_name = getConsoleNodeName(node_name) if node_state not in good_node_states: warning_node_dict.update({proper_node_name: node_state}) - slurm_node_cluster = etc_node_cluster_dict[proper_node_name] - slurm_node_cluster_dict.update({proper_node_name: slurm_node_cluster}) - return slurm_node_cluster_dict + if proper_node_name in etc_node_cluster_dict: + slurm_node_cluster = etc_node_cluster_dict[proper_node_name] + slurm_node_cluster_dict.update({proper_node_name: slurm_node_cluster}) + else: + print(proper_node_name + " not found in /etc/hosts file") + return slurm_node_cluster_dict, warning_node_dict def topologyGetNodes(etc_node_cluster_dict): @@ -260,10 +263,12 @@ def topologyGetNodes(etc_node_cluster_dict): node_name = node_name_str[1] res = re.findall(r'\[([^]]*)\]', node_name) if len(res) == 0: - # print(etc_node_cluster_dict) topo_node_name = getConsoleNodeName(node_name) - topo_node_cluster = etc_node_cluster_dict[topo_node_name] - topo_node_cluster_dict.update({topo_node_name: topo_node_cluster}) + if topo_node_name in etc_node_cluster_dict: + topo_node_cluster = etc_node_cluster_dict[topo_node_name] + topo_node_cluster_dict.update({topo_node_name: topo_node_cluster}) + else: + print(topo_node_name + " not found in /etc/hosts file") else: out = subprocess.Popen(["scontrol show hostnames "+node_name],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) stdout,stderr = out.communicate() @@ -271,8 +276,11 @@ def topologyGetNodes(etc_node_cluster_dict): del nodes[-1] for n in nodes: oci_console_node_name = getConsoleNodeName(n) - topo_node_cluster = etc_node_cluster_dict[oci_console_node_name] - topo_node_cluster_dict.update({oci_console_node_name: topo_node_cluster}) + if oci_console_node_name in etc_cluster_node_dict: + topo_node_cluster = etc_node_cluster_dict[oci_console_node_name] + topo_node_cluster_dict.update({oci_console_node_name: topo_node_cluster}) + else: + print(oci_console_node_name + " not found in /etc/hosts file") return topo_node_cluster_dict @@ -312,15 +320,15 @@ def ociCommand(metadata, cluster_names): node_list = [] for cluster in cluster_names: cn_summary = get_summary(comp_ocid,cluster) - cn_ocid = cn_summary.id - node_list = get_instances(comp_ocid, cn_ocid) - for node in node_list: - oci_node_cluster_dict.update({node: cluster}) + if cn_summary is not None: + cn_ocid = cn_summary.id + node_list = get_instances(comp_ocid, cn_ocid) + for node in node_list: + oci_node_cluster_dict.update({node: cluster}) return oci_node_cluster_dict def inventoryNodes(metadata, cluster_names): - # inventory_num_nodes = 0 inventory_node_cluster_dict = {} permanent_cluster = metadata['displayName'].replace('-bastion','') for cluster in cluster_names: @@ -376,99 +384,101 @@ def gpu_throttle(hostfile, path): metadata=get_metadata() +path = createDir() +changeOwner(path) + if args.num_nodes is not None: + resize_cluster_names = [] + resize_cluster_node_dict = {} 
cluster_names = getResizeClusterNames(args.cluster_names) resize_cluster_names, resize_cluster_node_dict = getResizeNodes(metadata, cluster_names) - # print(resize_cluster_node_dict) + if len(resize_cluster_names) == 0 or len(resize_cluster_node_dict) == 0: + print("There are no clusters available") + else: + resize_node_cluster_dict = {} + for k, v in resize_cluster_node_dict.items(): + for v1 in v: + resize_node_cluster_dict[v1] = k -resize_node_cluster_dict = {} -for k, v in resize_cluster_node_dict.items(): - for v1 in v: - resize_node_cluster_dict[v1] = k -# print(resize_node_cluster_dict) + etc_node_cluster_dict, etc_cluster_node_dict = nodesFromEtcHosts() -etc_node_cluster_dict, etc_cluster_node_dict = nodesFromEtcHosts() + slurm_node_cluster_dict, warning_node_dict = slurmGetNodes(etc_node_cluster_dict) -path = createDir() -changeOwner(path) + if resize_node_cluster_dict == etc_node_cluster_dict: + print("Number of nodes in /etc/hosts on bastion is same as resize") + else: + f = open(path+"/etcHostsNumNodes.txt", "a") + # Finding keys in resize_node_cluster_dict which are not in etc_node_cluster_dict + for key in resize_node_cluster_dict.keys(): + if not key in etc_node_cluster_dict: + f.write(key + " not in /etc/hosts" + "\n") + # Finding keys in etc_node_cluster_dict which are not in resize_node_cluster_dict + for key in etc_node_cluster_dict.keys(): + if not key in resize_node_cluster_dict: + f.write(key + " not in resize list" + "\n") + f.close() -slurm_node_cluster_dict = slurmGetNodes(etc_node_cluster_dict, path) - -if resize_node_cluster_dict == etc_node_cluster_dict: - print("Number of nodes in /etc/hosts on bastion is same as resize") -else: - f = open(path+"/etcHostsNumNodes.txt", "a") - # Finding keys in resize_node_cluster_dict which are not in etc_node_cluster_dict - for key in resize_node_cluster_dict.keys(): - if not key in etc_node_cluster_dict: - f.write(key + " not in /etc/hosts" + "\n") - # Finding keys in etc_node_cluster_dict which are not in resize_node_cluster_dict - for key in etc_node_cluster_dict.keys(): - if not key in resize_node_cluster_dict: - f.write(key + " not in resize list" + "\n") - f.close() - -if resize_node_cluster_dict == slurm_node_cluster_dict: - print("Number of nodes from slurm is same as resize") -else: - f = open(path+"/slurmNumNodes.txt", "a") - # Finding keys in resize_node_cluster_dict which are not in slurm_node_cluster_dict - for key in resize_node_cluster_dict.keys(): - if not key in slurm_node_cluster_dict: - f.write(key + " not in slurm" + "\n") - # Finding keys in slurm_node_cluster_dict which are not in resize_node_cluster_dict - for key in slurm_node_cluster_dict.keys(): - if not key in resize_node_cluster_dict: - f.write(key + " not in resize list" + "\n") - f.close() - -topo_node_cluster_dict = topologyGetNodes(etc_node_cluster_dict) - -if resize_node_cluster_dict == topo_node_cluster_dict: - print("Number of nodes from topology is same as resize") -else: - f = open(path+"/topoNumNodes.txt", "a") - # Finding keys in resize_node_cluster_dict which are not in topo_node_cluster_dict - for key in resize_node_cluster_dict.keys(): - if not key in topo_node_cluster_dict: - f.write(key + " not in topology.conf" + "\n") - # Finding keys in topo_node_cluster_dict which are not in resize_node_cluster_dict - for key in topo_node_cluster_dict.keys(): - if not key in resize_node_cluster_dict: - f.write(key + " not in resize list" + "\n") - f.close() - -inventory_node_cluster_dict = inventoryNodes(metadata, resize_cluster_names) - -if 
resize_node_cluster_dict == inventory_node_cluster_dict: - print("Number of nodes from inventory is same as resize") -else: - f = open(path+"/inventoryNumNodes.txt", "a") - # Finding keys in resize_node_cluster_dict which are not in inventory_node_cluster_dict - for key in resize_node_cluster_dict.keys(): - if not key in inventory_node_cluster_dict: - f.write(key + " not in inventory file" + "\n") - # Finding keys in inventory_node_cluster_dict which are not in resize_node_cluster_dict - for key in inventory_node_cluster_dict.keys(): - if not key in resize_node_cluster_dict: - f.write(key + " not in resize list" + "\n") - f.close() - -oci_node_cluster_dict = ociCommand(metadata, resize_cluster_names) -if resize_node_cluster_dict == oci_node_cluster_dict: - print("Number of nodes from oci cli is same as resize") -else: - f = open(path+"/ociCliNumNodes.txt", "a") - # Finding keys in resize_node_cluster_dict which are not in oci_node_cluster_dict - for key in resize_node_cluster_dict.keys(): - if not key in oci_node_cluster_dict: - f.write(key + " not in oci cli" + "\n") - # Finding keys in oci_node_cluster_dict which are not in resize_node_cluster_dict - for key in oci_node_cluster_dict.keys(): - if not key in resize_node_cluster_dict: - f.write(key + " not in resize list" + "\n") - f.close() + if resize_node_cluster_dict == slurm_node_cluster_dict: + print("Number of nodes from slurm is same as resize") + else: + f = open(path+"/slurmNumNodes.txt", "a") + # Finding keys in resize_node_cluster_dict which are not in slurm_node_cluster_dict + for key in resize_node_cluster_dict.keys(): + if not key in slurm_node_cluster_dict: + f.write(key + " not in slurm" + "\n") + # Finding keys in slurm_node_cluster_dict which are not in resize_node_cluster_dict + for key in slurm_node_cluster_dict.keys(): + if not key in resize_node_cluster_dict: + f.write(key + " not in resize list" + "\n") + f.close() + + topo_node_cluster_dict = topologyGetNodes(etc_node_cluster_dict) + + if resize_node_cluster_dict == topo_node_cluster_dict: + print("Number of nodes from topology is same as resize") + else: + f = open(path+"/topoNumNodes.txt", "a") + # Finding keys in resize_node_cluster_dict which are not in topo_node_cluster_dict + for key in resize_node_cluster_dict.keys(): + if not key in topo_node_cluster_dict: + f.write(key + " not in topology.conf" + "\n") + # Finding keys in topo_node_cluster_dict which are not in resize_node_cluster_dict + for key in topo_node_cluster_dict.keys(): + if not key in resize_node_cluster_dict: + f.write(key + " not in resize list" + "\n") + f.close() + + inventory_node_cluster_dict = inventoryNodes(metadata, resize_cluster_names) + + if resize_node_cluster_dict == inventory_node_cluster_dict: + print("Number of nodes from inventory is same as resize") + else: + f = open(path+"/inventoryNumNodes.txt", "a") + # Finding keys in resize_node_cluster_dict which are not in inventory_node_cluster_dict + for key in resize_node_cluster_dict.keys(): + if not key in inventory_node_cluster_dict: + f.write(key + " not in inventory file" + "\n") + # Finding keys in inventory_node_cluster_dict which are not in resize_node_cluster_dict + for key in inventory_node_cluster_dict.keys(): + if not key in resize_node_cluster_dict: + f.write(key + " not in resize list" + "\n") + f.close() + + oci_node_cluster_dict = ociCommand(metadata, resize_cluster_names) + if resize_node_cluster_dict == oci_node_cluster_dict: + print("Number of nodes from oci cli is same as resize") + else: + f = 
open(path+"/ociCliNumNodes.txt", "a") + # Finding keys in resize_node_cluster_dict which are not in oci_node_cluster_dict + for key in resize_node_cluster_dict.keys(): + if not key in oci_node_cluster_dict: + f.write(key + " not in oci cli" + "\n") + # Finding keys in oci_node_cluster_dict which are not in resize_node_cluster_dict + for key in oci_node_cluster_dict.keys(): + if not key in resize_node_cluster_dict: + f.write(key + " not in resize list" + "\n") + f.close() if args.pcie_file is not None: From 7268da03359ad58a07a8735ebd10e003e7337040 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 13 Jan 2023 18:50:35 -0800 Subject: [PATCH 019/133] added etcHosts md5sum and handle instance pools --- bin/num_nodes_same.py | 153 +++++++++++++++++++++++++++++------------- 1 file changed, 108 insertions(+), 45 deletions(-) diff --git a/bin/num_nodes_same.py b/bin/num_nodes_same.py index c15e5a62..9a2cedfd 100644 --- a/bin/num_nodes_same.py +++ b/bin/num_nodes_same.py @@ -60,9 +60,30 @@ def get_metadata(): def get_summary(comp_ocid,cluster_name): - # print(cluster_name) + # # print(cluster_name) + # signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner() + # computeManagementClient = oci.core.ComputeManagementClient(config={}, signer=signer) + # cn_summaries = computeManagementClient.list_cluster_networks(comp_ocid,display_name=cluster_name).data + # running_clusters = 0 + # scaling_clusters = 0 + # cn_summary=None + # for cn_summary_tmp in cn_summaries: + # if cn_summary_tmp.lifecycle_state == "RUNNING": + # cn_summary = cn_summary_tmp + # # print(cn_summary) + # running_clusters = running_clusters + 1 + # elif cn_summary_tmp.lifecycle_state == "SCALING": + # scaling_clusters = scaling_clusters + 1 + # if running_clusters > 1: + # print("There were multiple running clusters with this name, we selected the one with OCID:"+cn_summary.id) + # if scaling_clusters > 0: + # print("The cluster " +cluster_name+ " is scaling. 
Run this validation after it finishes scaling.") + # # print(cluster_name) + # # print(cn_summary) + # return cn_summary signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner() computeManagementClient = oci.core.ComputeManagementClient(config={}, signer=signer) + CN = True cn_summaries = computeManagementClient.list_cluster_networks(comp_ocid,display_name=cluster_name).data running_clusters = 0 scaling_clusters = 0 @@ -70,17 +91,35 @@ def get_summary(comp_ocid,cluster_name): for cn_summary_tmp in cn_summaries: if cn_summary_tmp.lifecycle_state == "RUNNING": cn_summary = cn_summary_tmp - # print(cn_summary) running_clusters = running_clusters + 1 elif cn_summary_tmp.lifecycle_state == "SCALING": scaling_clusters = scaling_clusters + 1 + if running_clusters == 0: + cn_summaries = computeManagementClient.list_instance_pools(comp_ocid,display_name=cluster_name).data + if len(cn_summaries) > 0: + CN = False + for cn_summary_tmp in cn_summaries: + if cn_summary_tmp.lifecycle_state == "RUNNING": + cn_summary = cn_summary_tmp + running_clusters = running_clusters + 1 + elif cn_summary_tmp.lifecycle_state == "SCALING": + scaling_clusters = scaling_clusters + 1 + if running_clusters == 0: + if scaling_clusters: + print("No running cluster was found but there is a cluster in SCALING mode, try rerunning in a moment") + else: + print("The cluster was not found") + return None,None,True if running_clusters > 1: print("There were multiple running clusters with this name, we selected the one with OCID:"+cn_summary.id) - if scaling_clusters > 0: - print("The cluster " +cluster_name+ " is scaling. Run this validation after it finishes scaling.") - # print(cluster_name) + if CN: + ip_summary=cn_summary.instance_pools[0] + else: + ip_summary=cn_summary + # print(CN) + # print(ip_summary) # print(cn_summary) - return cn_summary + return cn_summary,ip_summary,CN def get_instances(comp_ocid,cn_ocid): @@ -296,22 +335,23 @@ def etcHostsSame(nodes, path): stdout,stderr = out.communicate() x = stdout.split("\n") del x[-1] + str = "exit" + f = open(path+"/etcHostsMD5Sum.txt", "a") for i in range(len(x)): split_str = x[i].split(':') - md5 = split_str[1].lstrip() - if md5 != bastion_md5: - if path is None: - path = createDir() - changeOwner(path) - f = open(path+"/etcHosts.txt", "a") - f.write("/etc/hosts file does not match on " + split_str[0] + "\n") - f.close() - md5_set.add(md5) + if str in x[i]: + f.write(split_str[1] + " not ssh-able at the moment" + "\n") + continue + else: + md5 = split_str[1].lstrip() + if md5 != bastion_md5: + f.write("/etc/hosts file does not match on " + split_str[0] + "\n") + md5_set.add(md5) + f.close() if len(md5_set) > 1: print("/etc/hosts on bastion and nodes is different") else: - print("/etc/hosts is same on bastion and all nodes") - return path + print("/etc/hosts is same on bastion and all nodes that are ssh-able") def ociCommand(metadata, cluster_names): @@ -319,12 +359,17 @@ def ociCommand(metadata, cluster_names): oci_node_cluster_dict = {} node_list = [] for cluster in cluster_names: - cn_summary = get_summary(comp_ocid,cluster) + cn_summary,ip_summary,CN = get_summary(comp_ocid,cluster) if cn_summary is not None: cn_ocid = cn_summary.id node_list = get_instances(comp_ocid, cn_ocid) for node in node_list: oci_node_cluster_dict.update({node: cluster}) + elif ip_summary is not None: + cn_ocid = ip_summary.id + node_list = get_instances(comp_ocid, cn_ocid) + for node in node_list: + oci_node_cluster_dict.update({node: cluster}) return oci_node_cluster_dict @@ -366,17 
+411,16 @@ def gpu_throttle(hostfile, path): ############### parser = argparse.ArgumentParser(description = 'Perform these checks. \ - 1. /etc/hosts is same as bastion across all nodes. \ - 2. Check the number of nodes is consistent across slurm, topology.conf, OCI console, inventory files. \ - 3. PCIe bandwidth check. \ - 4. GPU Throttle check \ + 1. Check the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, inventory files. \ + 2. PCIe bandwidth check. \ + 3. GPU Throttle check \ Options: \ - --cluster_names : Give a file that contains all the cluster names for option 2 and this will be considered as source of truth. \ + --cluster_names : Give a file that contains all the cluster names for option 1 and this will be considered as source of truth. \ If not given, then the cluster names in the directory /opt/oci-hpc/autoscaling/clusters/ along with any permanent cluster associated \ with the bastion will be considered as source of truth. ') parser.add_argument('-n', '--num_nodes', help = "Check the number of nodes is consistent across resize.sh, /etc/hosts, slurm, topology.conf, OCI console, inventory files. \ Also check /etc/hosts is same as bastion across all hosts") -parser.add_argument('-cn', '--cluster_names', help = "Provide a file that contains list of all cluster names") +parser.add_argument('-cn', '--cluster_names', help = "Provide a file that contains list of all cluster names for the above validation") parser.add_argument('-p', '--pcie_file', help = "Provide a file that contains list of hosts on which to perform pcie check") parser.add_argument('-g', '--gpu_throttle', help = "Provide a file that contains list of hosts on which to perform gpu throttle check") @@ -405,14 +449,21 @@ def gpu_throttle(hostfile, path): slurm_node_cluster_dict, warning_node_dict = slurmGetNodes(etc_node_cluster_dict) + topo_node_cluster_dict = topologyGetNodes(etc_node_cluster_dict) + + inventory_node_cluster_dict = inventoryNodes(metadata, resize_cluster_names) + + oci_node_cluster_dict = ociCommand(metadata, resize_cluster_names) + if resize_node_cluster_dict == etc_node_cluster_dict: print("Number of nodes in /etc/hosts on bastion is same as resize") else: - f = open(path+"/etcHostsNumNodes.txt", "a") + f = open(path+"/etcHostsNumNodes.txt", "a") # Finding keys in resize_node_cluster_dict which are not in etc_node_cluster_dict - for key in resize_node_cluster_dict.keys(): - if not key in etc_node_cluster_dict: - f.write(key + " not in /etc/hosts" + "\n") + # commenting this for loop for now as we want those nodes that are there in etc hosts but not in resize as resize is the source of truth + # for key in resize_node_cluster_dict.keys(): + # if not key in etc_node_cluster_dict: + # f.write(key + " not in /etc/hosts" + "\n") # Finding keys in etc_node_cluster_dict which are not in resize_node_cluster_dict for key in etc_node_cluster_dict.keys(): if not key in resize_node_cluster_dict: @@ -422,64 +473,76 @@ def gpu_throttle(hostfile, path): if resize_node_cluster_dict == slurm_node_cluster_dict: print("Number of nodes from slurm is same as resize") else: - f = open(path+"/slurmNumNodes.txt", "a") + f = open(path+"/slurmNumNodes.txt", "a") # Finding keys in resize_node_cluster_dict which are not in slurm_node_cluster_dict - for key in resize_node_cluster_dict.keys(): - if not key in slurm_node_cluster_dict: - f.write(key + " not in slurm" + "\n") + # commenting this for loop for now as we want those nodes that are there in slurm but not in resize as resize is 
the source of truth + # for key in resize_node_cluster_dict.keys(): + # if not key in slurm_node_cluster_dict: + # f.write(key + " not in slurm" + "\n") # Finding keys in slurm_node_cluster_dict which are not in resize_node_cluster_dict for key in slurm_node_cluster_dict.keys(): if not key in resize_node_cluster_dict: f.write(key + " not in resize list" + "\n") f.close() - topo_node_cluster_dict = topologyGetNodes(etc_node_cluster_dict) + if len(warning_node_dict) > 0: + f = open(path+"/slurmWarnNodes.txt", "a") + for key in warning_node_dict.keys(): + f.write(key + " is is slurm state " + warning_node_dict[key] + "\n") + f.close() if resize_node_cluster_dict == topo_node_cluster_dict: print("Number of nodes from topology is same as resize") else: f = open(path+"/topoNumNodes.txt", "a") # Finding keys in resize_node_cluster_dict which are not in topo_node_cluster_dict - for key in resize_node_cluster_dict.keys(): - if not key in topo_node_cluster_dict: - f.write(key + " not in topology.conf" + "\n") + # commenting this for loop for now as we want those nodes that are there in topology.conf but not in resize as resize is the source of truth + # for key in resize_node_cluster_dict.keys(): + # if not key in topo_node_cluster_dict: + # f.write(key + " not in topology.conf" + "\n") # Finding keys in topo_node_cluster_dict which are not in resize_node_cluster_dict for key in topo_node_cluster_dict.keys(): if not key in resize_node_cluster_dict: f.write(key + " not in resize list" + "\n") f.close() - inventory_node_cluster_dict = inventoryNodes(metadata, resize_cluster_names) - if resize_node_cluster_dict == inventory_node_cluster_dict: print("Number of nodes from inventory is same as resize") else: f = open(path+"/inventoryNumNodes.txt", "a") # Finding keys in resize_node_cluster_dict which are not in inventory_node_cluster_dict - for key in resize_node_cluster_dict.keys(): - if not key in inventory_node_cluster_dict: - f.write(key + " not in inventory file" + "\n") + # commenting this for loop for now as we want those nodes that are there in inventory but not in resize as resize is the source of truth + # for key in resize_node_cluster_dict.keys(): + # if not key in inventory_node_cluster_dict: + # f.write(key + " not in inventory file" + "\n") # Finding keys in inventory_node_cluster_dict which are not in resize_node_cluster_dict for key in inventory_node_cluster_dict.keys(): if not key in resize_node_cluster_dict: f.write(key + " not in resize list" + "\n") f.close() - oci_node_cluster_dict = ociCommand(metadata, resize_cluster_names) if resize_node_cluster_dict == oci_node_cluster_dict: print("Number of nodes from oci cli is same as resize") else: f = open(path+"/ociCliNumNodes.txt", "a") # Finding keys in resize_node_cluster_dict which are not in oci_node_cluster_dict - for key in resize_node_cluster_dict.keys(): - if not key in oci_node_cluster_dict: - f.write(key + " not in oci cli" + "\n") + # commenting this for loop for now as we want those nodes that are there in oci cli but not in resize as resize is the source of truth + # for key in resize_node_cluster_dict.keys(): + # if not key in oci_node_cluster_dict: + # f.write(key + " not in oci cli" + "\n") # Finding keys in oci_node_cluster_dict which are not in resize_node_cluster_dict for key in oci_node_cluster_dict.keys(): if not key in resize_node_cluster_dict: f.write(key + " not in resize list" + "\n") f.close() + node_list = list(map(' '.join, resize_cluster_node_dict.values())) + nodes_space = ' '.join(str(s) for s in node_list) + 
split_str = nodes_space.split() + nodes_comma = ','.join(str(s) for s in split_str) + print(nodes_comma) + etcHostsSame(nodes_comma, path) + if args.pcie_file is not None: pcie_hostfile = args.pcie_file From 5f28bd656d0e4da5f3afc8305606af49da849d97 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 13 Jan 2023 20:32:26 -0800 Subject: [PATCH 020/133] add etc hosts md5 sum check standalone --- bin/num_nodes_same.py | 87 ++++++++++++------------------------------- 1 file changed, 24 insertions(+), 63 deletions(-) diff --git a/bin/num_nodes_same.py b/bin/num_nodes_same.py index 9a2cedfd..f7d667d5 100644 --- a/bin/num_nodes_same.py +++ b/bin/num_nodes_same.py @@ -60,27 +60,6 @@ def get_metadata(): def get_summary(comp_ocid,cluster_name): - # # print(cluster_name) - # signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner() - # computeManagementClient = oci.core.ComputeManagementClient(config={}, signer=signer) - # cn_summaries = computeManagementClient.list_cluster_networks(comp_ocid,display_name=cluster_name).data - # running_clusters = 0 - # scaling_clusters = 0 - # cn_summary=None - # for cn_summary_tmp in cn_summaries: - # if cn_summary_tmp.lifecycle_state == "RUNNING": - # cn_summary = cn_summary_tmp - # # print(cn_summary) - # running_clusters = running_clusters + 1 - # elif cn_summary_tmp.lifecycle_state == "SCALING": - # scaling_clusters = scaling_clusters + 1 - # if running_clusters > 1: - # print("There were multiple running clusters with this name, we selected the one with OCID:"+cn_summary.id) - # if scaling_clusters > 0: - # print("The cluster " +cluster_name+ " is scaling. Run this validation after it finishes scaling.") - # # print(cluster_name) - # # print(cn_summary) - # return cn_summary signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner() computeManagementClient = oci.core.ComputeManagementClient(config={}, signer=signer) CN = True @@ -116,9 +95,6 @@ def get_summary(comp_ocid,cluster_name): ip_summary=cn_summary.instance_pools[0] else: ip_summary=cn_summary - # print(CN) - # print(ip_summary) - # print(cn_summary) return cn_summary,ip_summary,CN @@ -178,7 +154,6 @@ def getResizeClusterNames(filepath): # this is the source of truth for total number of nodes in a cluster def getResizeNodes(metadata, cluster_names): - # total_nodes = 0 resize_cluster_node_dict = {} str = "ocid1.instance." 
for cluster in cluster_names: @@ -281,7 +256,9 @@ def slurmGetNodes(etc_node_cluster_dict): slurm_node_cluster = etc_node_cluster_dict[proper_node_name] slurm_node_cluster_dict.update({proper_node_name: slurm_node_cluster}) else: - print(proper_node_name + " not found in /etc/hosts file") + f = open(path+"/slurmNumNodes.txt", "a") + f.write(proper_node_name + " not found in /etc/hosts file" + "\n") + f.close() return slurm_node_cluster_dict, warning_node_dict @@ -307,7 +284,9 @@ def topologyGetNodes(etc_node_cluster_dict): topo_node_cluster = etc_node_cluster_dict[topo_node_name] topo_node_cluster_dict.update({topo_node_name: topo_node_cluster}) else: - print(topo_node_name + " not found in /etc/hosts file") + f = open(path+"/topoNumNodes.txt", "a") + f.write(topo_node_name + " not found in /etc/hosts file" + "\n") + f.close() else: out = subprocess.Popen(["scontrol show hostnames "+node_name],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) stdout,stderr = out.communicate() @@ -315,11 +294,13 @@ def topologyGetNodes(etc_node_cluster_dict): del nodes[-1] for n in nodes: oci_console_node_name = getConsoleNodeName(n) - if oci_console_node_name in etc_cluster_node_dict: + if oci_console_node_name in etc_node_cluster_dict: topo_node_cluster = etc_node_cluster_dict[oci_console_node_name] topo_node_cluster_dict.update({oci_console_node_name: topo_node_cluster}) else: - print(oci_console_node_name + " not found in /etc/hosts file") + f = open(path+"/topoNumNodes.txt", "a") + f.write(oci_console_node_name + " not found in /etc/hosts file" + "\n") + f.close() return topo_node_cluster_dict @@ -336,7 +317,6 @@ def etcHostsSame(nodes, path): x = stdout.split("\n") del x[-1] str = "exit" - f = open(path+"/etcHostsMD5Sum.txt", "a") for i in range(len(x)): split_str = x[i].split(':') if str in x[i]: @@ -345,9 +325,10 @@ def etcHostsSame(nodes, path): else: md5 = split_str[1].lstrip() if md5 != bastion_md5: + f = open(path+"/etcHostsMD5Sum.txt", "a") f.write("/etc/hosts file does not match on " + split_str[0] + "\n") + f.close() md5_set.add(md5) - f.close() if len(md5_set) > 1: print("/etc/hosts on bastion and nodes is different") else: @@ -414,15 +395,18 @@ def gpu_throttle(hostfile, path): 1. Check the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, inventory files. \ 2. PCIe bandwidth check. \ 3. GPU Throttle check \ + 4. Standalone /etc/hosts md5 sum validation \ Options: \ --cluster_names : Give a file that contains all the cluster names for option 1 and this will be considered as source of truth. \ If not given, then the cluster names in the directory /opt/oci-hpc/autoscaling/clusters/ along with any permanent cluster associated \ with the bastion will be considered as source of truth. ') parser.add_argument('-n', '--num_nodes', help = "Check the number of nodes is consistent across resize.sh, /etc/hosts, slurm, topology.conf, OCI console, inventory files. \ - Also check /etc/hosts is same as bastion across all hosts") + Also check /etc/hosts is same as bastion across all hosts. If -cn option is provided along with this, then that file will be considered. If not, nodes \ + resize will be considered. 
") parser.add_argument('-cn', '--cluster_names', help = "Provide a file that contains list of all cluster names for the above validation") parser.add_argument('-p', '--pcie_file', help = "Provide a file that contains list of hosts on which to perform pcie check") parser.add_argument('-g', '--gpu_throttle', help = "Provide a file that contains list of hosts on which to perform gpu throttle check") +parser.add_argument('-e', '--etc_hosts', help = "Provide a file that contains list of hosts on which to perform md5 sum check to match with bastion") args = parser.parse_args() @@ -459,12 +443,6 @@ def gpu_throttle(hostfile, path): print("Number of nodes in /etc/hosts on bastion is same as resize") else: f = open(path+"/etcHostsNumNodes.txt", "a") - # Finding keys in resize_node_cluster_dict which are not in etc_node_cluster_dict - # commenting this for loop for now as we want those nodes that are there in etc hosts but not in resize as resize is the source of truth - # for key in resize_node_cluster_dict.keys(): - # if not key in etc_node_cluster_dict: - # f.write(key + " not in /etc/hosts" + "\n") - # Finding keys in etc_node_cluster_dict which are not in resize_node_cluster_dict for key in etc_node_cluster_dict.keys(): if not key in resize_node_cluster_dict: f.write(key + " not in resize list" + "\n") @@ -474,12 +452,6 @@ def gpu_throttle(hostfile, path): print("Number of nodes from slurm is same as resize") else: f = open(path+"/slurmNumNodes.txt", "a") - # Finding keys in resize_node_cluster_dict which are not in slurm_node_cluster_dict - # commenting this for loop for now as we want those nodes that are there in slurm but not in resize as resize is the source of truth - # for key in resize_node_cluster_dict.keys(): - # if not key in slurm_node_cluster_dict: - # f.write(key + " not in slurm" + "\n") - # Finding keys in slurm_node_cluster_dict which are not in resize_node_cluster_dict for key in slurm_node_cluster_dict.keys(): if not key in resize_node_cluster_dict: f.write(key + " not in resize list" + "\n") @@ -495,12 +467,6 @@ def gpu_throttle(hostfile, path): print("Number of nodes from topology is same as resize") else: f = open(path+"/topoNumNodes.txt", "a") - # Finding keys in resize_node_cluster_dict which are not in topo_node_cluster_dict - # commenting this for loop for now as we want those nodes that are there in topology.conf but not in resize as resize is the source of truth - # for key in resize_node_cluster_dict.keys(): - # if not key in topo_node_cluster_dict: - # f.write(key + " not in topology.conf" + "\n") - # Finding keys in topo_node_cluster_dict which are not in resize_node_cluster_dict for key in topo_node_cluster_dict.keys(): if not key in resize_node_cluster_dict: f.write(key + " not in resize list" + "\n") @@ -510,12 +476,6 @@ def gpu_throttle(hostfile, path): print("Number of nodes from inventory is same as resize") else: f = open(path+"/inventoryNumNodes.txt", "a") - # Finding keys in resize_node_cluster_dict which are not in inventory_node_cluster_dict - # commenting this for loop for now as we want those nodes that are there in inventory but not in resize as resize is the source of truth - # for key in resize_node_cluster_dict.keys(): - # if not key in inventory_node_cluster_dict: - # f.write(key + " not in inventory file" + "\n") - # Finding keys in inventory_node_cluster_dict which are not in resize_node_cluster_dict for key in inventory_node_cluster_dict.keys(): if not key in resize_node_cluster_dict: f.write(key + " not in resize list" + "\n") @@ -525,12 
+485,6 @@ def gpu_throttle(hostfile, path): print("Number of nodes from oci cli is same as resize") else: f = open(path+"/ociCliNumNodes.txt", "a") - # Finding keys in resize_node_cluster_dict which are not in oci_node_cluster_dict - # commenting this for loop for now as we want those nodes that are there in oci cli but not in resize as resize is the source of truth - # for key in resize_node_cluster_dict.keys(): - # if not key in oci_node_cluster_dict: - # f.write(key + " not in oci cli" + "\n") - # Finding keys in oci_node_cluster_dict which are not in resize_node_cluster_dict for key in oci_node_cluster_dict.keys(): if not key in resize_node_cluster_dict: f.write(key + " not in resize list" + "\n") @@ -540,9 +494,16 @@ def gpu_throttle(hostfile, path): nodes_space = ' '.join(str(s) for s in node_list) split_str = nodes_space.split() nodes_comma = ','.join(str(s) for s in split_str) - print(nodes_comma) etcHostsSame(nodes_comma, path) +if args.num_nodes is None and args.etc_hosts is not None: + hostfile = args.etc_hosts + out = subprocess.Popen(["cat "+hostfile],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + nodes_comma = ','.join(str(s) for s in x) + etcHostsSame(nodes_comma, path) if args.pcie_file is not None: pcie_hostfile = args.pcie_file From 23ccc65339440dd84d37c37e13e359b7ddd3168a Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Mon, 16 Jan 2023 13:33:10 -0800 Subject: [PATCH 021/133] fixed file open and close statements --- bin/num_nodes_same.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/num_nodes_same.py b/bin/num_nodes_same.py index f7d667d5..0019a9da 100644 --- a/bin/num_nodes_same.py +++ b/bin/num_nodes_same.py @@ -320,7 +320,9 @@ def etcHostsSame(nodes, path): for i in range(len(x)): split_str = x[i].split(':') if str in x[i]: + f = open(path+"/etcHostsMD5Sum.txt", "a") f.write(split_str[1] + " not ssh-able at the moment" + "\n") + f.close() continue else: md5 = split_str[1].lstrip() From ac504e1f453fcf23e4582002757b172ec367d6e8 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Mon, 16 Jan 2023 20:46:57 -0800 Subject: [PATCH 022/133] fixed issue where grep from /etc/hosts was giving multiple names, added logic to get nodes from cluster names if provided for pcie and gpu --- bin/num_nodes_same.py | 374 ++++++++++++++++++++++++++++++------------ 1 file changed, 266 insertions(+), 108 deletions(-) diff --git a/bin/num_nodes_same.py b/bin/num_nodes_same.py index 0019a9da..484218d7 100644 --- a/bin/num_nodes_same.py +++ b/bin/num_nodes_same.py @@ -153,38 +153,40 @@ def getResizeClusterNames(filepath): # this is the source of truth for total number of nodes in a cluster -def getResizeNodes(metadata, cluster_names): - resize_cluster_node_dict = {} - str = "ocid1.instance." - for cluster in cluster_names: - out = subprocess.Popen(["/opt/oci-hpc/bin/resize.sh --cluster_name "+cluster],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) +def getResizeNodes(metadata, cluster_names, mode): + if mode == 1 or mode == 2: + resize_cluster_node_dict = {} + str = "ocid1.instance." 
+ for cluster in cluster_names: + out = subprocess.Popen(["/opt/oci-hpc/bin/resize.sh --cluster_name "+cluster],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + cluster_node_set = set() + for i in range(len(x)): + if str in x[i]: + split_str = x[i].split() + cluster_node_set.add(split_str[0].replace('"','')) + if len(cluster_node_set) > 0: + resize_cluster_node_dict.update({cluster: cluster_node_set}) + if mode == 2: + out = subprocess.Popen(["/opt/oci-hpc/bin/resize.sh list"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) stdout,stderr = out.communicate() x = stdout.split("\n") del x[-1] + permanent_cluster = '' cluster_node_set = set() for i in range(len(x)): if str in x[i]: - split_str = x[i].split() - cluster_node_set.add(split_str[0].replace('"','')) - if len(cluster_node_set) > 0: - resize_cluster_node_dict.update({cluster: cluster_node_set}) - out = subprocess.Popen(["/opt/oci-hpc/bin/resize.sh list"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) - stdout,stderr = out.communicate() - x = stdout.split("\n") - del x[-1] - permanent_cluster = '' - cluster_node_set = set() - for i in range(len(x)): - if str in x[i]: - permanent_cluster = metadata['displayName'].replace('-bastion','') - if permanent_cluster in cluster_names: - return cluster_names, resize_cluster_node_dict - else: - split_str = x[i].split() - cluster_node_set.add(split_str[0].replace('"','')) - if len(cluster_node_set) > 0: - resize_cluster_node_dict.update({permanent_cluster: cluster_node_set}) - cluster_names.add(permanent_cluster) + permanent_cluster = metadata['displayName'].replace('-bastion','') + if permanent_cluster in cluster_names: + return cluster_names, resize_cluster_node_dict + else: + split_str = x[i].split() + cluster_node_set.add(split_str[0].replace('"','')) + if len(cluster_node_set) > 0: + resize_cluster_node_dict.update({permanent_cluster: cluster_node_set}) + cluster_names.add(permanent_cluster) return cluster_names, resize_cluster_node_dict @@ -199,35 +201,20 @@ def getNodesInClusters(cluster_name): return nodes -# find out all available clusters -def getEtcClusterNames(): - out = subprocess.Popen(["cat /etc/hosts | grep \"END ANSIBLE MANAGED BLOCK\" | awk '{print $6}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) - stdout,stderr = out.communicate() - x = stdout.split("\n") - del x[-1] - cluster_list = [] - for cluster in x: - if (cluster == "BASTION"): - continue - else: - cluster_list.append(cluster) - return cluster_list - - -def nodesFromEtcHosts(): - etc_cluster_list = getEtcClusterNames() +def nodesFromEtcHosts(resize_cluster_names): etc_node_cluster_dict = {} etc_cluster_node_dict = {} - for etc_cluster in etc_cluster_list: - etc_nodes = getNodesInClusters(etc_cluster) + for cluster in resize_cluster_names: + etc_nodes = getNodesInClusters(cluster) for n in etc_nodes: - etc_node_cluster_dict.update({n: etc_cluster}) - etc_cluster_node_dict.update({etc_cluster: etc_nodes}) + etc_node_cluster_dict.update({n: cluster}) + etc_cluster_node_dict.update({cluster: etc_nodes}) return etc_node_cluster_dict, etc_cluster_node_dict def getConsoleNodeName(slurm_node_name): - out = subprocess.Popen(["cat /etc/hosts | grep "+slurm_node_name+" | grep local.vcn | awk '{print $4}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + name = 
slurm_node_name + ".local.vcn" + out = subprocess.Popen(["cat /etc/hosts | grep "+name+" | grep local.vcn | awk '{print $4}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) stdout,stderr = out.communicate() node_name_output = stdout.split("\n") del node_name_output[-1] @@ -235,7 +222,7 @@ def getConsoleNodeName(slurm_node_name): # get number of nodes and their state using slurm -def slurmGetNodes(etc_node_cluster_dict): +def slurmGetNodes(resize_cluster_names, all_node_cluster_dict, path): out = subprocess.run(['sinfo','-hNr','-o','\"%T %D %N\"'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) lines = out.stdout.decode("utf-8") x = lines.split("\n") @@ -250,19 +237,31 @@ def slurmGetNodes(etc_node_cluster_dict): node_state = split_str[0].replace('"','') node_name = split_str[2].replace('"','') proper_node_name = getConsoleNodeName(node_name) - if node_state not in good_node_states: - warning_node_dict.update({proper_node_name: node_state}) - if proper_node_name in etc_node_cluster_dict: - slurm_node_cluster = etc_node_cluster_dict[proper_node_name] - slurm_node_cluster_dict.update({proper_node_name: slurm_node_cluster}) + if proper_node_name is not None: + if node_state not in good_node_states: + warning_node_dict.update({proper_node_name: node_state}) + if proper_node_name in all_node_cluster_dict: + slurm_node_cluster = all_node_cluster_dict[proper_node_name] + if slurm_node_cluster in resize_cluster_names: + slurm_node_cluster_dict.update({proper_node_name: slurm_node_cluster}) + else: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/slurmNumNodes.txt", "a") + f.write(proper_node_name + " not found in resize" + "\n") + f.close() else: + if path is None: + path = createDir() + changeOwner(path) f = open(path+"/slurmNumNodes.txt", "a") - f.write(proper_node_name + " not found in /etc/hosts file" + "\n") + f.write(node_name + " not found in /etc/hosts file for getting the oci console name" + "\n") f.close() - return slurm_node_cluster_dict, warning_node_dict + return slurm_node_cluster_dict, warning_node_dict, path -def topologyGetNodes(etc_node_cluster_dict): +def topologyGetNodes(resize_cluster_names, all_node_cluster_dict, path): str1 = "SwitchName=inactive" str2 = "Switches=" out = subprocess.Popen(["cat /etc/slurm/topology.conf"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) @@ -276,16 +275,29 @@ def topologyGetNodes(etc_node_cluster_dict): else: split_str = x[i].split() node_name_str = split_str[1].rsplit("=") - node_name = node_name_str[1] + node_name_1 = node_name_str[1].replace('"','') + node_name = node_name_1.replace(' ','') res = re.findall(r'\[([^]]*)\]', node_name) if len(res) == 0: topo_node_name = getConsoleNodeName(node_name) - if topo_node_name in etc_node_cluster_dict: - topo_node_cluster = etc_node_cluster_dict[topo_node_name] - topo_node_cluster_dict.update({topo_node_name: topo_node_cluster}) + if topo_node_name is not None: + if topo_node_name in all_node_cluster_dict: + topo_node_cluster = all_node_cluster_dict[topo_node_name] + if topo_node_cluster in resize_cluster_names: + topo_node_cluster_dict.update({topo_node_name: topo_node_cluster}) + else: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/topoNumNodes.txt", "a") + f.write(topo_node_name + " not found in resize" + "\n") + f.close() else: + if path is None: + path = createDir() + changeOwner(path) f = open(path+"/topoNumNodes.txt", "a") - f.write(topo_node_name + " not 
found in /etc/hosts file" + "\n") + f.write(node_name + " not found in /etc/hosts file for getting the oci console name" + "\n") f.close() else: out = subprocess.Popen(["scontrol show hostnames "+node_name],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) @@ -294,14 +306,26 @@ def topologyGetNodes(etc_node_cluster_dict): del nodes[-1] for n in nodes: oci_console_node_name = getConsoleNodeName(n) - if oci_console_node_name in etc_node_cluster_dict: - topo_node_cluster = etc_node_cluster_dict[oci_console_node_name] - topo_node_cluster_dict.update({oci_console_node_name: topo_node_cluster}) + if oci_console_node_name is not None: + if oci_console_node_name in all_node_cluster_dict: + topo_node_cluster = all_node_cluster_dict[oci_console_node_name] + if topo_node_cluster in resize_cluster_names: + topo_node_cluster_dict.update({oci_console_node_name: topo_node_cluster}) + else: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/topoNumNodes.txt", "a") + f.write(oci_console_node_name + " not found in resize" + "\n") + f.close() else: + if path is None: + path = createDir() + changeOwner(path) f = open(path+"/topoNumNodes.txt", "a") - f.write(oci_console_node_name + " not found in /etc/hosts file" + "\n") + f.write(node_name + " not found in /etc/hosts file for getting the oci console name" + "\n") f.close() - return topo_node_cluster_dict + return topo_node_cluster_dict, path def etcHostsSame(nodes, path): @@ -320,6 +344,9 @@ def etcHostsSame(nodes, path): for i in range(len(x)): split_str = x[i].split(':') if str in x[i]: + if path is None: + path = createDir() + changeOwner(path) f = open(path+"/etcHostsMD5Sum.txt", "a") f.write(split_str[1] + " not ssh-able at the moment" + "\n") f.close() @@ -327,6 +354,9 @@ def etcHostsSame(nodes, path): else: md5 = split_str[1].lstrip() if md5 != bastion_md5: + if path is None: + path = createDir() + changeOwner(path) f = open(path+"/etcHostsMD5Sum.txt", "a") f.write("/etc/hosts file does not match on " + split_str[0] + "\n") f.close() @@ -335,6 +365,7 @@ def etcHostsSame(nodes, path): print("/etc/hosts on bastion and nodes is different") else: print("/etc/hosts is same on bastion and all nodes that are ssh-able") + return path def ociCommand(metadata, cluster_names): @@ -391,6 +422,22 @@ def gpu_throttle(hostfile, path): out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/gpu_throttle.sh\" ; done > "+path+"/gpu-throttle-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) stdout,stderr = out.communicate() +def getResizeCluster(args, metadata): + resize_cluster_names = [] + resize_cluster_node_dict = {} + resize_node_cluster_dict = {} + resize_cluster_names = getResizeClusterNames(args.cluster_names) + resize_cluster_names, resize_cluster_node_dict = getResizeNodes(metadata, resize_cluster_names, 1) + + if len(resize_cluster_names) == 0 or len(resize_cluster_node_dict) == 0: + print("There are no clusters available") + else: + for k, v in resize_cluster_node_dict.items(): + for v1 in v: + resize_node_cluster_dict[v1] = k + + return resize_cluster_names, resize_cluster_node_dict, resize_node_cluster_dict + ############### parser = argparse.ArgumentParser(description = 'Perform these checks. 
\ @@ -414,28 +461,36 @@ def gpu_throttle(hostfile, path): metadata=get_metadata() -path = createDir() -changeOwner(path) +path = None -if args.num_nodes is not None: - resize_cluster_names = [] - resize_cluster_node_dict = {} - cluster_names = getResizeClusterNames(args.cluster_names) - resize_cluster_names, resize_cluster_node_dict = getResizeNodes(metadata, cluster_names) +resize_cluster_names = [] +resize_cluster_node_dict = {} +resize_node_cluster_dict = {} - if len(resize_cluster_names) == 0 or len(resize_cluster_node_dict) == 0: - print("There are no clusters available") - else: - resize_node_cluster_dict = {} - for k, v in resize_cluster_node_dict.items(): - for v1 in v: - resize_node_cluster_dict[v1] = k +if args.num_nodes is not None: + resize_cluster_names, resize_cluster_node_dict, resize_node_cluster_dict = getResizeCluster(args, metadata) + + if len(resize_cluster_names) > 0: + + # get all clusters and its corresponding nodes --> this is required to get the cluster name of the nodes from slurm and topology.conf \ + # so as to filter out clusters if -cn option is given + all_cluster_names = [] + all_cluster_node_dict = {} + all_node_cluster_dict = {} + all_cluster_names = getResizeClusterNames(None) + all_cluster_names, all_cluster_node_dict = getResizeNodes(metadata, all_cluster_names, 2) + if len(all_cluster_names) == 0 or len(all_cluster_node_dict) == 0: + print("There are no clusters available") + else: + for k, v in all_cluster_node_dict.items(): + for v1 in v: + all_node_cluster_dict[v1] = k - etc_node_cluster_dict, etc_cluster_node_dict = nodesFromEtcHosts() + etc_node_cluster_dict, etc_cluster_node_dict = nodesFromEtcHosts(resize_cluster_names) - slurm_node_cluster_dict, warning_node_dict = slurmGetNodes(etc_node_cluster_dict) + slurm_node_cluster_dict, warning_node_dict, path = slurmGetNodes(resize_cluster_names, all_node_cluster_dict, path) - topo_node_cluster_dict = topologyGetNodes(etc_node_cluster_dict) + topo_node_cluster_dict, path = topologyGetNodes(resize_cluster_names, all_node_cluster_dict, path) inventory_node_cluster_dict = inventoryNodes(metadata, resize_cluster_names) @@ -444,59 +499,117 @@ def gpu_throttle(hostfile, path): if resize_node_cluster_dict == etc_node_cluster_dict: print("Number of nodes in /etc/hosts on bastion is same as resize") else: - f = open(path+"/etcHostsNumNodes.txt", "a") + for key in resize_node_cluster_dict.keys(): + if not key in etc_node_cluster_dict: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/etcHostsNumNodes.txt", "a") + f.write(key + " is not in etc hosts" + "\n") + f.close() for key in etc_node_cluster_dict.keys(): if not key in resize_node_cluster_dict: - f.write(key + " not in resize list" + "\n") - f.close() + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/etcHostsNumNodes.txt", "a") + f.write(key + " is not in resize list" + "\n") + f.close() if resize_node_cluster_dict == slurm_node_cluster_dict: print("Number of nodes from slurm is same as resize") - else: - f = open(path+"/slurmNumNodes.txt", "a") + else: + for key in resize_node_cluster_dict.keys(): + if not key in slurm_node_cluster_dict: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/slurmNumNodes.txt", "a") + f.write(key + " is not in slurm" + "\n") + f.close() for key in slurm_node_cluster_dict.keys(): if not key in resize_node_cluster_dict: - f.write(key + " not in resize list" + "\n") - f.close() + if path is None: + path = createDir() + changeOwner(path) + f = 
open(path+"/slurmNumNodes.txt", "a") + f.write(key + " is not in resize list" + "\n") + f.close() if len(warning_node_dict) > 0: - f = open(path+"/slurmWarnNodes.txt", "a") for key in warning_node_dict.keys(): - f.write(key + " is is slurm state " + warning_node_dict[key] + "\n") - f.close() + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/slurmWarnNodes.txt", "a") + f.write(key + " is in slurm state " + warning_node_dict[key] + "\n") + f.close() if resize_node_cluster_dict == topo_node_cluster_dict: print("Number of nodes from topology is same as resize") else: - f = open(path+"/topoNumNodes.txt", "a") + for key in resize_node_cluster_dict.keys(): + if not key in topo_node_cluster_dict: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/topoNumNodes.txt", "a") + f.write(key + " is not in topology.conf file" + "\n") + f.close() for key in topo_node_cluster_dict.keys(): if not key in resize_node_cluster_dict: - f.write(key + " not in resize list" + "\n") - f.close() + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/topoNumNodes.txt", "a") + f.write(key + " is not in resize list" + "\n") + f.close() if resize_node_cluster_dict == inventory_node_cluster_dict: print("Number of nodes from inventory is same as resize") else: - f = open(path+"/inventoryNumNodes.txt", "a") + for key in resize_node_cluster_dict.keys(): + if not key in inventory_node_cluster_dict: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/inventoryNumNodes.txt", "a") + f.write(key + " is not in inventory file" + "\n") + f.close() for key in inventory_node_cluster_dict.keys(): if not key in resize_node_cluster_dict: - f.write(key + " not in resize list" + "\n") - f.close() + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/inventoryNumNodes.txt", "a") + f.write(key + " is not in resize list" + "\n") + f.close() if resize_node_cluster_dict == oci_node_cluster_dict: print("Number of nodes from oci cli is same as resize") else: - f = open(path+"/ociCliNumNodes.txt", "a") + for key in resize_node_cluster_dict.keys(): + if not key in oci_node_cluster_dict: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/ociCliNumNodes.txt", "a") + f.write(key + " not found using oci cli" + "\n") + f.close() for key in oci_node_cluster_dict.keys(): if not key in resize_node_cluster_dict: - f.write(key + " not in resize list" + "\n") - f.close() + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/ociCliNumNodes.txt", "a") + f.write(key + " is not in resize list" + "\n") + f.close() node_list = list(map(' '.join, resize_cluster_node_dict.values())) nodes_space = ' '.join(str(s) for s in node_list) split_str = nodes_space.split() nodes_comma = ','.join(str(s) for s in split_str) - etcHostsSame(nodes_comma, path) + path = etcHostsSame(nodes_comma, path) if args.num_nodes is None and args.etc_hosts is not None: hostfile = args.etc_hosts @@ -507,13 +620,58 @@ def gpu_throttle(hostfile, path): nodes_comma = ','.join(str(s) for s in x) etcHostsSame(nodes_comma, path) +hostFileWritten = False if args.pcie_file is not None: - pcie_hostfile = args.pcie_file - pcie_check(pcie_hostfile, path) + if args.pcie_file == 'y' or args.pcie_file == 'Y': + if args.cluster_names is not None: + if len(resize_node_cluster_dict) == 0: + resize_cluster_names, resize_cluster_node_dict, resize_node_cluster_dict = getResizeCluster(args, metadata) + if len(resize_cluster_names) == 0: + 
exit() + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/host.txt", "a") + for v in resize_node_cluster_dict.keys(): + hostFileWritten = True + f.write(str(v) + "\n") + f.close() + pcie_hostfile = path+"/host.txt" + pcie_check(pcie_hostfile, path) + else: + print("Provide cluster_names file or hosts file to run pcie check") + else: + pcie_hostfile = args.pcie_file + if path is None: + path = createDir() + changeOwner(path) + pcie_check(pcie_hostfile, path) if args.gpu_throttle is not None: - gpu_hostfile = args.gpu_throttle - gpu_throttle(gpu_hostfile, path) + if args.gpu_throttle == 'y' or args.gpu_throttle == 'Y': + if args.cluster_names is not None: + if hostFileWritten is False: + if len(resize_node_cluster_dict) == 0: + resize_cluster_names, resize_cluster_node_dict, resize_node_cluster_dict = getResizeCluster(args, metadata) + if len(resize_cluster_names) == 0: + exit() + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/host.txt", "a") + for v in resize_node_cluster_dict.keys(): + f.write(str(v) + "\n") + f.close() + gpu_hostfile = path+"/host.txt" + gpu_throttle(gpu_hostfile, path) + else: + print("Provide cluster_names file or hosts file to run gpu throttle check") + else: + gpu_hostfile = args.gpu_throttle + if path is None: + path = createDir() + changeOwner(path) + gpu_throttle(gpu_hostfile, path) if path is not None: print(f"Output is in folder: {path}") From 5ee0182e93717f3bb550103b5f429b00abed53e4 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Mon, 16 Jan 2023 22:32:30 -0800 Subject: [PATCH 023/133] rectified the grep from /etc/hosts --- bin/num_nodes_same.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/num_nodes_same.py b/bin/num_nodes_same.py index 484218d7..12f2de3e 100644 --- a/bin/num_nodes_same.py +++ b/bin/num_nodes_same.py @@ -214,7 +214,7 @@ def nodesFromEtcHosts(resize_cluster_names): def getConsoleNodeName(slurm_node_name): name = slurm_node_name + ".local.vcn" - out = subprocess.Popen(["cat /etc/hosts | grep "+name+" | grep local.vcn | awk '{print $4}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + out = subprocess.Popen(["cat /etc/hosts | grep "+name+" | awk '{print $4}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) stdout,stderr = out.communicate() node_name_output = stdout.split("\n") del node_name_output[-1] From 8ad1402e96c9e9b0e6d19e9b4ec622ed5903bdb8 Mon Sep 17 00:00:00 2001 From: Oguz Pastirmaci Date: Tue, 17 Jan 2023 09:24:20 -0800 Subject: [PATCH 024/133] Add PAM --- autoscaling/tf_init/bastion_update.tf | 3 +- autoscaling/tf_init/inventory.tpl | 1 + bastion.tf | 6 ++- conf/variables.tpl | 1 + inventory.tpl | 1 + playbooks/roles/pam/files/sshd | 20 ++++++++ playbooks/roles/pam/tasks/el-7.yml | 70 +++++++++++++++++++++++++++ playbooks/roles/pam/tasks/main.yml | 2 + playbooks/site.yml | 7 ++- schema.yaml | 7 +++ slurm_ha.tf | 2 + variables.tf | 3 +- 12 files changed, 117 insertions(+), 6 deletions(-) create mode 100644 playbooks/roles/pam/files/sshd create mode 100644 playbooks/roles/pam/tasks/el-7.yml create mode 100644 playbooks/roles/pam/tasks/main.yml diff --git a/autoscaling/tf_init/bastion_update.tf b/autoscaling/tf_init/bastion_update.tf index 18fc5889..6e9cb260 100755 --- a/autoscaling/tf_init/bastion_update.tf +++ b/autoscaling/tf_init/bastion_update.tf @@ -63,7 +63,8 @@ resource "local_file" "inventory" { privilege_group_name = var.privilege_group_name, latency_check = 
var.latency_check bastion_username = var.bastion_username, - compute_username = var.compute_username + compute_username = var.compute_username, + pam = var.pam }) filename = "${local.bastion_path}/inventory" } diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index 11d848f0..3d2bbcb8 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -62,3 +62,4 @@ privilege_group_name=${privilege_group_name} latency_check=${latency_check} compute_username=${compute_username} bastion_username=${bastion_username} +pam = ${pam} diff --git a/bastion.tf b/bastion.tf index 7ee4ede6..390205dd 100644 --- a/bastion.tf +++ b/bastion.tf @@ -248,7 +248,8 @@ resource "null_resource" "cluster" { pyxis = var.pyxis, privilege_sudo = var.privilege_sudo, privilege_group_name = var.privilege_group_name, - latency_check = var.latency_check + latency_check = var.latency_check, + pam = var.pam }) destination = "/opt/oci-hpc/playbooks/inventory" @@ -376,7 +377,8 @@ resource "null_resource" "cluster" { private_deployment = var.private_deployment, use_multiple_ads = var.use_multiple_ads, bastion_username = var.bastion_username, - compute_username = var.compute_username + compute_username = var.compute_username, + pam = var.pam }) destination = "/opt/oci-hpc/conf/variables.tf" diff --git a/conf/variables.tpl b/conf/variables.tpl index f87fc58c..de1b0233 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -27,6 +27,7 @@ variable "private_subnet" {default = "##PRIVATE_SUBNET##"} variable "slurm" { default = ${slurm} } variable "rack_aware" { default = ${rack_aware} } variable "pyxis" { default = ${pyxis} } +variable "pam" { default = ${pam} } variable "enroot" { default = ${enroot} } variable "slurm_nfs_path" { default = "${slurm_nfs_path}" } variable "spack" { default = ${spack} } diff --git a/inventory.tpl b/inventory.tpl index 8b941154..a0acf028 100755 --- a/inventory.tpl +++ b/inventory.tpl @@ -58,6 +58,7 @@ admin_username = ${admin_username} instance_type=permanent enroot=${enroot} pyxis=${pyxis} +pam=${pam} privilege_sudo=${privilege_sudo} privilege_group_name=${privilege_group_name} latency_check=${latency_check} diff --git a/playbooks/roles/pam/files/sshd b/playbooks/roles/pam/files/sshd new file mode 100644 index 00000000..186a3bf2 --- /dev/null +++ b/playbooks/roles/pam/files/sshd @@ -0,0 +1,20 @@ +#%PAM-1.0 +auth required pam_nologin.so +auth include password-auth +# Used with polkit to reauthorize users in remote sessions +-auth optional pam_reauthorize.so prepare +account required pam_nologin.so +account include password-auth +password include password-auth +-account required pam_slurm_adopt.so +# pam_selinux.so close should be the first session rule +session required pam_selinux.so close +session required pam_loginuid.so +# pam_selinux.so open should only be followed by sessions to be executed in the user context +session required pam_selinux.so open env_params +session required pam_namespace.so +session optional pam_keyinit.so force revoke +session include password-auth +session include postlogin +# Used with polkit to reauthorize users in remote sessions +-session optional pam_reauthorize.so prepare diff --git a/playbooks/roles/pam/tasks/el-7.yml b/playbooks/roles/pam/tasks/el-7.yml new file mode 100644 index 00000000..609194ab --- /dev/null +++ b/playbooks/roles/pam/tasks/el-7.yml @@ -0,0 +1,70 @@ +--- + +- name: Edit /etc/security/access.conf + become: true + blockinfile: + dest: /etc/security/access.conf + block: | + +:root:ALL + 
+:wheel:ALL + +:opc:ALL + -:ALL:ALL + +- name: Copy sshd file + become: true + copy: + src: sshd + dest: /etc/pam.d/sshd + +- name: Edit slurm.conf to add cgroup to TaskPlugin + become: true + lineinfile: + path: /etc/slurm/slurm.conf + regexp: "TaskPlugin=task/affinity" + line: "TaskPlugin=task/affinity,task/cgroup" + state: present + when: ('bastion' in group_names ) + +- name: Edit slurm.conf to add the PrologFlag + become: true + lineinfile: + path: /etc/slurm/slurm.conf + line: "PrologFlags=contain" + state: present + when: ('bastion' in group_names ) + +- name: Stop logind + systemd: + name: systemd-logind + state: stopped + enabled: no + masked: yes + +- name: restart slurm server + become: true + service: + name: '{{ item }}' + state: restarted + enabled: true + with_items: + - slurmdbd + - slurmctld + register: result + until: result is not failed + retries: 5 + delay: 5 + when: ('bastion' in group_names ) + +- name: restart slurm + become: true + service: + name: '{{ item }}' + state: restarted + enabled: true + with_items: + - slurmd + register: result + until: result is not failed + retries: 5 + delay: 5 + when: ('compute' in group_names ) diff --git a/playbooks/roles/pam/tasks/main.yml b/playbooks/roles/pam/tasks/main.yml new file mode 100644 index 00000000..aa9f7626 --- /dev/null +++ b/playbooks/roles/pam/tasks/main.yml @@ -0,0 +1,2 @@ +- include: el-7.yml + when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '7' \ No newline at end of file diff --git a/playbooks/site.yml b/playbooks/site.yml index 746adf8f..b46eb947 100755 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -292,4 +292,9 @@ name: slurm when: slurm|default(false)|bool - +- hosts: all + become: true + tasks: + - include_role: + name: pam + when: pam|default(false)|bool diff --git a/schema.yaml b/schema.yaml index 7ad5400b..0a2bd3b8 100755 --- a/schema.yaml +++ b/schema.yaml @@ -139,6 +139,7 @@ variableGroups: - ${spack} - ${monitoring} - ${enroot} + - ${pam} - title: "Hidden" variables: @@ -958,6 +959,12 @@ variables: default: false description: "Install Enroot, Nvidia Container Toolkit, and docker." + pam: + type: boolean + title: "Enable PAM" + default: false + description: "Enable PAM for the Slurm cluster. When PAM is enabled, users that are not in the sudo group will not be able to SSH into the compute nodes unless they have an active job in Slurm." + monitoring: type: boolean title: "Install HPC Cluster Monitoring Tools" diff --git a/slurm_ha.tf b/slurm_ha.tf index 2a63c274..89d0c21e 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -240,6 +240,7 @@ resource "null_resource" "cluster_backup" { admin_username = var.autoscaling_mysql_service ? 
var.admin_username : "root", enroot = var.enroot, pyxis = var.pyxis, + pam = var.pam, privilege_sudo = var.privilege_sudo, privilege_group_name = var.privilege_group_name, latency_check = var.latency_check @@ -364,6 +365,7 @@ resource "null_resource" "cluster_backup" { autoscaling_monitoring = var.autoscaling_monitoring, enroot = var.enroot, pyxis = var.pyxis, + pam = var.pam, privilege_sudo = var.privilege_sudo, privilege_group_name = var.privilege_group_name, latency_check = var.latency_check, diff --git a/variables.tf b/variables.tf index 27606904..5abbfae5 100755 --- a/variables.tf +++ b/variables.tf @@ -134,8 +134,7 @@ variable "nfs_options" {default = ""} variable "monitoring" { default = true } variable "enroot" { default = false } variable "pyxis" { default = false } - - +variable "pam" { default = false } variable "unsupported" { type=bool From fc07edc32948715d59a109296a5ef1ee296d45db Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 17 Jan 2023 11:52:32 -0700 Subject: [PATCH 025/133] Fix the scratch NFS option on autoscaling clusters --- bastion.tf | 1 + conf/variables.tpl | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bastion.tf b/bastion.tf index 6cc9379e..2d366c92 100644 --- a/bastion.tf +++ b/bastion.tf @@ -333,6 +333,7 @@ resource "null_resource" "cluster" { nfs = var.node_count > 0 ? local.cluster_instances_names[0] : "", scratch_nfs = var.use_scratch_nfs && var.node_count > 0, scratch_nfs_path = var.scratch_nfs_path, + use_scratch_nfs = var.use_scratch_nfs, slurm = var.slurm, rack_aware = var.rack_aware, slurm_nfs_path = var.add_nfs ? var.nfs_source_path : var.cluster_nfs_path diff --git a/conf/variables.tpl b/conf/variables.tpl index f87fc58c..71e695a1 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -13,7 +13,7 @@ variable "boot_volume_size" {default = "##BOOT##"} variable "use_marketplace_image" { default = "##USEMP##" } variable "use_old_marketplace_image" { default = "##USEOLDMP##" } variable "scratch_nfs_path" { default = "${scratch_nfs_path}" } -variable "use_scratch_nfs" { default = true } +variable "use_scratch_nfs" { default = ${use_scratch_nfs} } variable "cluster_nfs_path" {default = "${cluster_nfs_path}"} variable "use_cluster_nfs" { default = ${use_cluster_nfs} } variable "image" { default = "##IMAGE##" } From 8bb95954627085fa60ecb73500a1a00b6074a1e1 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 17 Jan 2023 11:53:38 -0700 Subject: [PATCH 026/133] Remove the RDMA address from /etc/hosts on remove --- playbooks/roles/destroy_unreachable/tasks/common.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/playbooks/roles/destroy_unreachable/tasks/common.yml b/playbooks/roles/destroy_unreachable/tasks/common.yml index 5ec25b83..111778da 100644 --- a/playbooks/roles/destroy_unreachable/tasks/common.yml +++ b/playbooks/roles/destroy_unreachable/tasks/common.yml @@ -46,6 +46,15 @@ with_items: "{{all_unreachable_nodes}}" ignore_unreachable: yes +- name: remove from /etc/hosts + become: true + lineinfile: + path: "/etc/hosts" + regexp: "{{item}}-rdma\\s" + state: absent + with_items: "{{all_unreachable_nodes}}" + ignore_unreachable: yes + - name: "remove from hostfile.rdma.{{ cluster_name }}" lineinfile: path: "/etc/opt/oci-hpc/hostfile.rdma.{{ cluster_name }}" From 8e296b5efd6ff959f65d047ad86c1b624c06279f Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 17 Jan 2023 11:54:03 -0700 Subject: [PATCH 027/133] Fix Slurm topology when using Cross Block --- 
 .../roles/destroy_unreachable/tasks/slurm-rack-aware.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml b/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml
index 485f865d..822745e8 100644
--- a/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml
+++ b/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml
@@ -47,7 +47,7 @@
   when: ( item.stdout_lines | length ) == 0

 - name: get UpperSwitchNames
-  shell: "scontrol show topology {{item}} | grep -v inactive | grep Switches= | awk '{print $1}' | cut -d \"=\" -f 2"
+  shell: "scontrol show topology {{item}} | grep -v inactive | grep Switches= | grep Level=1 | awk '{print $1}' | cut -d \"=\" -f 2"
   register: current_UpperSwitchName
   run_once: true
   delegate_to: 127.0.0.1
@@ -128,7 +128,7 @@
     state: present
     with_items: "{{unreachable_slurm_nodes}}"
     ignore_errors: yes
-    when: ( not upperswitchnames[item] is match("inactive-.*") ) and ( ( racks_on_switch_dict[item] | difference(switchnames[item]) | length ) > 0 ) and ( upperswitchnames[item] | length ) > 1
+    when: ( not upperswitchnames[item] is match("inactive-.*") ) and ( ( racks_on_switch_dict[item] | difference(switchnames[item]) | length ) > 0 ) and ( upperswitchnames[item] | length ) > 1 and ( nodes_on_switch[item] | length ) < 2
     run_once: true
     delegate_to: 127.0.0.1

From 6c5c8d42399b3fe60b20ab3c0c55751a0005c086 Mon Sep 17 00:00:00 2001
From: arnaudfroidmont
Date: Tue, 17 Jan 2023 11:54:33 -0700
Subject: [PATCH 028/133] Fix size in slurm in case of different subnets

---
 playbooks/roles/slurm/templates/slurm.conf.j2 | 25 ++++++++++++++-----
 .../roles/slurm/templates/topology.conf.j2    |  2 +-
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2
index 9b32b137..c4a6a83b 100755
--- a/playbooks/roles/slurm/templates/slurm.conf.j2
+++ b/playbooks/roles/slurm/templates/slurm.conf.j2
@@ -45,9 +45,9 @@ TopologyPlugin=topology/tree
 TreeWidth=2048
 SlurmctldParameters=enable_configless

-{% set size = hostvars[inventory_hostname]['private_subnet'] | ipaddr('size')%}
 {% for partition in queues %}
 {% for instance in partition.instance_types %}
+{% set size = instance.private_subnet | ipaddr('size')%}
 {% if instance.hyperthreading | bool %}
 {% set threadspercore = 2 %}
 {% else %}
@@ -113,10 +113,23 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boar

 {% for partition in queues %}
 {% if partition.default %}
-PartitionName={{partition.name}} Nodes={% for instance in partition.instance_types -%}
- {{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}],{%- endfor %} Default=YES MaxTime=INFINITE State=UP
+{% set nodesList = [] %}
+{% for instance in partition.instance_types %}
+{% set size = instance.private_subnet | ipaddr('size')%}
+{{ nodesList.append(partition.name+'-'+instance.instance_keyword+'-node-[1-'+size|string+']')}}
+{%- endfor %}
+PartitionName={{partition.name}} Nodes={{nodesList|join(',')}} Default=NO MaxTime=INFINITE State=UP
 {% else %}
-PartitionName={{partition.name}} Nodes={% for instance in partition.instance_types -%}
- {{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}],{%- endfor %} Default=NO MaxTime=INFINITE State=UP
+{% set nodesList = [] %}
+{% for instance in partition.instance_types %}
+{% set size = instance.private_subnet | ipaddr('size')%}
+{{ nodesList.append(partition.name+'-'+instance.instance_keyword+'-node-[1-'+size|string+']')}}
+{%- endfor %}
+PartitionName={{partition.name}} Nodes={{nodesList|join(',')}} Default=Yes MaxTime=INFINITE State=UP
 {% endif %}
-{% endfor %}
\ No newline at end of file
+{% endfor %}
+
+PartitionName=compute Nodes=adept-blk27-node-[1-4096],adept-blk29-node-[1-4096],adept-blk34-node-[1-4096],adept-blk36-node-[1-4096],adept-blk39-node-[1-4096],adept-blk40-node-[1-4096] Default=YES MaxTime=INFINITE State=UP
+PartitionName=A100-40GB Nodes=adept-blk8-node-[1-4096] Default=NO MaxTime=INFINITE State=UP
+PartitionName=cpu-dp Nodes=adept-cpudp-node-[1-2048] Default=NO MaxTime=INFINITE State=UP
+PartitionName=cpu-dev Nodes=adept-cpudev-node-[1-4096] Default=NO MaxTime=INFINITE State=UP
\ No newline at end of file
diff --git a/playbooks/roles/slurm/templates/topology.conf.j2 b/playbooks/roles/slurm/templates/topology.conf.j2
index 12bb02d2..7da9d362 100644
--- a/playbooks/roles/slurm/templates/topology.conf.j2
+++ b/playbooks/roles/slurm/templates/topology.conf.j2
@@ -1,7 +1,7 @@
 ### Topology File
-{% set size = hostvars[inventory_hostname]['private_subnet'] | ipaddr('size')%}
 {% for partition in queues %}
 {% for instance in partition.instance_types %}
+{% set size = instance.private_subnet | ipaddr('size')%}
 SwitchName=inactive-{{partition.name}}-{{instance.instance_keyword}} Nodes={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}]
 {% endfor %}
 {% endfor %}
\ No newline at end of file

From c6e7ef2d19af01416b42ba864cc8cc887f64b6f6 Mon Sep 17 00:00:00 2001
From: arnaudfroidmont
Date: Tue, 17 Jan 2023 11:56:34 -0700
Subject: [PATCH 029/133] Flush handler to make sure slurm is reconfigured

---
 playbooks/roles/slurm/tasks/compute-rack-aware.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/playbooks/roles/slurm/tasks/compute-rack-aware.yml b/playbooks/roles/slurm/tasks/compute-rack-aware.yml
index 5bc86dd1..6afbca24 100755
--- a/playbooks/roles/slurm/tasks/compute-rack-aware.yml
+++ b/playbooks/roles/slurm/tasks/compute-rack-aware.yml
@@ -234,6 +234,9 @@
   delegate_to: 127.0.0.1
   when: racks_left_list | length > 0

+- name: run handlers
+  meta: flush_handlers
+
 - name: start slurmd
   become: true
   service:

From df03448a1e52c18a7c88417857b31d012d230584 Mon Sep 17 00:00:00 2001
From: Oguz Pastirmaci
Date: Tue, 17 Jan 2023 11:58:14 -0800
Subject: [PATCH 030/133] Fix indent

---
 playbooks/site.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/playbooks/site.yml b/playbooks/site.yml
index b46eb947..0ae44a5d 100755
--- a/playbooks/site.yml
+++ b/playbooks/site.yml
@@ -297,4 +297,4 @@
   tasks:
     - include_role:
         name: pam
-        when: pam|default(false)|bool
+      when: pam|default(false)|bool

From e63528708cd22bd5d22ef32a3fdabb5186114df5 Mon Sep 17 00:00:00 2001
From: arnaudfroidmont
Date: Tue, 17 Jan 2023 14:16:09 -0700
Subject: [PATCH 031/133] Fix issue with simultaneous resize

---
 bin/resize.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bin/resize.py b/bin/resize.py
index 576d9298..b21ca6de 100644
--- a/bin/resize.py
+++ b/bin/resize.py
@@ -339,21 +339,21 @@ def getreachable(instances,username,delay=0):
     reachable_ips=[]

     for i in delays:
-        input_file=open('/tmp/input_hosts_to_check','w')
+        input_file=open('/tmp/input_hosts_to_check_'+cluster_name,'w')
         for node in instances:
             if not node['ip'] in reachable_ips:
                 input_file.write(node['ip']+"\n")
         input_file.close()
         my_env = os.environ.copy()
         my_env["ANSIBLE_HOST_KEY_CHECKING"] = "False"
-        p = subprocess.Popen(["/opt/oci-hpc/bin/find_reachable_hosts.sh","/tmp/input_hosts_to_check","/tmp/reachable_hosts",username,"0"],env=my_env,stderr = subprocess.PIPE, stdout=subprocess.PIPE)
+        p = subprocess.Popen(["/opt/oci-hpc/bin/find_reachable_hosts.sh","/tmp/input_hosts_to_check_"+cluster_name,"/tmp/reachable_hosts_"+cluster_name,username,"0"],env=my_env,stderr = subprocess.PIPE, stdout=subprocess.PIPE)
         while True:
             output = p.stdout.readline().decode()
             if output == '' and p.poll() is not None:
                 break
             if output:
                 print(output.strip())
-        output_file=open('/tmp/reachable_hosts','r')
+        output_file=open('/tmp/reachable_hosts_'+cluster_name,'r')
         for line in output_file:
             reachable_ips.append(line.strip())
         output_file.close()

From c47ad79f3699d76f1a7ce1981ec429ddfe5a8586 Mon Sep 17 00:00:00 2001
From: Oguz Pastirmaci
Date: Tue, 17 Jan 2023 15:16:24 -0800
Subject: [PATCH 032/133] Missing comma

---
 bastion.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bastion.tf b/bastion.tf
index 90531653..5565aabc 100644
--- a/bastion.tf
+++ b/bastion.tf
@@ -249,7 +249,7 @@ resource "null_resource" "cluster" {
      privilege_sudo = var.privilege_sudo,
      privilege_group_name = var.privilege_group_name,
      latency_check = var.latency_check,
-      pam = var.pam
+      pam = var.pam,
      inst_prin = var.inst_prin,
      region = var.region,
      tenancy_ocid = var.tenancy_ocid,

From f042220febe536a58756e02afe4a9ed943f3974b Mon Sep 17 00:00:00 2001
From: arnaudfroidmont
Date: Tue, 17 Jan 2023 18:47:35 -0700
Subject: [PATCH 033/133] Add a Slurm login node

---
 autoscaling/tf_init/bastion_update.tf         |   2 +
 autoscaling/tf_init/inventory.tpl             |   2 +
 bastion.tf                                    |   4 +
 conf/variables.tpl                            |   2 +
 inventory.tpl                                 |   2 +
 locals.tf                                     |   6 +
 login.tf                                      |  59 +++++
 marketplace.tf                                |  30 +++
 .../etc-hosts/templates/etc-hosts-bastion.j2  |   4 +
 playbooks/roles/slurm/tasks/el7.yml           |   6 +
 playbooks/roles/slurm/tasks/el8.yml           |   6 +
 playbooks/roles/slurm/tasks/login.yml         |  86 +++++++
 playbooks/roles/slurm/tasks/ubuntu.yml        |   4 +
 playbooks/roles/slurm/templates/slurm.conf.j2 |  11 +-
 playbooks/roles/slurm/vars/centos_vars.yml    |   5 +
 playbooks/roles/slurm/vars/el_vars.yml        |   5 +
 playbooks/roles/slurm/vars/ubuntu_vars.yml    |   3 +
 playbooks/site.yml                            |  13 +-
 schema.yaml                                   | 232 +++++++++++++++++-
 slurm_ha.tf                                   |   7 +-
 variables.tf                                  |  45 +++-
 21 files changed, 521 insertions(+), 13 deletions(-)
 create mode 100644 login.tf
 create mode 100755 playbooks/roles/slurm/tasks/login.yml

diff --git a/autoscaling/tf_init/bastion_update.tf b/autoscaling/tf_init/bastion_update.tf
index 4f775e06..43f48d9d 100755
--- a/autoscaling/tf_init/bastion_update.tf
+++ b/autoscaling/tf_init/bastion_update.tf
@@ -22,6 +22,8 @@ resource "local_file" "inventory" {
     bastion_ip = var.bastion_ip,
     backup_name = var.backup_name,
     backup_ip = var.backup_ip,
+    login_name = var.login_name,
+    login_ip = var.login_ip,
     compute = var.node_count > 0 ?
zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = var.public_subnet, private_subnet = var.private_subnet, diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index fcc97be2..da0aea71 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -2,6 +2,8 @@ ${bastion_name} ansible_host=${bastion_ip} ansible_user=${bastion_username} role=bastion [slurm_backup] %{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${bastion_username} role=bastion%{ endif } +[login] +%{ if login_name != "" }${login_name} ansible_host=${login_ip} ansible_user=${compute_username} role=login%{ endif } [compute_to_add] [compute_configured] %{ for host, ip in compute ~} diff --git a/bastion.tf b/bastion.tf index e6173cea..411b651e 100644 --- a/bastion.tf +++ b/bastion.tf @@ -201,6 +201,8 @@ resource "null_resource" "cluster" { bastion_ip = oci_core_instance.bastion.private_ip, backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "", backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", + login_name = var.login_node ? oci_core_instance.login[0].display_name : "", + login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = data.oci_core_subnet.public_subnet.cidr_block, private_subnet = data.oci_core_subnet.private_subnet.cidr_block, @@ -330,6 +332,8 @@ resource "null_resource" "cluster" { bastion_ip = oci_core_instance.bastion.private_ip, backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "", backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", + login_name = var.login_node ? oci_core_instance.login[0].display_name : "", + login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = data.oci_core_subnet.public_subnet.cidr_block, public_subnet_id = local.bastion_subnet_id, diff --git a/conf/variables.tpl b/conf/variables.tpl index 71e695a1..d69d25bb 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -82,6 +82,8 @@ variable "bastion_name" {default = "${bastion_name}"} variable "bastion_ip" {default = "${bastion_ip}"} variable "backup_name" {default = "${backup_name}"} variable "backup_ip" {default = "${backup_ip}"} +variable "login_name" {default = "${login_name}"} +variable "login_ip" {default = "${login_ip}"} variable "scripts_folder" {default = "/opt/oci-hpc/bin/"} variable "autoscaling_folder" {default = "/opt/oci-hpc/autoscaling/"} variable "cluster_block_volume_size" {default="${cluster_block_volume_size}"} diff --git a/inventory.tpl b/inventory.tpl index 5ddb3c32..197fd94f 100755 --- a/inventory.tpl +++ b/inventory.tpl @@ -2,6 +2,8 @@ ${bastion_name} ansible_host=${bastion_ip} ansible_user=${bastion_username} role=bastion [slurm_backup] %{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${compute_username} role=bastion%{ endif } +[login] +%{ if login_name != "" }${login_name} ansible_host=${login_ip} ansible_user=${compute_username} role=login%{ endif } [compute_to_add] [compute_configured] %{ for host, ip in compute ~} diff --git a/locals.tf b/locals.tf index 016f1800..99ca6466 100755 --- a/locals.tf +++ b/locals.tf @@ -5,6 +5,7 @@ locals { image_ocid = var.unsupported ? 
var.image_ocid : var.image custom_bastion_image_ocid = var.unsupported_bastion ? var.unsupported_bastion_image : var.custom_bastion_image + custom_login_image_ocid = var.unsupported_login ? var.unsupported_login_image : var.custom_login_image // ips of the instances cluster_instances_ips = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip @@ -25,6 +26,8 @@ locals { bastion_image = var.use_standard_image ? oci_core_app_catalog_subscription.bastion_mp_image_subscription[0].listing_resource_id : local.custom_bastion_image_ocid + login_image = var.use_standard_image_login ? oci_core_app_catalog_subscription.login_mp_image_subscription[0].listing_resource_id : local.custom_login_image_ocid + cluster_network_image = var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid instance_pool_image = ! var.cluster_network && var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid @@ -32,6 +35,8 @@ locals { // image = (var.cluster_network && var.use_marketplace_image == true) || (var.cluster_network == false && var.use_marketplace_image == false) ? var.image : data.oci_core_images.linux.images.0.id is_bastion_flex_shape = length(regexall(".*VM.*.*Flex$", var.bastion_shape)) > 0 ? [var.bastion_ocpus]:[] + is_login_flex_shape = length(regexall(".*VM.*.*Flex$", var.login_shape)) > 0 ? [var.login_ocpus]:[] + is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [var.instance_pool_ocpus]:[] bastion_mount_ip = var.bastion_block ? element(concat(oci_core_volume_attachment.bastion_volume_attachment.*.ipv4, [""]), 0) : "none" @@ -48,6 +53,7 @@ locals { cluster_ocid = var.node_count > 0 ? var.cluster_network ? oci_core_cluster_network.cluster_network[0].id : oci_core_instance_pool.instance_pool[0].id : "" host = var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip[0].ip_address : oci_core_instance.bastion.public_ip bastion_bool_ip = var.private_deployment ? false : true + login_bool_ip = var.private_deployment ? false : true bastion_subnet = var.private_deployment ? oci_core_subnet.private-subnet : oci_core_subnet.private-subnet private_subnet_cidr = var.private_deployment ? [var.public_subnet, var.private_subnet] : [var.private_subnet] host_backup = var.slurm_ha ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_backup[0].ip_address : oci_core_instance.backup[0].public_ip : "none" diff --git a/login.tf b/login.tf new file mode 100644 index 00000000..22200fc5 --- /dev/null +++ b/login.tf @@ -0,0 +1,59 @@ +resource "oci_core_volume" "login_volume" { + count = var.login_block && var.login_node ? 1 : 0 + availability_domain = var.login_ad + compartment_id = var.targetCompartment + display_name = "${local.cluster_name}-login" + size_in_gbs = var.login_block_volume_size + vpus_per_gb = split(".", var.login_block_volume_performance)[0] +} + + +resource "oci_core_volume_attachment" "login_volume_attachment" { + count = var.login_block && var.login_node ? 
1 : 0 + attachment_type = "iscsi" + volume_id = oci_core_volume.login_volume[0].id + instance_id = oci_core_instance.login[0].id + display_name = "${local.cluster_name}-login-volume-attachment" + device = "/dev/oracleoci/oraclevdb" +} + +resource "oci_core_instance" "login" { + count = var.login_node ? 1 : 0 + depends_on = [oci_core_subnet.public-subnet] + availability_domain = var.login_ad + compartment_id = var.targetCompartment + shape = var.login_shape + + dynamic "shape_config" { + for_each = local.is_login_flex_shape + content { + ocpus = shape_config.value + memory_in_gbs = var.login_custom_memory ? var.login_memory : 16 * shape_config.value + } + } + agent_config { + is_management_disabled = true + } + display_name = "${local.cluster_name}-login" + + freeform_tags = { + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name + } + + metadata = { + ssh_authorized_keys = "${var.ssh_key}\n${tls_private_key.ssh.public_key_openssh}" + user_data = base64encode(data.template_file.bastion_config.rendered) + } + source_details { +// source_id = var.use_standard_image ? data.oci_core_images.linux.images.0.id : local.custom_bastion_image_ocid + source_id = local.login_image + boot_volume_size_in_gbs = var.login_boot_volume_size + source_type = "image" + } + + create_vnic_details { + subnet_id = local.bastion_subnet_id + assign_public_ip = local.login_bool_ip + } +} diff --git a/marketplace.tf b/marketplace.tf index 3aee746f..ddc5140f 100755 --- a/marketplace.tf +++ b/marketplace.tf @@ -2,8 +2,10 @@ locals { // listing_number = split(".", var.marketplace_listing)[0] mp_listing_id = var.use_marketplace_image ? var.use_old_marketplace_image ? var.old_marketplace_listing_id : substr(var.marketplace_listing,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" mp_bastion_listing_id = var.use_standard_image ? var.use_old_marketplace_image ? var.old_marketplace_listing_id :var.marketplace_listing_id_HPC : "" + mp_login_listing_id = var.use_standard_image_login ? var.use_old_marketplace_image ? var.old_marketplace_listing_id :var.marketplace_listing_id_HPC : "" mp_version_id = var.use_old_marketplace_image ? var.marketplace_version_id[split(".", var.old_marketplace_listing)[0]] : var.marketplace_version_id[var.marketplace_listing] mp_bastion_version_id = var.use_old_marketplace_image ? var.marketplace_version_id[split(".", var.old_marketplace_listing)[0]] : var.marketplace_version_id["HPC_OL7"] + mp_login_version_id = var.use_old_marketplace_image ? var.marketplace_version_id[split(".", var.old_marketplace_listing)[0]] : var.marketplace_version_id["HPC_OL7"] } /* @@ -73,3 +75,31 @@ resource "oci_core_app_catalog_subscription" "bastion_mp_image_subscription" { create = "20m" } } + +data "oci_core_app_catalog_listing_resource_versions" "login_app_catalog_listing_resource_versions" { + count = var.use_standard_image_login ? 1 : 0 + listing_id = local.mp_login_listing_id +} + +resource "oci_core_app_catalog_listing_resource_version_agreement" "login_mp_image_agreement" { + count = var.use_standard_image_login ? 1 : 0 + + listing_id = local.mp_login_listing_id + listing_resource_version = local.mp_login_version_id + +} + +resource "oci_core_app_catalog_subscription" "login_mp_image_subscription" { + count = var.use_standard_image_login ? 
1 : 0 + compartment_id = var.targetCompartment + eula_link = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].eula_link + listing_id = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].listing_id + listing_resource_version = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].listing_resource_version + oracle_terms_of_use_link = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].oracle_terms_of_use_link + signature = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].signature + time_retrieved = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].time_retrieved + + timeouts { + create = "20m" + } +} diff --git a/playbooks/roles/etc-hosts/templates/etc-hosts-bastion.j2 b/playbooks/roles/etc-hosts/templates/etc-hosts-bastion.j2 index f5fdd943..0289b5c0 100755 --- a/playbooks/roles/etc-hosts/templates/etc-hosts-bastion.j2 +++ b/playbooks/roles/etc-hosts/templates/etc-hosts-bastion.j2 @@ -5,4 +5,8 @@ {% for item in groups['slurm_backup'] %} {% set short_name = hostvars[item]['ansible_fqdn'].split('.') %} {{ hostvars[item]['ansible_host'] }} {{ hostvars[item]['ansible_fqdn'] }} {{ short_name[0] }} backup +{% endfor %} +{% for item in groups['login'] %} +{% set short_name = hostvars[item]['ansible_fqdn'].split('.') %} +{{ hostvars[item]['ansible_host'] }} {{ hostvars[item]['ansible_fqdn'] }} {{ short_name[0] }} login {% endfor %} \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/el7.yml b/playbooks/roles/slurm/tasks/el7.yml index ad8ecca8..cd49e199 100755 --- a/playbooks/roles/slurm/tasks/el7.yml +++ b/playbooks/roles/slurm/tasks/el7.yml @@ -31,6 +31,12 @@ include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" when: ('compute' in group_names) and (not destroy|bool) +- name: run login server directives + vars: + slurm_repos: "epel,ol7_developer_EPEL" + include_tasks: login.yml + when: ('login' in group_names) and (not destroy|bool) and (initial| bool) + - name: cleanup include_tasks: cleanup.yml when: ('compute' in group_names) and (not destroy|bool) diff --git a/playbooks/roles/slurm/tasks/el8.yml b/playbooks/roles/slurm/tasks/el8.yml index 76bf75dd..5ce029b1 100755 --- a/playbooks/roles/slurm/tasks/el8.yml +++ b/playbooks/roles/slurm/tasks/el8.yml @@ -31,6 +31,12 @@ include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" when: ('compute' in group_names) and (not destroy|bool) +- name: run login server directives + vars: + slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" + include_tasks: login.yml + when: ('login' in group_names) and (not destroy|bool) and (initial| bool) + - name: cleanup include_tasks: cleanup.yml when: ('compute' in group_names) and (not destroy|bool) diff --git a/playbooks/roles/slurm/tasks/login.yml b/playbooks/roles/slurm/tasks/login.yml new file mode 100755 index 00000000..2a6c74ab --- /dev/null +++ b/playbooks/roles/slurm/tasks/login.yml @@ -0,0 +1,86 @@ +--- +- name: install SLURM login packages + vars: + package_name: '{{ slurm_login_packages }}' + package_repo: "{{ slurm_repos }}" + disable_gpg_check_var: True + include_role: + name: safe_yum + +- name: Render systemd units for slurm, slurmdbd and munge + become: true + template: + src: 'systemd/{{ item }}.service' + dest: '/lib/systemd/system/{{ item }}.service' + backup: "yes" + with_items: + - slurmd + when: ansible_os_family == 'Debian' + +- name: Create systemd unit dirs + become: true 
+ file: + name: '/etc/systemd/system/{{ item }}.service.d' + state: directory + with_items: + - munge + - slurmd + +- name: Render systemd units for slurmd and munge + become: true + template: + src: 'systemd/{{ item }}.service.d/unit.conf.j2' + dest: '/etc/systemd/system/{{ item }}.service.d/unit.conf' + backup: "yes" + with_items: + - munge + - slurmd + +- name: Create munge dir + become: true + file: + name: '{{ munge_conf_path }}' + state: directory + owner: munge + group: munge + mode: 0700 + +- name: copy munge.key to tmp + become: true + shell: + cmd: cp /etc/munge/munge.key /tmp/munge.key + warn: false + delegate_to: 127.0.0.1 + run_once: true + +- name: set permissions + become: true + shell: + cmd: chown {{ ansible_user }}:{{ ansible_user }} /tmp/munge.key + warn: false + delegate_to: 127.0.0.1 + run_once: true + +- name: Copy munge key + become: true + copy: + src: /tmp/munge.key + dest: /etc/munge/munge.key + owner: munge + group: munge + mode: '0400' + notify: restart munge + +- name: restart munge + become: true + service: + name: munge + state: restarted + enabled: true + +- name: start slurmd + become: true + service: + name: slurmd + state: restarted + enabled: true \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/ubuntu.yml b/playbooks/roles/slurm/tasks/ubuntu.yml index 57ccbbf0..b54ae961 100644 --- a/playbooks/roles/slurm/tasks/ubuntu.yml +++ b/playbooks/roles/slurm/tasks/ubuntu.yml @@ -14,6 +14,10 @@ include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" when: ('compute' in group_names) and (not destroy|bool) +- name: run login server directives + include_tasks: login.yml + when: ('login' in group_names) and (not destroy|bool) and (initial| bool) + - name: cleanup include_tasks: cleanup.yml when: ('compute' in group_names) and (not destroy|bool) diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index c4a6a83b..b805ecbb 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -118,18 +118,13 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boar {% set size = instance.private_subnet | ipaddr('size')%} {{ nodesList.append(partition.name+'-'+instance.instance_keyword+'-node-[1-'+size|string+']')}} {%- endfor %} -PartitionName={{partition.name}} Nodes={{nodesList|join(',')}} Default=NO MaxTime=INFINITE State=UP +PartitionName={{partition.name}} Nodes={{nodesList|join(',')}} Default=YES MaxTime=INFINITE State=UP {% else %} {% set nodesList = [] %} {% for instance in partition.instance_types %} {% set size = instance.private_subnet | ipaddr('size')%} {{ nodesList.append(partition.name+'-'+instance.instance_keyword+'-node-[1-'+size|string+']')}} {%- endfor %} -PartitionName={{partition.name}} Nodes={{nodesList|join(',')}} Default=Yes MaxTime=INFINITE State=UP +PartitionName={{partition.name}} Nodes={{nodesList|join(',')}} Default=NO MaxTime=INFINITE State=UP {% endif %} -{% endfor %} - -PartitionName=compute Nodes=adept-blk27-node-[1-4096],adept-blk29-node-[1-4096],adept-blk34-node-[1-4096],adept-blk36-node-[1-4096],adept-blk39-node-[1-4096],adept-blk40-node-[1-4096] Default=YES MaxTime=INFINITE State=UP -PartitionName=A100-40GB Nodes=adept-blk8-node-[1-4096] Default=NO MaxTime=INFINITE State=UP -PartitionName=cpu-dp Nodes=adept-cpudp-node-[1-2048] Default=NO MaxTime=INFINITE State=UP -PartitionName=cpu-dev Nodes=adept-cpudev-node-[1-4096] Default=NO MaxTime=INFINITE State=UP \ No newline at end of file +{% 
endfor %} \ No newline at end of file diff --git a/playbooks/roles/slurm/vars/centos_vars.yml b/playbooks/roles/slurm/vars/centos_vars.yml index d37fe9ad..7498933c 100644 --- a/playbooks/roles/slurm/vars/centos_vars.yml +++ b/playbooks/roles/slurm/vars/centos_vars.yml @@ -44,6 +44,11 @@ slurm_compute_packages: slurm_backup_server_packages: - "{{ download_path }}/slurm_rpms/slurm-centos-slurmctld-{{slurm_version}}.el7.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/slurm-centos-pam_slurm-{{slurm_version}}.el7.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/slurm-centos-libpmi-{{slurm_version}}.el7.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/slurm-centos-slurmd-{{slurm_version}}.el7.x86_64.rpm" + +slurm_login_packages: - "{{ download_path }}/slurm_rpms/slurm-centos-pam_slurm-{{slurm_version}}.el7.x86_64.rpm" - "{{ download_path }}/slurm_rpms/slurm-centos-libpmi-{{slurm_version}}.el7.x86_64.rpm" - "{{ download_path }}/slurm_rpms/slurm-centos-slurmd-{{slurm_version}}.el7.x86_64.rpm" \ No newline at end of file diff --git a/playbooks/roles/slurm/vars/el_vars.yml b/playbooks/roles/slurm/vars/el_vars.yml index a72d3681..cabe9ef0 100644 --- a/playbooks/roles/slurm/vars/el_vars.yml +++ b/playbooks/roles/slurm/vars/el_vars.yml @@ -43,6 +43,11 @@ slurm_compute_packages: slurm_backup_server_packages: - "{{ download_path }}/slurm_rpms/slurm-slurmctld-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/slurm-pam_slurm-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/slurm-libpmi-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/slurm-slurmd-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" + +slurm_login_packages: - "{{ download_path }}/slurm_rpms/slurm-pam_slurm-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" - "{{ download_path }}/slurm_rpms/slurm-libpmi-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" - "{{ download_path }}/slurm_rpms/slurm-slurmd-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" \ No newline at end of file diff --git a/playbooks/roles/slurm/vars/ubuntu_vars.yml b/playbooks/roles/slurm/vars/ubuntu_vars.yml index e313d4da..c820e9b8 100644 --- a/playbooks/roles/slurm/vars/ubuntu_vars.yml +++ b/playbooks/roles/slurm/vars/ubuntu_vars.yml @@ -20,4 +20,7 @@ slurm_compute_packages: - libpmi0 slurm_backup_server_packages: + - libpmi0 + +slurm_login_packages: - libpmi0 \ No newline at end of file diff --git a/playbooks/site.yml b/playbooks/site.yml index 60358f80..650f2d71 100755 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -120,7 +120,7 @@ -- hosts: compute +- hosts: compute, login become: true tasks: - include_role: @@ -175,6 +175,17 @@ name: iscsi when: bastion_block|default(false)|bool +- hosts: login + become: true + vars: + iscsi_ip: "{{ bastion_mount_ip }}" + tasks: + - include_role: + name: passwords + - include_role: + name: iscsi + when: login_block|default(false)|bool + - hosts: nfs become: true vars: diff --git a/schema.yaml b/schema.yaml index 3c207916..d33018d1 100755 --- a/schema.yaml +++ b/schema.yaml @@ -57,6 +57,24 @@ variableGroups: - ${compute_image_compartment} - ${image} - ${image_ocid} + - title: "Additionnal Login Node" + variables: + - ${login_node} + - ${login_ad} + - ${login_shape} + - ${login_ocpus} + - ${login_custom_memory} + - ${login_memory} + - ${login_boot_volume_size} + - 
${use_standard_image_login} + - ${unsupported_login} + - ${login_image_compartment} + - ${custom_login_image} + - ${unsupported_login_image} + - ${login_username} + - ${login_block} + - ${login_block_volume_size} + - ${login_block_volume_performance} - title: Autoscaling variables: - ${autoscaling} @@ -1160,4 +1178,216 @@ variables: required: true visible: and: - - ${privilege_sudo} \ No newline at end of file + - ${privilege_sudo} + + + + login_node: + type: boolean + title: "Login Node" + default: true + description: "Create an additional login node for users" + + login_ad: + type: oci:identity:availabilitydomain:name + dependsOn: + compartmentId: ${targetCompartment} + visible: + and: + - complexExpression + - ${login_node} + required: true + description: "Availability Domain for login node" + title: "Availability Domain For Login Node" + #default: ${ad} + + login_shape: + type: oci:core:instanceshape:name + dependsOn: + compartmentId: ${targetCompartment} + required: true + default: VM.Standard2.4 + visible: ${login_node} + + login_ocpus: + type: integer + description: Number of OCPU's for flex shape + minimum: 1 + maximum: 64 + default: 2 + visible: + and: + - or: + - eq: + - ${login_shape} + - "VM.Standard.E3.Flex" + - eq: + - ${login_shape} + - "VM.Standard.E4.Flex" + - eq: + - ${login_shape} + - "VM.Optimized3.Flex" + - eq: + - ${login_shape} + - "VM.Standard.A1.Flex" + - eq: + - ${login_shape} + - "VM.Standard3.Flex" + - ${login_node} + required: true + + login_custom_memory: + title: Use custom memory size + type: boolean + default: false + visible: + and: + - or: + - eq: + - ${login_shape} + - "VM.Standard.E3.Flex" + - eq: + - ${login_shape} + - "VM.Optimized3.Flex" + - eq: + - ${login_shape} + - "VM.Standard.E4.Flex" + - eq: + - ${login_shape} + - "VM.Standard.A1.Flex" + - eq: + - ${login_shape} + - "VM.Standard3.Flex" + - ${login_node} + login_memory: + title: Memory in GBS + type: integer + description: Number of memory for flex shape. Minimum 1GB per core. + minimum: 1 + maximum: 1024 + default: 16 + visible: + and: + - and: + - or: + - eq: + - ${login_shape} + - "VM.Standard.E3.Flex" + - eq: + - ${login_shape} + - "VM.Optimized3.Flex" + - eq: + - ${login_shape} + - "VM.Standard.E4.Flex" + - eq: + - ${login_shape} + - "VM.Standard.A1.Flex" + - eq: + - ${login_shape} + - "VM.Standard3.Flex" + - and: + - ${login_custom_memory} + - ${login_node} + required: true + + login_boot_volume_size: + type: integer + required: true + minimum: 50 + title: "Size of the boot volume in GB" + default: 50 + visible: ${login_node} + + login_block: + type: boolean + title: Additional block volume for shared space + default: false + visible: ${login_node} + + login_block_volume_size: + required: true + type: integer + title: "Size of the additional volume in GB" + default: 1000 + visible: + and: + - and: + - ${login_block} + - ${login_node} + login_block_volume_performance: + type: enum + title: "Block volume performance" + required: true + enum: + - "0. Lower performance" + - "10. Balanced performance" + - "20. High Performance" + default: "10. 
Balanced performance" + visible: + and: + - and: + - ${login_block} + - ${login_node} + use_standard_image_login: + type: boolean + title: "use standard login image" + description: > + "Use standard login image (Oracle Linux)" + default: true + visible: ${login_node} + + unsupported_login: + title: "Use unsupported image" + description: "Custom image ID for Login Node" + type: boolean + default: false + visible: + not: + - ${use_standard_image_login} + + login_image_compartment: + title: "login image compartment" + type: oci:identity:compartment:id + default: ${targetCompartment} + visible: + and: + - not: + - ${use_standard_image_login} + - not: + - ${unsupported_login} + required: true + + custom_login_image: + title: "Login Image ID" + description: "Custom image ID for login nodes. Please note that only Oracle Linux and Ubuntu 20.04 are supported as login image at this moment. " + type: oci:core:image:id + dependsOn: + compartmentId: ${login_image_compartment} + visible: + and: + - not: + - ${use_standard_image_login} + - not: + - ${unsupported_login} + required: true + unsupported_login_image: + title: "Image OCID" + description: "Custom image ID for login nodes" + type: string + required: true + visible: + and: + - ${unsupported_login} + - not: + - ${use_standard_image_login} + default: "image.ocid" + + login_username: + title: "Default username for login node" + description: "Custom image ID for login node" + type: string + default: "opc" + required: true + visible: + not: + - ${use_standard_image_login} \ No newline at end of file diff --git a/slurm_ha.tf b/slurm_ha.tf index 5d74d7ac..2abd9f1d 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -195,6 +195,8 @@ resource "null_resource" "cluster_backup" { bastion_ip = oci_core_instance.bastion.private_ip, backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "", backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", + login_name = var.login_node ? oci_core_instance.login[0].display_name : "", + login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = data.oci_core_subnet.public_subnet.cidr_block, private_subnet = data.oci_core_subnet.private_subnet.cidr_block, @@ -324,14 +326,17 @@ resource "null_resource" "cluster_backup" { bastion_ip = oci_core_instance.bastion.private_ip, backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "", backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", + login_name = var.login_node ? oci_core_instance.login[0].display_name : "", + login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = data.oci_core_subnet.public_subnet.cidr_block, public_subnet_id = local.bastion_subnet_id, private_subnet = data.oci_core_subnet.private_subnet.cidr_block, private_subnet_id = local.subnet_id, - nfs = var.node_count > 0 ? local.cluster_instances_names[0] : "", + nfs = var.node_count > 0 && var.use_scratch_nfs ? local.cluster_instances_names[0] : "", scratch_nfs = var.use_scratch_nfs && var.node_count > 0, scratch_nfs_path = var.scratch_nfs_path, + use_scratch_nfs = var.use_scratch_nfs, slurm = var.slurm, slurm_nfs_path = var.add_nfs ? 
var.nfs_source_path : var.cluster_nfs_path rack_aware = var.rack_aware, diff --git a/variables.tf b/variables.tf index 27606904..1ec4678a 100755 --- a/variables.tf +++ b/variables.tf @@ -12,10 +12,15 @@ variable "cluster_name" { default = "" } variable "bastion_ad" {} variable "bastion_shape" { default = "VM.Standard2.4" } variable "use_standard_image" { default= true } +variable "use_standard_image_login" { default= true } variable "custom_bastion_image" { type = string default = "image.ocid" } +variable "custom_login_image" { + type = string + default = "image.ocid" +} variable "bastion_boot_volume_size" {} variable "cluster_network_shape" { default = "BM.HPC2.36" } variable "instance_pool_shape" { default = "VM.Standard2.4" } @@ -26,6 +31,7 @@ variable "use_old_marketplace_image" { default = false} variable "image" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" } variable "image_ocid" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" } variable "unsupported_bastion_image" { default = "" } +variable "unsupported_login_image" { default = "" } variable "use_cluster_nfs" { default = true} variable "use_scratch_nfs" { default = true } variable "cluster_nfs_path" { default = "/nfs/cluster" } @@ -43,6 +49,10 @@ variable "private_subnet" { default = "172.16.4.0/22" } variable "ssh_cidr" { default = "0.0.0.0/0" } variable "slurm" { default = false } variable "slurm_ha" { default = false } +variable "login_node" { default = false } +variable "login_ad" {} +variable "login_shape" { default = "VM.Standard2.4" } +variable "login_boot_volume_size" {} variable "slurm_nfs" { default = false } variable "rack_aware" { default = false } variable "ldap" { default = true } @@ -51,8 +61,11 @@ variable "bastion_ocpus" { default = 2} variable "instance_pool_ocpus" { default = 2} variable "instance_pool_memory" { default = 16 } variable "instance_pool_custom_memory" { default = false } +variable "login_ocpus" { default = 2} variable "bastion_memory" { default = 16 } variable "bastion_custom_memory" { default = false } +variable "login_memory" { default = 16 } +variable "login_custom_memory" { default = false } variable "privilege_sudo" { default = true } variable "privilege_group_name" { default = "privilege" } @@ -106,6 +119,26 @@ variable "bastion_block" { variable "bastion_block_volume_size" { default = 1000 } + +variable "login_block_volume_performance" { +/* + Allowed values + "0. Lower performance" + "10. Balanced performance" + "20. High Performance" +*/ + +default = "10. 
Balanced performance" + +} + +variable "login_block" { + default = false +} + +variable "login_block_volume_size" { + default = 1000 +} variable "scratch_nfs_type_cluster" { default = "nvme"} variable "scratch_nfs_type_pool" { default = "none" } variable "cluster_block_volume_size" { default = "1000" } @@ -147,7 +180,10 @@ variable "unsupported_bastion" { type=bool default = false } - +variable "unsupported_login" { + type=bool + default = false +} variable "bastion_username" { type = string default = "opc" @@ -157,6 +193,10 @@ variable "compute_username" { type = string default = "opc" } +variable "login_username" { + type = string + default = "opc" +} variable "autoscaling_monitoring" { type= bool @@ -190,4 +230,5 @@ variable cluster_nfs_export {default = ""} variable "private_deployment" { default = false } -variable "localdisk" { default = true } \ No newline at end of file +variable "localdisk" { default = true } + From 8c101e6d11a69c8dff855bc7b07a7feef9b15f01 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 17 Jan 2023 21:54:17 -0700 Subject: [PATCH 034/133] Add PAM to the Slurm role --- playbooks/roles/pam/tasks/el-7.yml | 70 ------------------- playbooks/roles/pam/tasks/main.yml | 2 - playbooks/roles/{pam => slurm}/files/sshd | 0 playbooks/roles/slurm/tasks/common.yml | 6 +- playbooks/roles/slurm/tasks/common_pam.yml | 23 ++++++ playbooks/roles/slurm/templates/slurm.conf.j2 | 3 +- playbooks/site.yml | 7 -- 7 files changed, 30 insertions(+), 81 deletions(-) delete mode 100644 playbooks/roles/pam/tasks/el-7.yml delete mode 100644 playbooks/roles/pam/tasks/main.yml rename playbooks/roles/{pam => slurm}/files/sshd (100%) create mode 100644 playbooks/roles/slurm/tasks/common_pam.yml diff --git a/playbooks/roles/pam/tasks/el-7.yml b/playbooks/roles/pam/tasks/el-7.yml deleted file mode 100644 index 609194ab..00000000 --- a/playbooks/roles/pam/tasks/el-7.yml +++ /dev/null @@ -1,70 +0,0 @@ ---- - -- name: Edit /etc/security/access.conf - become: true - blockinfile: - dest: /etc/security/access.conf - block: | - +:root:ALL - +:wheel:ALL - +:opc:ALL - -:ALL:ALL - -- name: Copy sshd file - become: true - copy: - src: sshd - dest: /etc/pam.d/sshd - -- name: Edit slurm.conf to add cgroup to TaskPlugin - become: true - lineinfile: - path: /etc/slurm/slurm.conf - regexp: "TaskPlugin=task/affinity" - line: "TaskPlugin=task/affinity,task/cgroup" - state: present - when: ('bastion' in group_names ) - -- name: Edit slurm.conf to add the PrologFlag - become: true - lineinfile: - path: /etc/slurm/slurm.conf - line: "PrologFlags=contain" - state: present - when: ('bastion' in group_names ) - -- name: Stop logind - systemd: - name: systemd-logind - state: stopped - enabled: no - masked: yes - -- name: restart slurm server - become: true - service: - name: '{{ item }}' - state: restarted - enabled: true - with_items: - - slurmdbd - - slurmctld - register: result - until: result is not failed - retries: 5 - delay: 5 - when: ('bastion' in group_names ) - -- name: restart slurm - become: true - service: - name: '{{ item }}' - state: restarted - enabled: true - with_items: - - slurmd - register: result - until: result is not failed - retries: 5 - delay: 5 - when: ('compute' in group_names ) diff --git a/playbooks/roles/pam/tasks/main.yml b/playbooks/roles/pam/tasks/main.yml deleted file mode 100644 index aa9f7626..00000000 --- a/playbooks/roles/pam/tasks/main.yml +++ /dev/null @@ -1,2 +0,0 @@ -- include: el-7.yml - when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '7' \ No 
newline at end of file diff --git a/playbooks/roles/pam/files/sshd b/playbooks/roles/slurm/files/sshd similarity index 100% rename from playbooks/roles/pam/files/sshd rename to playbooks/roles/slurm/files/sshd diff --git a/playbooks/roles/slurm/tasks/common.yml b/playbooks/roles/slurm/tasks/common.yml index 4519a958..1f62992a 100755 --- a/playbooks/roles/slurm/tasks/common.yml +++ b/playbooks/roles/slurm/tasks/common.yml @@ -160,4 +160,8 @@ - name: Include pyxis prolog files include: common_pyxis.yml - when: pyxis|bool \ No newline at end of file + when: pyxis|bool + +- name: Run Pam settings + include: common_pam.yml + when: pam|bool \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/common_pam.yml b/playbooks/roles/slurm/tasks/common_pam.yml new file mode 100644 index 00000000..70b88f36 --- /dev/null +++ b/playbooks/roles/slurm/tasks/common_pam.yml @@ -0,0 +1,23 @@ +--- +- name: Edit /etc/security/access.conf + become: true + blockinfile: + dest: /etc/security/access.conf + block: | + +:root:ALL + +:wheel:ALL + +:opc:ALL + -:ALL:ALL + +- name: Copy sshd file + become: true + copy: + src: sshd + dest: /etc/pam.d/sshd + +- name: Stop logind + systemd: + name: systemd-logind + state: stopped + enabled: no + masked: yes diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index b805ecbb..ac7a3747 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -19,7 +19,8 @@ SlurmctldLogFile=/var/log/slurm/slurmctld.log SlurmdLogFile=/var/log/slurm/slurmd.log StateSaveLocation={{ slurm_nfs_path }}/spool/slurm SwitchType=switch/none -TaskPlugin=task/affinity +TaskPlugin=task/affinity{% if pam |bool %},task/cgroup{% endif %} +{% if pam |bool %}PrologFlags=contain{% endif %} InactiveLimit=0 KillWait=30 MinJobAge=300 diff --git a/playbooks/site.yml b/playbooks/site.yml index 0a8923f3..976c047b 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -314,10 +314,3 @@ - include_role: name: slurm when: slurm|default(false)|bool - -- hosts: all - become: true - tasks: - - include_role: - name: pam - when: pam|default(false)|bool From 286554c20395e777951c166772c28a1e9b3c1d3b Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 17 Jan 2023 22:15:59 -0700 Subject: [PATCH 035/133] Missing newline in slurm.conf --- playbooks/roles/slurm/templates/slurm.conf.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index ac7a3747..52759027 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -20,6 +20,7 @@ SlurmdLogFile=/var/log/slurm/slurmd.log StateSaveLocation={{ slurm_nfs_path }}/spool/slurm SwitchType=switch/none TaskPlugin=task/affinity{% if pam |bool %},task/cgroup{% endif %} + {% if pam |bool %}PrologFlags=contain{% endif %} InactiveLimit=0 KillWait=30 From dbc39b464c75e223c5940ab848184aa1cbd78776 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 17 Jan 2023 22:18:51 -0700 Subject: [PATCH 036/133] Missing newline in slurm.conf --- playbooks/roles/slurm/templates/slurm.conf.j2 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index ac7a3747..0695f731 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -20,7 +20,9 @@ SlurmdLogFile=/var/log/slurm/slurmd.log 
StateSaveLocation={{ slurm_nfs_path }}/spool/slurm SwitchType=switch/none TaskPlugin=task/affinity{% if pam |bool %},task/cgroup{% endif %} + {% if pam |bool %}PrologFlags=contain{% endif %} + InactiveLimit=0 KillWait=30 MinJobAge=300 From 9f185c943a0ba45e7826721ee76e23e525b23f62 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 18 Jan 2023 09:04:26 -0700 Subject: [PATCH 037/133] Add a log of the initial config on the bastion --- bastion.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bastion.tf b/bastion.tf index 25742782..1f9cab63 100644 --- a/bastion.tf +++ b/bastion.tf @@ -435,7 +435,7 @@ provisioner "file" { "chmod 755 /opt/oci-hpc/autoscaling/crontab/*.sh", "chmod 600 /opt/oci-hpc/autoscaling/credentials/key.pem", "echo ${var.configure} > /tmp/configure.conf", - "timeout 2h /opt/oci-hpc/bin/configure.sh", + "timeout 2h /opt/oci-hpc/bin/configure.sh | tee /opt/oci-hpc/logs/initial_configure.log", "exit_code=$?", "/opt/oci-hpc/bin/initial_monitoring.sh", "exit $exit_code" ] From 9144e9f4e036dcfc322bf8f086de0ddfe6ab7d44 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 18 Jan 2023 09:04:50 -0700 Subject: [PATCH 038/133] Change to PAM to only run on compute nodes as sudo --- playbooks/roles/slurm/tasks/common.yml | 6 +----- playbooks/roles/slurm/tasks/compute-rack-aware.yml | 4 ++++ playbooks/roles/slurm/tasks/compute.yml | 5 +++++ .../roles/slurm/tasks/{common_pam.yml => compute_pam.yml} | 1 + 4 files changed, 11 insertions(+), 5 deletions(-) rename playbooks/roles/slurm/tasks/{common_pam.yml => compute_pam.yml} (96%) diff --git a/playbooks/roles/slurm/tasks/common.yml b/playbooks/roles/slurm/tasks/common.yml index 1f62992a..4519a958 100755 --- a/playbooks/roles/slurm/tasks/common.yml +++ b/playbooks/roles/slurm/tasks/common.yml @@ -160,8 +160,4 @@ - name: Include pyxis prolog files include: common_pyxis.yml - when: pyxis|bool - -- name: Run Pam settings - include: common_pam.yml - when: pam|bool \ No newline at end of file + when: pyxis|bool \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/compute-rack-aware.yml b/playbooks/roles/slurm/tasks/compute-rack-aware.yml index 6afbca24..cae37224 100755 --- a/playbooks/roles/slurm/tasks/compute-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/compute-rack-aware.yml @@ -1,4 +1,8 @@ --- +- name: Run Pam settings + include: compute_pam.yml + when: pam|bool + - name: install SLURM compute packages vars: package_name: '{{ slurm_compute_packages }}' diff --git a/playbooks/roles/slurm/tasks/compute.yml b/playbooks/roles/slurm/tasks/compute.yml index 943886cc..f59be98f 100755 --- a/playbooks/roles/slurm/tasks/compute.yml +++ b/playbooks/roles/slurm/tasks/compute.yml @@ -1,4 +1,9 @@ --- + +- name: Run Pam settings + include: compute_pam.yml + when: pam|bool + - name: install SLURM compute packages vars: package_name: '{{ slurm_compute_packages }}' diff --git a/playbooks/roles/slurm/tasks/common_pam.yml b/playbooks/roles/slurm/tasks/compute_pam.yml similarity index 96% rename from playbooks/roles/slurm/tasks/common_pam.yml rename to playbooks/roles/slurm/tasks/compute_pam.yml index 70b88f36..0e4a29ff 100644 --- a/playbooks/roles/slurm/tasks/common_pam.yml +++ b/playbooks/roles/slurm/tasks/compute_pam.yml @@ -16,6 +16,7 @@ dest: /etc/pam.d/sshd - name: Stop logind + become: true systemd: name: systemd-logind state: stopped From f6d357f27f8f8485aace6212ad18842a84674b40 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 18 Jan 2023 11:36:11 -0700 Subject: [PATCH 039/133] Add login output --- 
 data.tf    | 7 +++++++
 locals.tf  | 1 +
 outputs.tf | 4 ++++
 3 files changed, 12 insertions(+)

diff --git a/data.tf b/data.tf
index 475d8b26..c90b6ced 100755
--- a/data.tf
+++ b/data.tf
@@ -70,4 +70,11 @@ data "oci_resourcemanager_private_endpoint_reachable_ip" "private_endpoint_reach
   count = (var.private_deployment && var.slurm_ha) ? 1 : 0
   private_endpoint_id = oci_resourcemanager_private_endpoint.rms_private_endpoint[0].id
   private_ip = tostring(oci_core_instance.backup[0].private_ip)
+}
+
+data "oci_resourcemanager_private_endpoint_reachable_ip" "private_endpoint_reachable_ip_login" {
+  #Required
+  count = (var.private_deployment && var.login_node) ? 1 : 0
+  private_endpoint_id = oci_resourcemanager_private_endpoint.rms_private_endpoint[0].id
+  private_ip = tostring(oci_core_instance.login[0].private_ip)
 }
\ No newline at end of file
diff --git a/locals.tf b/locals.tf
index 99ca6466..372e1649 100755
--- a/locals.tf
+++ b/locals.tf
@@ -57,6 +57,7 @@ locals {
   bastion_subnet = var.private_deployment ? oci_core_subnet.private-subnet : oci_core_subnet.private-subnet
   private_subnet_cidr = var.private_deployment ? [var.public_subnet, var.private_subnet] : [var.private_subnet]
   host_backup = var.slurm_ha ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_backup[0].ip_address : oci_core_instance.backup[0].public_ip : "none"
+  host_login = var.login_node ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_login[0].ip_address : oci_core_instance.login[0].public_ip : "none"
   timeout_per_batch= var.cluster_network ? 30 : 15
   timeout_ip = join("",[ (( var.node_count - ( var.node_count % 20 ) )/20 + 1 ) * local.timeout_per_batch,"m"])
diff --git a/outputs.tf b/outputs.tf
index 7f95f526..b11d640f 100755
--- a/outputs.tf
+++ b/outputs.tf
@@ -8,4 +8,8 @@ output "private_ips" {
 output "backup" {
   value = var.slurm_ha ? local.host_backup : "No Slurm Backup Defined"
+}
+
+output "login" {
+  value = var.login_node ?
local.host_login : "No Login Node Defined" } \ No newline at end of file From daab6abbd649b4a9c1a21cbb59b3eb92c96303ec Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 18 Jan 2023 11:36:31 -0700 Subject: [PATCH 040/133] Fix stack error code --- bastion.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bastion.tf b/bastion.tf index 1f9cab63..a6bf103d 100644 --- a/bastion.tf +++ b/bastion.tf @@ -436,7 +436,7 @@ provisioner "file" { "chmod 600 /opt/oci-hpc/autoscaling/credentials/key.pem", "echo ${var.configure} > /tmp/configure.conf", "timeout 2h /opt/oci-hpc/bin/configure.sh | tee /opt/oci-hpc/logs/initial_configure.log", - "exit_code=$?", + "exit_code=$${PIPESTATUS[0]}", "/opt/oci-hpc/bin/initial_monitoring.sh", "exit $exit_code" ] connection { From a4545ccf939af5b1a3f1add5b424faeeb75b587c Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 18 Jan 2023 11:37:22 -0700 Subject: [PATCH 041/133] Fix slurm login node error --- .../roles/slurm/tasks/compute-rack-aware.yml | 23 ++++++++++--------- playbooks/roles/slurm/tasks/compute.yml | 6 ++++- .../roles/slurm/tasks/destroy-rack-aware.yml | 4 ++-- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/playbooks/roles/slurm/tasks/compute-rack-aware.yml b/playbooks/roles/slurm/tasks/compute-rack-aware.yml index cae37224..49c96bcc 100755 --- a/playbooks/roles/slurm/tasks/compute-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/compute-rack-aware.yml @@ -1,14 +1,11 @@ --- -- name: Run Pam settings - include: compute_pam.yml - when: pam|bool - name: install SLURM compute packages - vars: + vars: package_name: '{{ slurm_compute_packages }}' package_repo: "{{ slurm_repos }}" disable_gpg_check_var: True - include_role: + include_role: name: safe_yum - name: Render systemd units for slurm, slurmdbd and munge @@ -88,7 +85,7 @@ register: rackID_script - name: Set RackID fact - set_fact: + set_fact: rackID: "{{ rackID_script.stdout[1:-1]}}" - name: Get nodes from Inactive Switch @@ -102,7 +99,7 @@ - name: Get rackIDs for all compute nodes set_fact: racks_to_add_temp: "{{cluster_name}}:{{hostvars[item]['rackID']}}" - with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) }}" + with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) | difference(groups['login'])}}" run_once: true register: racks_to_add_temp_results @@ -113,7 +110,7 @@ - name: Get hostnames set_fact: nodes_to_add_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) }}" + with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) | difference(groups['login']) }}" run_once: true register: nodes_to_add_temp_results @@ -140,7 +137,7 @@ - name: Get hostlist if switch exists vars: - new_line: "{% for node in ( play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" + new_line: "{% for node in ( play_hosts | difference(groups['bastion']) | difference(groups['(groups['login'])']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" command: "scontrol show hostlistsorted {{ item.stdout_lines | union (new_line[:-1].split(',') | list )| join(',') }}" register: rack_hostlist1 delegate_to: 127.0.0.1 @@ -150,7 +147,7 @@ 
- name: Get hostlist if switch does not exists vars: - new_line: "{% for node in ( play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" + new_line: "{% for node in ( play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) | difference(groups['login']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" command: "scontrol show hostlistsorted {{ new_line[:-1] }}" register: rack_hostlist2 delegate_to: 127.0.0.1 @@ -173,7 +170,7 @@ run_once: true delegate_to: 127.0.0.1 when: item.item.item.rc > 0 - + - name: Add the nodes in the rack switches become: true lineinfile: @@ -238,6 +235,10 @@ delegate_to: 127.0.0.1 when: racks_left_list | length > 0 +- name: Run Pam settings + include: compute_pam.yml + when: pam|bool + - name: run handlers meta: flush_handlers diff --git a/playbooks/roles/slurm/tasks/compute.yml b/playbooks/roles/slurm/tasks/compute.yml index f59be98f..49febd49 100755 --- a/playbooks/roles/slurm/tasks/compute.yml +++ b/playbooks/roles/slurm/tasks/compute.yml @@ -87,7 +87,7 @@ - name: Get hostnames set_fact: nodes_to_add_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) }}" + with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) | difference(groups['login']) }}" run_once: true register: nodes_to_add_temp_results @@ -162,6 +162,10 @@ delegate_to: 127.0.0.1 notify: reconfigure slurm +- name: Run Pam settings + include: compute_pam.yml + when: pam|bool + - name: start slurmd become: true service: diff --git a/playbooks/roles/slurm/tasks/destroy-rack-aware.yml b/playbooks/roles/slurm/tasks/destroy-rack-aware.yml index 68dc62a3..6c53f912 100755 --- a/playbooks/roles/slurm/tasks/destroy-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/destroy-rack-aware.yml @@ -49,7 +49,7 @@ - name: Get hostnames set_fact: nodes_to_remove_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) }}" + with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) | difference(groups['login'])}}" run_once: true register: nodes_to_remove_temp_results @@ -69,7 +69,7 @@ - name: Get rackIDs set_fact: racks_to_remove_temp: "{{cluster_name}}:{{hostvars[item]['rackID']}}" - with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) }}" + with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) | difference(groups['login'])}}" run_once: true register: racks_to_remove_temp_results From a433d0f0365fa9c0956ffdd79084f61749f27238 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Wed, 18 Jan 2023 15:09:13 -0800 Subject: [PATCH 042/133] added alias for validation script, name change, updated README --- README.md | 40 +++++++++++++++++ bin/{num_nodes_same.py => validation.py} | 57 +++++++++++++----------- playbooks/roles/slurm/tasks/server.yml | 6 +++ 3 files changed, 77 insertions(+), 26 deletions(-) rename bin/{num_nodes_same.py => validation.py} (93%) diff --git a/README.md b/README.md index 92eff436..8550f0bb 100644 --- a/README.md +++ b/README.md @@ -318,3 +318,43 @@ $ max_nodes --> Information about all the partitions and 
their respective clusters and maximum number of nodes distributed evenly per partition

 $ max_nodes --include_cluster_names xxx yyy zzz --> where xxx, yyy, zzz are cluster names. Provide a space separated list of cluster names to be considered for displaying the information about clusters and maximum number of nodes distributed evenly per partition
+
+## validation.py usage
+
+Use the alias "validate" to run the python script validation.py. You can run this script only from bastion.
+
+The script performs these checks.
+-> Check the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, inventory files. Also check whether md5 sum of /etc/hosts file on all nodes matches that on bastion
+-> PCIe bandwidth check
+-> GPU Throttle check
+-> Standalone /etc/hosts md5 sum validation
+
+Provide at least one argument: [-n NUM_NODES] [-p PCIE] [-g GPU_THROTTLE] [-e ETC_HOSTS]
+
+Optional argument with [-n NUM_NODES] [-p PCIE] [-g GPU_THROTTLE]: [-cn CLUSTER_NAMES]
+Provide a file that lists each cluster on a separate line for which you want to validate the number of nodes and/or pcie check and/or gpu throttle check.
+
+Below are some examples for running this script.
+
+validate -n y --> This will validate that the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, inventory files. It will also check whether md5 sum of /etc/hosts file on all nodes matches that on bastion. The clusters considered will be the default cluster if any and cluster(s) found in /opt/oci-hpc/autoscaling/clusters directory. The number of nodes considered will be from the resize script using the clusters we got before.
+
+validate -n y -cn --> This will validate that the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, inventory files. It will also check whether md5 sum of /etc/hosts file on all nodes matches that on bastion. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file.
+
+validate -p y --> This will run the pcie bandwidth check. The clusters considered will be the default cluster if any and cluster(s) found in /opt/oci-hpc/autoscaling/clusters directory. The number of nodes considered will be from the resize script using the clusters we got before.
+
+validate -p y -cn --> This will run the pcie bandwidth check. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file.
+
+validate -p --> This will run the pcie bandwidth check on the hosts provided in the file given. The pcie host file should have a host name on each line.
+
+validate -g y --> This will run the GPU throttle check. The clusters considered will be the default cluster if any and cluster(s) found in /opt/oci-hpc/autoscaling/clusters directory. The number of nodes considered will be from the resize script using the clusters we got before.
+
+validate -g y -cn --> This will run the GPU throttle check. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file.
+
+validate -g --> This will run the GPU throttle check on the hosts provided in the file given. The gpu check host file should have a host name on each line.
+ +You can combine all the options together such as: +validate -n y -p y -g y -cn + +If you only want to run the md5 sum check (matches that of bastion) on some hosts, use the -e option. Example: +validate -e --> Check whether md5 sum of /etc/hosts file matches that on bastion. The hosts considered will be from the file provided. The host file should have a host name on each line. + diff --git a/bin/num_nodes_same.py b/bin/validation.py similarity index 93% rename from bin/num_nodes_same.py rename to bin/validation.py index 12f2de3e..e7bd8302 100644 --- a/bin/num_nodes_same.py +++ b/bin/validation.py @@ -44,8 +44,6 @@ def run_cmd(cmd=None): stderr=subprocess.STDOUT, check=True, encoding='utf8') output = results.stdout.splitlines() except subprocess.CalledProcessError as e_process_error: - # print(f"!!! Error in running command [ {cmd} ]. Fatal error exiting!!!") - # print(f"Error code: {e_process_error.returncode} Output: {e_process_error.output}") return (9000, f"Error code: {e_process_error.returncode} Output: {e_process_error.output}") return output @@ -153,7 +151,7 @@ def getResizeClusterNames(filepath): # this is the source of truth for total number of nodes in a cluster -def getResizeNodes(metadata, cluster_names, mode): +def getResizeNodes(args, metadata, cluster_names, mode): if mode == 1 or mode == 2: resize_cluster_node_dict = {} str = "ocid1.instance." @@ -169,7 +167,7 @@ def getResizeNodes(metadata, cluster_names, mode): cluster_node_set.add(split_str[0].replace('"','')) if len(cluster_node_set) > 0: resize_cluster_node_dict.update({cluster: cluster_node_set}) - if mode == 2: + if mode == 2 or (mode == 1 and args.cluster_names is None): out = subprocess.Popen(["/opt/oci-hpc/bin/resize.sh list"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) stdout,stderr = out.communicate() x = stdout.split("\n") @@ -238,12 +236,12 @@ def slurmGetNodes(resize_cluster_names, all_node_cluster_dict, path): node_name = split_str[2].replace('"','') proper_node_name = getConsoleNodeName(node_name) if proper_node_name is not None: - if node_state not in good_node_states: - warning_node_dict.update({proper_node_name: node_state}) if proper_node_name in all_node_cluster_dict: slurm_node_cluster = all_node_cluster_dict[proper_node_name] if slurm_node_cluster in resize_cluster_names: slurm_node_cluster_dict.update({proper_node_name: slurm_node_cluster}) + if node_state not in good_node_states: + warning_node_dict.update({proper_node_name: node_state}) else: if path is None: path = createDir() @@ -427,7 +425,7 @@ def getResizeCluster(args, metadata): resize_cluster_node_dict = {} resize_node_cluster_dict = {} resize_cluster_names = getResizeClusterNames(args.cluster_names) - resize_cluster_names, resize_cluster_node_dict = getResizeNodes(metadata, resize_cluster_names, 1) + resize_cluster_names, resize_cluster_node_dict = getResizeNodes(args, metadata, resize_cluster_names, 1) if len(resize_cluster_names) == 0 or len(resize_cluster_node_dict) == 0: print("There are no clusters available") @@ -440,25 +438,32 @@ def getResizeCluster(args, metadata): ############### -parser = argparse.ArgumentParser(description = 'Perform these checks. \ - 1. Check the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, inventory files. \ - 2. PCIe bandwidth check. \ - 3. GPU Throttle check \ - 4. 
Standalone /etc/hosts md5 sum validation \ - Options: \ - --cluster_names : Give a file that contains all the cluster names for option 1 and this will be considered as source of truth. \ - If not given, then the cluster names in the directory /opt/oci-hpc/autoscaling/clusters/ along with any permanent cluster associated \ - with the bastion will be considered as source of truth. ') -parser.add_argument('-n', '--num_nodes', help = "Check the number of nodes is consistent across resize.sh, /etc/hosts, slurm, topology.conf, OCI console, inventory files. \ - Also check /etc/hosts is same as bastion across all hosts. If -cn option is provided along with this, then that file will be considered. If not, nodes \ - resize will be considered. ") -parser.add_argument('-cn', '--cluster_names', help = "Provide a file that contains list of all cluster names for the above validation") -parser.add_argument('-p', '--pcie_file', help = "Provide a file that contains list of hosts on which to perform pcie check") -parser.add_argument('-g', '--gpu_throttle', help = "Provide a file that contains list of hosts on which to perform gpu throttle check") +parser = argparse.ArgumentParser(description = 'Performs these checks. \ +-> Check the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, \ + inventory files. Also check whether md5 sum of /etc/hosts file on all nodes matches that on bastion. \ +-> PCIe bandwidth check \ +-> GPU Throttle check \ +-> Standalone /etc/hosts md5 sum validation \ + Provide at least one argument: [-n NUM_NODES] [-p PCIE] [-g GPU_THROTTLE] [-e ETC_HOSTS] \ +Optional argument with [-n NUM_NODES] [-p PCIE] [-g GPU_THROTTLE]: [-cn CLUSTER_NAMES] --> \ +Provide a file that lists each cluster on a separate line for which you want to validate the \ + number of nodes and/or pcie check and/or gpu throttle check. ') + +parser.add_argument('-n', '--num_nodes', help = "Check the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, \ + inventory files. 
Also check whether md5 sum of /etc/hosts file on all nodes matches that on bastion.") +parser.add_argument('-cn', '--cluster_names', help = "Provide a file that lists each cluster on a separate line for which you want to validate the \ + number of nodes and/or pcie check and/or gpu throttle check.") +parser.add_argument('-p', '--pcie', help = "Runs PCIe bandwidth check") +parser.add_argument('-g', '--gpu_throttle', help = "Performs GPU throttle check") parser.add_argument('-e', '--etc_hosts', help = "Provide a file that contains list of hosts on which to perform md5 sum check to match with bastion") args = parser.parse_args() +args_vars = vars(args) +if not any(args_vars.values()): + parser.error('No arguments provided') + exit() + metadata=get_metadata() path = None @@ -478,7 +483,7 @@ def getResizeCluster(args, metadata): all_cluster_node_dict = {} all_node_cluster_dict = {} all_cluster_names = getResizeClusterNames(None) - all_cluster_names, all_cluster_node_dict = getResizeNodes(metadata, all_cluster_names, 2) + all_cluster_names, all_cluster_node_dict = getResizeNodes(args, metadata, all_cluster_names, 2) if len(all_cluster_names) == 0 or len(all_cluster_node_dict) == 0: print("There are no clusters available") else: @@ -621,8 +626,8 @@ def getResizeCluster(args, metadata): etcHostsSame(nodes_comma, path) hostFileWritten = False -if args.pcie_file is not None: - if args.pcie_file == 'y' or args.pcie_file == 'Y': +if args.pcie is not None: + if args.pcie == 'y' or args.pcie == 'Y': if args.cluster_names is not None: if len(resize_node_cluster_dict) == 0: resize_cluster_names, resize_cluster_node_dict, resize_node_cluster_dict = getResizeCluster(args, metadata) @@ -641,7 +646,7 @@ def getResizeCluster(args, metadata): else: print("Provide cluster_names file or hosts file to run pcie check") else: - pcie_hostfile = args.pcie_file + pcie_hostfile = args.pcie if path is None: path = createDir() changeOwner(path) diff --git a/playbooks/roles/slurm/tasks/server.yml b/playbooks/roles/slurm/tasks/server.yml index 2c751f6c..33678c6a 100755 --- a/playbooks/roles/slurm/tasks/server.yml +++ b/playbooks/roles/slurm/tasks/server.yml @@ -166,6 +166,12 @@ line: alias max_nodes="python3 /opt/oci-hpc/bin/max_nodes_partition.py" state: present +- name: add alias for validation of number of nodes, pcie, and gpu throttle check + lineinfile: + path: '/home/{{ ansible_user }}/.bashrc' + line: alias validate="python3 /opt/oci-hpc/bin/validation.py" + state: present + - name: Generate gres.conf become: true template: From 3a1beddb3556c7910683147ca0e3ddbe9ec52e3f Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 18 Jan 2023 18:07:16 -0700 Subject: [PATCH 043/133] Fix marketplace image issue on login node --- locals.tf | 2 +- marketplace.tf | 10 ++++---- schema.yaml | 67 +++++++++++++++++++++++++++++++++++++++++++++++++- variables.tf | 12 +++++++++ 4 files changed, 84 insertions(+), 7 deletions(-) diff --git a/locals.tf b/locals.tf index 372e1649..286cd62d 100755 --- a/locals.tf +++ b/locals.tf @@ -26,7 +26,7 @@ locals { bastion_image = var.use_standard_image ? oci_core_app_catalog_subscription.bastion_mp_image_subscription[0].listing_resource_id : local.custom_bastion_image_ocid - login_image = var.use_standard_image_login ? oci_core_app_catalog_subscription.login_mp_image_subscription[0].listing_resource_id : local.custom_login_image_ocid + login_image = var.use_standard_image_login || var.use_marketplace_image_login ? 
oci_core_app_catalog_subscription.login_mp_image_subscription[0].listing_resource_id : local.custom_login_image_ocid cluster_network_image = var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid diff --git a/marketplace.tf b/marketplace.tf index ddc5140f..0c9d6bc7 100755 --- a/marketplace.tf +++ b/marketplace.tf @@ -2,10 +2,10 @@ locals { // listing_number = split(".", var.marketplace_listing)[0] mp_listing_id = var.use_marketplace_image ? var.use_old_marketplace_image ? var.old_marketplace_listing_id : substr(var.marketplace_listing,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" mp_bastion_listing_id = var.use_standard_image ? var.use_old_marketplace_image ? var.old_marketplace_listing_id :var.marketplace_listing_id_HPC : "" - mp_login_listing_id = var.use_standard_image_login ? var.use_old_marketplace_image ? var.old_marketplace_listing_id :var.marketplace_listing_id_HPC : "" + mp_login_listing_id = var.use_marketplace_image_login ? var.use_old_marketplace_image_login ? var.old_marketplace_listing_id : substr(var.marketplace_listing_login,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" mp_version_id = var.use_old_marketplace_image ? var.marketplace_version_id[split(".", var.old_marketplace_listing)[0]] : var.marketplace_version_id[var.marketplace_listing] mp_bastion_version_id = var.use_old_marketplace_image ? var.marketplace_version_id[split(".", var.old_marketplace_listing)[0]] : var.marketplace_version_id["HPC_OL7"] - mp_login_version_id = var.use_old_marketplace_image ? var.marketplace_version_id[split(".", var.old_marketplace_listing)[0]] : var.marketplace_version_id["HPC_OL7"] + mp_login_version_id = var.use_old_marketplace_image_login ? var.marketplace_version_id[split(".", var.old_marketplace_listing_login)[0]] : var.marketplace_version_id[var.marketplace_listing_login] } /* @@ -77,12 +77,12 @@ resource "oci_core_app_catalog_subscription" "bastion_mp_image_subscription" { } data "oci_core_app_catalog_listing_resource_versions" "login_app_catalog_listing_resource_versions" { - count = var.use_standard_image_login ? 1 : 0 + count = var.use_marketplace_image_login || var.use_standard_image_login ? 1 : 0 listing_id = local.mp_login_listing_id } resource "oci_core_app_catalog_listing_resource_version_agreement" "login_mp_image_agreement" { - count = var.use_standard_image_login ? 1 : 0 + count = var.use_marketplace_image_login || var.use_standard_image_login ? 1 : 0 listing_id = local.mp_login_listing_id listing_resource_version = local.mp_login_version_id @@ -90,7 +90,7 @@ resource "oci_core_app_catalog_listing_resource_version_agreement" "login_mp_ima } resource "oci_core_app_catalog_subscription" "login_mp_image_subscription" { - count = var.use_standard_image_login ? 1 : 0 + count = var.use_marketplace_image_login || var.use_standard_image_login ? 
1 : 0 compartment_id = var.targetCompartment eula_link = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].eula_link listing_id = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].listing_id diff --git a/schema.yaml b/schema.yaml index 5947c9f7..e9541b27 100755 --- a/schema.yaml +++ b/schema.yaml @@ -67,6 +67,10 @@ variableGroups: - ${login_memory} - ${login_boot_volume_size} - ${use_standard_image_login} + - ${use_marketplace_image_login} + - ${use_old_marketplace_image_login} + - ${marketplace_listing_login} + - ${old_marketplace_listing_login} - ${unsupported_login} - ${login_image_compartment} - ${custom_login_image} @@ -1362,6 +1366,8 @@ variables: - ${use_standard_image_login} - not: - ${unsupported_login} + - not: + - ${use_marketplace_image_login} required: true custom_login_image: @@ -1376,6 +1382,8 @@ variables: - ${use_standard_image_login} - not: - ${unsupported_login} + - not: + - ${use_marketplace_image_login} required: true unsupported_login_image: title: "Image OCID" @@ -1387,6 +1395,8 @@ variables: - ${unsupported_login} - not: - ${use_standard_image_login} + - not: + - ${use_marketplace_image_login} default: "image.ocid" login_username: @@ -1397,4 +1407,59 @@ variables: required: true visible: not: - - ${use_standard_image_login} \ No newline at end of file + - ${use_standard_image_login} + + use_marketplace_image_login: + type: boolean + title: "use marketplace image" + description: "Use marketplace image, otherwise provide custom image OCID" + default: true + visible: + not: + - ${use_standard_image_login} + use_old_marketplace_image_login: + type: boolean + title: "use older marketplace images" + description: "Images prior to September 2021" + default: false + visible: + and: + - ${use_marketplace_image_login} + - not: + - ${use_standard_image_login} + + marketplace_listing_login: + type: enum + title: "Image version" + description: "Marketplace listing to use" + required: true + enum: + - "HPC_OL7" + - "HPC_OL8" + - "GPU" + default: "HPC_OL7" + visible: + and: + - ${use_marketplace_image_login} + - not: + - ${use_old_marketplace_image_login} + - not: + - ${use_standard_image_login} + + old_marketplace_listing_login: + type: enum + title: "Image version" + description: "Marketplace listing to use" + required: true + enum: + - "1. Oracle Linux 7.9 OFED 5.3-1.0.0.1 RHCK 20210607" + - "2. Oracle Linux 7.8 OFED 5.0-1.0.0.0 UEK 20200826" + - "3. Oracle Linux 7.7 OFED 4.4-2.0.7.0 UEK 20200229" + - "4. Oracle Linux 7.9 OFED 5.0-2.1.8.0 RHCK 20210709" + default: "4. Oracle Linux 7.9 OFED 5.0-2.1.8.0 RHCK 20210709" + visible: + and: + - ${use_marketplace_image_login} + - ${use_old_marketplace_image_login} + - not: + - ${use_standard_image_login} \ No newline at end of file diff --git a/variables.tf b/variables.tf index b4ce11ef..044cfa0b 100755 --- a/variables.tf +++ b/variables.tf @@ -231,3 +231,15 @@ variable "private_deployment" { default = false } variable "localdisk" { default = true } + +variable "use_marketplace_image_login" { default = true} +variable "use_old_marketplace_image_login" { default = false} + +variable "marketplace_listing_login" { + default = "HPC_OL7" +} + +variable "old_marketplace_listing_login" { + default = "4. 
Oracle Linux 7.9 OFED 5.0-2.1.8.0 RHCK 20210709" +} + \ No newline at end of file From ce6d1d45fd0750e8ceff4c8d63a947ee91c51cb1 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Wed, 18 Jan 2023 18:21:14 -0800 Subject: [PATCH 044/133] fixed the wrong variable when writing to output file for topology --- bin/validation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/validation.py b/bin/validation.py index e7bd8302..82332e27 100644 --- a/bin/validation.py +++ b/bin/validation.py @@ -216,6 +216,8 @@ def getConsoleNodeName(slurm_node_name): stdout,stderr = out.communicate() node_name_output = stdout.split("\n") del node_name_output[-1] + if len(node_name_output) == 0: + return None return node_name_output[0] @@ -321,7 +323,7 @@ def topologyGetNodes(resize_cluster_names, all_node_cluster_dict, path): path = createDir() changeOwner(path) f = open(path+"/topoNumNodes.txt", "a") - f.write(node_name + " not found in /etc/hosts file for getting the oci console name" + "\n") + f.write(n + " not found in /etc/hosts file for getting the oci console name" + "\n") f.close() return topo_node_cluster_dict, path From d83544dde3729a5928ab38a57dedb1a8d8145e87 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 19 Jan 2023 21:39:23 -0700 Subject: [PATCH 045/133] Fix resize in case of multiple vnics --- bin/resize.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/resize.py b/bin/resize.py index b21ca6de..c88deec4 100644 --- a/bin/resize.py +++ b/bin/resize.py @@ -49,7 +49,9 @@ def get_instances(comp_ocid,cn_ocid,CN): for instance_summary in instance_summaries: try: instance=computeClient.get_instance(instance_summary.id).data - vnic_attachment = oci.pagination.list_call_get_all_results(computeClient.list_vnic_attachments,compartment_id=comp_ocid,instance_id=instance.id).data[0] + for potential_vnic_attachment in oci.pagination.list_call_get_all_results(computeClient.list_vnic_attachments,compartment_id=comp_ocid,instance_id=instance.id).data: + if potential_vnic_attachment.display_name is None: + vnic_attachment = potential_vnic_attachment vnic = virtualNetworkClient.get_vnic(vnic_attachment.vnic_id).data except: continue From 552aee24658af40def7a0219c125a69269c5978d Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 20 Jan 2023 15:34:35 -0800 Subject: [PATCH 046/133] return path to output folder when running md5 sum standalone --- bin/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/validation.py b/bin/validation.py index 82332e27..2fc1669f 100644 --- a/bin/validation.py +++ b/bin/validation.py @@ -625,7 +625,7 @@ def getResizeCluster(args, metadata): x = stdout.split("\n") del x[-1] nodes_comma = ','.join(str(s) for s in x) - etcHostsSame(nodes_comma, path) + path = etcHostsSame(nodes_comma, path) hostFileWritten = False if args.pcie is not None: From 2bedd2eb061d9eee2d371316f2e484c0530cb02b Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 23 Jan 2023 16:52:33 -0700 Subject: [PATCH 047/133] Add default variables when no login node --- schema.yaml | 4 ++-- variables.tf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/schema.yaml b/schema.yaml index e9541b27..87474a31 100755 --- a/schema.yaml +++ b/schema.yaml @@ -1196,7 +1196,7 @@ variables: login_node: type: boolean title: "Login Node" - default: true + default: false description: "Create an additional login node for users" login_ad: @@ -1210,7 +1210,7 @@ variables: required: true description: "Availability Domain for login node" 
title: "Availability Domain For Login Node" - #default: ${ad} + default: ${ad} login_shape: type: oci:core:instanceshape:name diff --git a/variables.tf b/variables.tf index 044cfa0b..ddf41a63 100755 --- a/variables.tf +++ b/variables.tf @@ -50,9 +50,9 @@ variable "ssh_cidr" { default = "0.0.0.0/0" } variable "slurm" { default = false } variable "slurm_ha" { default = false } variable "login_node" { default = false } -variable "login_ad" {} +variable "login_ad" {default = ""} variable "login_shape" { default = "VM.Standard2.4" } -variable "login_boot_volume_size" {} +variable "login_boot_volume_size" {default = 50} variable "slurm_nfs" { default = false } variable "rack_aware" { default = false } variable "ldap" { default = true } From 5b0b72ba9b2d47bcfca40b02f470085ec80a10c0 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 23 Jan 2023 21:55:12 -0700 Subject: [PATCH 048/133] Add constrain Devices to cgroup --- playbooks/roles/slurm/files/cgroup.conf | 3 ++- playbooks/roles/slurm/templates/slurm.conf.j2 | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/playbooks/roles/slurm/files/cgroup.conf b/playbooks/roles/slurm/files/cgroup.conf index 804efb72..d7035a99 100755 --- a/playbooks/roles/slurm/files/cgroup.conf +++ b/playbooks/roles/slurm/files/cgroup.conf @@ -1 +1,2 @@ -CgroupAutomount=yes \ No newline at end of file +CgroupAutomount=yes +ConstrainDevices=yes \ No newline at end of file diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index 0695f731..e8e13ea3 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -19,10 +19,8 @@ SlurmctldLogFile=/var/log/slurm/slurmctld.log SlurmdLogFile=/var/log/slurm/slurmd.log StateSaveLocation={{ slurm_nfs_path }}/spool/slurm SwitchType=switch/none -TaskPlugin=task/affinity{% if pam |bool %},task/cgroup{% endif %} - -{% if pam |bool %}PrologFlags=contain{% endif %} - +TaskPlugin=task/affinity,task/cgroup +PrologFlags=contain InactiveLimit=0 KillWait=30 MinJobAge=300 From 583adbf62c4d5f6a950c662a0f43973641be9127 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 24 Jan 2023 16:13:58 -0700 Subject: [PATCH 049/133] Fix slurm when no /nfs/cluster is defined --- playbooks/roles/slurm/tasks/common.yml | 26 +++++++++++++++++++++++--- playbooks/site.yml | 2 +- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/playbooks/roles/slurm/tasks/common.yml b/playbooks/roles/slurm/tasks/common.yml index 4519a958..eb33decf 100755 --- a/playbooks/roles/slurm/tasks/common.yml +++ b/playbooks/roles/slurm/tasks/common.yml @@ -41,18 +41,38 @@ include_role: name: safe_yum +- name: Create Slurm RPM directory + file: + path: "{{ download_path }}/slurm_rpms" + state: directory + when: download_path == '/tmp' + - name: Create Slurm RPM directory file: path: "{{ download_path }}/slurm_rpms" state: directory delegate_to: 127.0.0.1 run_once: true + when: download_path != '/tmp' - name: Download slurm .deb get_url: url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/slurm-{{slurm_version}}_amd64.deb" dest: "{{ download_path }}/slurm_rpms" - when: ansible_os_family == 'Debian' + when: ansible_os_family == 'Debian' and download_path == '/tmp' + +- name: Download slurm .rpm + get_url: + url: 
"https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/{{ item }}" + dest: "{{ download_path }}/slurm_rpms" + with_items: "{{slurm_all_packages}}" + when: ansible_os_family == 'RedHat' and download_path == '/tmp' + +- name: Download slurm .deb + get_url: + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/slurm-{{slurm_version}}_amd64.deb" + dest: "{{ download_path }}/slurm_rpms" + when: ansible_os_family == 'Debian' and download_path != '/tmp' delegate_to: 127.0.0.1 run_once: true @@ -61,9 +81,9 @@ url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/{{ item }}" dest: "{{ download_path }}/slurm_rpms" with_items: "{{slurm_all_packages}}" - when: ansible_os_family == 'RedHat' + when: ansible_os_family == 'RedHat' and download_path != '/tmp' delegate_to: 127.0.0.1 - run_once: true + run_once: true - name: Install .deb become: true diff --git a/playbooks/site.yml b/playbooks/site.yml index 976c047b..c7a02ed1 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -296,7 +296,7 @@ vars: destroy: false initial: true - download_path: "{{ cluster_nfs_path if cluster_nfs|bool else '/tmp' }}" + download_path: "{{ cluster_nfs_path if cluster_nfs|bool else (nfs_target_path if create_fss | bool else '/tmp') }}" resize: false enroot_top_path: "{{ nvme_path }}/enroot/" vars_files: From 1d283e2fb6e2d6cf7090b4e7b45a3f68424acc65 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 24 Jan 2023 21:43:01 -0700 Subject: [PATCH 050/133] Fix comment on additional block volume on login --- schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema.yaml b/schema.yaml index 87474a31..a0061a68 100755 --- a/schema.yaml +++ b/schema.yaml @@ -1311,7 +1311,7 @@ variables: login_block: type: boolean - title: Additional block volume for shared space + title: Additional block volume for login node default: false visible: ${login_node} From 2295638fd3241baf53142a152ced146f15d072fd Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 24 Jan 2023 21:43:17 -0700 Subject: [PATCH 051/133] Fix doc on cluster user add name --nossh --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 92eff436..fd8c2029 100644 --- a/README.md +++ b/README.md @@ -293,8 +293,8 @@ Example of cluster command to add a new user: ```cluster user add name``` By default, a `privilege` group is created that has access to the NFS and can have sudo access on all nodes (Defined at the stack creation. This group has ID 9876) The group name can be modified. ```cluster user add name --gid 9876``` -To generate a user-specific key for passwordless ssh between nodes, use --ssh. -```cluster user add name --ssh --gid 9876``` +To avoid generating a user-specific key for passwordless ssh between nodes, use --nossh. 
+```cluster user add name --nossh --gid 9876``` # Shared home folder From a5c9a969f225eca9e7183ed11ab3c32470ddc59d Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 24 Jan 2023 23:03:27 -0700 Subject: [PATCH 052/133] Make the Slurm RPM creation as sudo --- playbooks/roles/slurm/tasks/common.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/playbooks/roles/slurm/tasks/common.yml b/playbooks/roles/slurm/tasks/common.yml index eb33decf..32dd8ee9 100755 --- a/playbooks/roles/slurm/tasks/common.yml +++ b/playbooks/roles/slurm/tasks/common.yml @@ -42,15 +42,21 @@ name: safe_yum - name: Create Slurm RPM directory + become: true file: path: "{{ download_path }}/slurm_rpms" state: directory + owner: opc + group: opc when: download_path == '/tmp' - name: Create Slurm RPM directory + become: true file: path: "{{ download_path }}/slurm_rpms" state: directory + owner: opc + group: opc delegate_to: 127.0.0.1 run_once: true when: download_path != '/tmp' From 4d34f40b2a2cd70eb09eb56c85509ce91b97e721 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 24 Jan 2023 23:03:50 -0700 Subject: [PATCH 053/133] Change download path for Slurm in case of FSS --- playbooks/new_nodes.yml | 2 +- playbooks/resize_add.yml | 2 +- playbooks/site.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index f7f7e62a..d24c173b 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -177,7 +177,7 @@ vars: destroy: false initial: false - download_path: "{{ cluster_nfs_path if cluster_nfs|bool else '/tmp' }}" + download_path: "{{ nfs_target_path if create_fss | bool else ( cluster_nfs_path if cluster_nfs|bool else '/tmp') }}" enroot_top_path: "{{ nvme_path }}/enroot/" vars_files: - "/opt/oci-hpc/conf/queues.conf" diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 4e01b939..296d017c 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -179,7 +179,7 @@ vars: destroy: false initial: false - download_path: "{{ cluster_nfs_path if cluster_nfs|bool else '/tmp' }}" + download_path: "{{ nfs_target_path if create_fss | bool else ( cluster_nfs_path if cluster_nfs|bool else '/tmp') }}" enroot_top_path: "{{ nvme_path }}/enroot/" vars_files: - "/opt/oci-hpc/conf/queues.conf" diff --git a/playbooks/site.yml b/playbooks/site.yml index c7a02ed1..745e3957 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -296,7 +296,7 @@ vars: destroy: false initial: true - download_path: "{{ cluster_nfs_path if cluster_nfs|bool else (nfs_target_path if create_fss | bool else '/tmp') }}" + download_path: "{{ nfs_target_path if create_fss | bool else ( cluster_nfs_path if cluster_nfs|bool else '/tmp') }}" resize: false enroot_top_path: "{{ nvme_path }}/enroot/" vars_files: From 38c1b8900b5ae99250c035bef1dfcffe428dbbf6 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 24 Jan 2023 23:19:07 -0700 Subject: [PATCH 054/133] Add description of use_advanced --- schema.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/schema.yaml b/schema.yaml index a0061a68..7a071eaf 100755 --- a/schema.yaml +++ b/schema.yaml @@ -715,6 +715,8 @@ variables: type: boolean title: "Show advanced storage options" default: false + description: "Including running home on FSS." 
+ visible: true use_scratch_nfs: type: boolean From 8f2677b5d39cdbe57e0f844b4bb6b14c24c6523d Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 24 Jan 2023 23:19:43 -0700 Subject: [PATCH 055/133] Make sure home_nfs is run also when using FSS --- playbooks/site.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/site.yml b/playbooks/site.yml index 745e3957..57d6d543 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -125,7 +125,7 @@ tasks: - include_role: name: home_nfs - when: home_nfs|default(true)|bool + when: home_nfs|default(true)|bool or home_fss|bool - include_role: name: nfs-client vars: From 078e2ba6c52946d7fa930a783a9f9e21c457479d Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 25 Jan 2023 17:18:49 -0700 Subject: [PATCH 056/133] Adding VM.DenseIO.E4. flex to the supported shapes --- autoscaling/tf_init/bastion_update.tf | 2 +- autoscaling/tf_init/locals.tf | 5 +- bastion.tf | 6 +- locals.tf | 11 +++- playbooks/roles/slurm/templates/slurm.conf.j2 | 4 ++ schema.yaml | 56 +++++++++++++++++++ slurm_ha.tf | 4 +- variables.tf | 3 + 8 files changed, 81 insertions(+), 10 deletions(-) diff --git a/autoscaling/tf_init/bastion_update.tf b/autoscaling/tf_init/bastion_update.tf index a4cfc58c..b574bac4 100755 --- a/autoscaling/tf_init/bastion_update.tf +++ b/autoscaling/tf_init/bastion_update.tf @@ -55,7 +55,7 @@ resource "local_file" "inventory" { cluster_mount_ip = local.mount_ip, cluster_name = local.cluster_name, shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape, - instance_pool_ocpus=var.instance_pool_ocpus, + instance_pool_ocpus=local.instance_pool_ocpus, queue=var.queue, instance_type=var.instance_type, autoscaling_monitoring = var.autoscaling_monitoring, diff --git a/autoscaling/tf_init/locals.tf b/autoscaling/tf_init/locals.tf index f4416844..283f3245 100755 --- a/autoscaling/tf_init/locals.tf +++ b/autoscaling/tf_init/locals.tf @@ -3,6 +3,9 @@ locals { cluster_instances_ids = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.id : data.oci_core_instance.instance_pool_instances.*.id cluster_instances_names = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.display_name : data.oci_core_instance.instance_pool_instances.*.display_name image_ocid = var.unsupported ? var.image_ocid : var.image + + shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape + instance_pool_ocpus = local.shape == "VM.DenseIO.E4.Flex" ? var.instance_pool_ocpus_denseIO_flex : var.instance_pool_ocpus // ips of the instances cluster_instances_ips = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip @@ -20,7 +23,7 @@ locals { // image = (var.cluster_network && var.use_marketplace_image == true) || (var.cluster_network == false && var.use_marketplace_image == false) ? var.image : data.oci_core_images.linux.images.0.id // is_bastion_flex_shape = length(regexall(".*VM.*.*Flex$", var.bastion_shape)) > 0 ? [var.bastion_ocpus]:[] - is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [var.instance_pool_ocpus]:[] + is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [local.instance_pool_ocpus]:[] // bastion_mount_ip = var.bastion_block ? 
element(concat(oci_core_volume_attachment.bastion_volume_attachment.*.ipv4, [""]), 0) : "none" diff --git a/bastion.tf b/bastion.tf index a6bf103d..9442aa21 100644 --- a/bastion.tf +++ b/bastion.tf @@ -234,8 +234,8 @@ resource "null_resource" "cluster" { cluster_mount_ip = local.mount_ip, autoscaling = var.autoscaling, cluster_name = local.cluster_name, - shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape, - instance_pool_ocpus = var.instance_pool_ocpus, + shape = local.shape, + instance_pool_ocpus = local.instance_pool_ocpus, queue=var.queue, monitoring = var.monitoring, hyperthreading = var.hyperthreading, @@ -311,7 +311,7 @@ resource "null_resource" "cluster" { private_subnet = data.oci_core_subnet.private_subnet.cidr_block, private_subnet_id = local.subnet_id, targetCompartment = var.targetCompartment, - instance_pool_ocpus = var.instance_pool_ocpus, + instance_pool_ocpus = local.instance_pool_ocpus, instance_pool_memory = var.instance_pool_memory, instance_pool_custom_memory = var.instance_pool_custom_memory, queue=var.queue, diff --git a/locals.tf b/locals.tf index 286cd62d..dc3b2d87 100755 --- a/locals.tf +++ b/locals.tf @@ -7,6 +7,10 @@ locals { custom_bastion_image_ocid = var.unsupported_bastion ? var.unsupported_bastion_image : var.custom_bastion_image custom_login_image_ocid = var.unsupported_login ? var.unsupported_login_image : var.custom_login_image + shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape + instance_pool_ocpus = local.shape == "VM.DenseIO.E4.Flex" ? var.instance_pool_ocpus_denseIO_flex : var.instance_pool_ocpus + bastion_ocpus = length(regexall(".*VM.*.*Flex$", var.bastion_shape)) > 0 ? var.bastion_ocpus_denseIO_flex : var.bastion_ocpus + login_ocpus = length(regexall(".*VM.*.*Flex$", var.login_shape)) > 0 ? var.login_ocpus_denseIO_flex : var.login_ocpus // ips of the instances cluster_instances_ips = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip @@ -34,10 +38,10 @@ locals { // image = (var.cluster_network && var.use_marketplace_image == true) || (var.cluster_network == false && var.use_marketplace_image == false) ? var.image : data.oci_core_images.linux.images.0.id - is_bastion_flex_shape = length(regexall(".*VM.*.*Flex$", var.bastion_shape)) > 0 ? [var.bastion_ocpus]:[] - is_login_flex_shape = length(regexall(".*VM.*.*Flex$", var.login_shape)) > 0 ? [var.login_ocpus]:[] + is_bastion_flex_shape = length(regexall(".*VM.*.*Flex$", var.bastion_shape)) > 0 ? [local.bastion_ocpus]:[] + is_login_flex_shape = length(regexall(".*VM.*.*Flex$", var.login_shape)) > 0 ? [local.login_ocpus]:[] - is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [var.instance_pool_ocpus]:[] + is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [local.instance_pool_ocpus]:[] bastion_mount_ip = var.bastion_block ? element(concat(oci_core_volume_attachment.bastion_volume_attachment.*.ipv4, [""]), 0) : "none" @@ -61,4 +65,5 @@ locals { timeout_per_batch= var.cluster_network ? 
30 : 15 timeout_ip = join("",[ (( var.node_count - ( var.node_count % 20 ) )/20 + 1 ) * local.timeout_per_batch,"m"]) + } diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index e8e13ea3..83bdc4d6 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -84,12 +84,16 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boar NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "VM.Standard3.Flex" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} +{% elif instance.shape == "VM.DenseIO.E4.Flex" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "VM.Standard.A1.Flex" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.Standard.E3.128" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.Standard.E4.128" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} +{% elif instance.shape == "BM.DenseIO.E4.128" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.HPC2.36" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=18 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.HPC.E5.128" %} diff --git a/schema.yaml b/schema.yaml index 7a071eaf..6465b5cf 100755 --- a/schema.yaml +++ b/schema.yaml @@ -30,6 +30,7 @@ variableGroups: - ${bastion_ad} - ${bastion_shape} - ${bastion_ocpus} + - ${bastion_ocpus_denseIO_flex} - ${bastion_custom_memory} - ${bastion_memory} - ${bastion_boot_volume_size} @@ -43,6 +44,7 @@ variableGroups: - ${cluster_network_shape} - ${instance_pool_shape} - ${instance_pool_ocpus} + - 
${instance_pool_ocpus_denseIO_flex} - ${instance_pool_custom_memory} - ${instance_pool_memory} - ${node_count} @@ -257,6 +259,7 @@ variables: required: true default: VM.Standard2.4 bastion_ocpus: + title: "Cores" type: integer description: Number of OCPU's for flex shape minimum: 1 @@ -281,6 +284,24 @@ variables: - ${bastion_shape} - "VM.Standard3.Flex" required: true + + bastion_ocpus_denseIO_flex: + title: "Cores" + type: enum + description: Number of OCPU's for Dense IO flex shape + enum: + - 8 + - 16 + - 32 + default: 8 + visible: + and: + - or: + - eq: + - ${instance_pool_shape} + - "VM.DenseIO.E4.Flex" + required: true + bastion_custom_memory: title: Use custom memory size type: boolean @@ -533,6 +554,23 @@ variables: - "VM.Standard3.Flex" required: true + instance_pool_ocpus_denseIO_flex: + title: Cores + type: enum + description: Number of OCPU's for Dense IO flex shape + enum: + - 8 + - 16 + - 32 + default: 8 + visible: + and: + - or: + - eq: + - ${instance_pool_shape} + - "VM.DenseIO.E4.Flex" + required: true + instance_pool_custom_memory: title: Use custom memory size type: boolean @@ -1249,6 +1287,24 @@ variables: - ${login_node} required: true + login_ocpus_denseIO_flex: + title: Cores + type: enum + description: Number of OCPU's for Dense IO flex shape + enum: + - 8 + - 16 + - 32 + default: 8 + visible: + and: + - or: + - eq: + - ${login_shape} + - "VM.DenseIO.E4.Flex" + - ${login_node} + required: true + login_custom_memory: title: Use custom memory size type: boolean diff --git a/slurm_ha.tf b/slurm_ha.tf index 3c008673..388e76ec 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -229,7 +229,7 @@ resource "null_resource" "cluster_backup" { autoscaling = var.autoscaling, cluster_name = local.cluster_name, shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape, - instance_pool_ocpus = var.instance_pool_ocpus, + instance_pool_ocpus = local.instance_pool_ocpus, queue=var.queue, monitoring = var.monitoring, hyperthreading = var.hyperthreading, @@ -305,7 +305,7 @@ resource "null_resource" "cluster_backup" { private_subnet = var.private_subnet, private_subnet_id = var.private_subnet_id, targetCompartment = var.targetCompartment, - instance_pool_ocpus = var.instance_pool_ocpus, + instance_pool_ocpus = local.instance_pool_ocpus, instance_pool_memory = var.instance_pool_memory, instance_pool_custom_memory = var.instance_pool_custom_memory, queue=var.queue, diff --git a/variables.tf b/variables.tf index ddf41a63..620d2eb9 100755 --- a/variables.tf +++ b/variables.tf @@ -58,10 +58,13 @@ variable "rack_aware" { default = false } variable "ldap" { default = true } variable "spack" { default = false } variable "bastion_ocpus" { default = 2} +variable "bastion_ocpus_denseIO_flex" { default = 8} variable "instance_pool_ocpus" { default = 2} +variable "instance_pool_ocpus_denseIO_flex" { default = 8} variable "instance_pool_memory" { default = 16 } variable "instance_pool_custom_memory" { default = false } variable "login_ocpus" { default = 2} +variable "login_ocpus_denseIO_flex" { default = 2} variable "bastion_memory" { default = 16 } variable "bastion_custom_memory" { default = false } variable "login_memory" { default = 16 } From b5193e4dea32ef833cef52af4b04ec9a6393bd91 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Wed, 25 Jan 2023 22:42:00 -0800 Subject: [PATCH 057/133] fixes for VM.DenseIO.E4.Flex shape --- conf/variables.tpl | 4 +++- schema.yaml | 3 ++- variables.tf | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/conf/variables.tpl 
b/conf/variables.tpl index e028cf41..c5d7e9e7 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -120,4 +120,6 @@ variable "use_multiple_ads" { default = ${use_multiple_ads} } variable "bastion_username" { default = "${bastion_username}" } variable "compute_username" { default = "${compute_username}" } -variable "localdisk" { default = "${localdisk}" } \ No newline at end of file +variable "localdisk" { default = "${localdisk}" } + +variable "instance_pool_ocpus_denseIO_flex" { default = "##OCPU##"} diff --git a/schema.yaml b/schema.yaml index 6465b5cf..c10342a5 100755 --- a/schema.yaml +++ b/schema.yaml @@ -65,6 +65,7 @@ variableGroups: - ${login_ad} - ${login_shape} - ${login_ocpus} + - ${login_ocpus_denseIO_flex} - ${login_custom_memory} - ${login_memory} - ${login_boot_volume_size} @@ -298,7 +299,7 @@ variables: and: - or: - eq: - - ${instance_pool_shape} + - ${bastion_shape} - "VM.DenseIO.E4.Flex" required: true diff --git a/variables.tf b/variables.tf index 620d2eb9..73b3cba2 100755 --- a/variables.tf +++ b/variables.tf @@ -64,7 +64,7 @@ variable "instance_pool_ocpus_denseIO_flex" { default = 8} variable "instance_pool_memory" { default = 16 } variable "instance_pool_custom_memory" { default = false } variable "login_ocpus" { default = 2} -variable "login_ocpus_denseIO_flex" { default = 2} +variable "login_ocpus_denseIO_flex" { default = 8} variable "bastion_memory" { default = 16 } variable "bastion_custom_memory" { default = false } variable "login_memory" { default = 16 } From f82cea09fb30782396c0bfbd224655d2d8988030 Mon Sep 17 00:00:00 2001 From: bruno-garbaccio Date: Thu, 26 Jan 2023 11:22:49 +0100 Subject: [PATCH 058/133] generalized no instance principal change schema.yaml for more explicit indication Update no_instance_principal role for all user names --- .../tasks/{el.yml => common.yml} | 12 ++++++------ playbooks/roles/no_instance_principal/tasks/main.yml | 4 ++-- schema.yaml | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) rename playbooks/roles/no_instance_principal/tasks/{el.yml => common.yml} (62%) diff --git a/playbooks/roles/no_instance_principal/tasks/el.yml b/playbooks/roles/no_instance_principal/tasks/common.yml similarity index 62% rename from playbooks/roles/no_instance_principal/tasks/el.yml rename to playbooks/roles/no_instance_principal/tasks/common.yml index 33ede7b3..cae6bc29 100755 --- a/playbooks/roles/no_instance_principal/tasks/el.yml +++ b/playbooks/roles/no_instance_principal/tasks/common.yml @@ -2,20 +2,20 @@ - name: create .oci directory become: true file: - path: /home/opc/.oci + path: /home/{{ ansible_user }}/.oci state: directory - owner: opc - group: "{{privilege_group_name}}" + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" mode: 0775 - name: Generate config file become: true template: src: 'config.j2' - dest: '/home/opc/.oci/config' + dest: '/home/{{ ansible_user }}/.oci/config' mode: 0600 - owner: opc - group: "{{privilege_group_name}}" + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" - name: delete --auth in create_cluster.sh become: true diff --git a/playbooks/roles/no_instance_principal/tasks/main.yml b/playbooks/roles/no_instance_principal/tasks/main.yml index 01de0d59..270202fc 100755 --- a/playbooks/roles/no_instance_principal/tasks/main.yml +++ b/playbooks/roles/no_instance_principal/tasks/main.yml @@ -1,3 +1,3 @@ -- include: el.yml - when: ansible_os_family == 'RedHat' +- include: common.yml + diff --git a/schema.yaml b/schema.yaml index 6465b5cf..7590be88 100755 --- a/schema.yaml +++ 
b/schema.yaml @@ -86,7 +86,7 @@ variableGroups: - ${autoscaling} - ${autoscaling_monitoring} - ${latency_check} - - title: "API authentication" + - title: "API authentication, needed for autoscaling" variables: - ${inst_prin} - ${api_user_ocid} @@ -1048,7 +1048,7 @@ variables: inst_prin: type: boolean - title: "Use Instance Principal (required for autoscaling)" + title: "Use Instance Principal instead of configuration file" description: "You will need to set a dynamic group and policy to allow the bastion to authenticate. This will not be created automatically." default: true From ed28217e531718f5b6d167af251ec1e6d185862d Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 27 Jan 2023 15:29:29 -0800 Subject: [PATCH 059/133] fixed sssd --- playbooks/roles/slurm/tasks/common.yml | 8 ++--- playbooks/roles/sssd/tasks/el-8.yml | 48 ++++++++++++++++++++++++++ playbooks/roles/sssd/tasks/main.yml | 6 ++++ playbooks/site.yml | 4 --- 4 files changed, 58 insertions(+), 8 deletions(-) create mode 100644 playbooks/roles/sssd/tasks/el-8.yml diff --git a/playbooks/roles/slurm/tasks/common.yml b/playbooks/roles/slurm/tasks/common.yml index 32dd8ee9..53afa054 100755 --- a/playbooks/roles/slurm/tasks/common.yml +++ b/playbooks/roles/slurm/tasks/common.yml @@ -46,8 +46,8 @@ file: path: "{{ download_path }}/slurm_rpms" state: directory - owner: opc - group: opc + owner: '{{ ansible_user }}' + group: '{{ ansible_user }}' when: download_path == '/tmp' - name: Create Slurm RPM directory @@ -55,8 +55,8 @@ file: path: "{{ download_path }}/slurm_rpms" state: directory - owner: opc - group: opc + owner: '{{ ansible_user }}' + group: '{{ ansible_user }}' delegate_to: 127.0.0.1 run_once: true when: download_path != '/tmp' diff --git a/playbooks/roles/sssd/tasks/el-8.yml b/playbooks/roles/sssd/tasks/el-8.yml new file mode 100644 index 00000000..ecfc4255 --- /dev/null +++ b/playbooks/roles/sssd/tasks/el-8.yml @@ -0,0 +1,48 @@ +--- +- name: Install sssd packages + vars: + package_name: + - sssd + - authconfig + include_role: + name: safe_yum + +- name: Add configuration file to /etc/sssd/sssd.conf + template: + src: 'sssd.conf.j2' + dest: '/etc/sssd/sssd.conf' + owner: 'root' + group: 'root' + mode: '0600' + notify: restart sssd + +- name: Copy CA certificate + copy: + src: "{{ ssl_ca_cert }}" + dest: /etc/openldap/certs/cluster-ca.crt + +- name: Adjust OpenLDAP client TLS configuration + lineinfile: + path: '/etc/openldap/ldap.conf' + line: 'TLS_CACERT /etc/openldap/certs/cluster-ca.crt' + +- name: Enable sssd service + systemd: + name: sssd + enabled: "yes" + +- name: Start sssd service + systemd: + name: sssd + state: started + +- name: Update sshd configuration + lineinfile: + path: /etc/ssh/sshd_config + regexp: '^PasswordAuthentication' + line: PasswordAuthentication no + notify: restart sshd + +- name: Setting up the system to use sssd for authentication + command: authconfig --enablemkhomedir --enablesssd --enablesssdauth --update + changed_when: false diff --git a/playbooks/roles/sssd/tasks/main.yml b/playbooks/roles/sssd/tasks/main.yml index 6b221d24..acad08c2 100644 --- a/playbooks/roles/sssd/tasks/main.yml +++ b/playbooks/roles/sssd/tasks/main.yml @@ -1,5 +1,11 @@ +- include_vars: /opt/oci-hpc/playbooks/roles/openldap/vars/debian_vars.yml + when: ansible_distribution == 'Ubuntu' + - include: el-7.yml when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '7' +- include: el-8.yml + when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '8' + - include: debian.yml 
when: ansible_distribution == 'Ubuntu' \ No newline at end of file diff --git a/playbooks/site.yml b/playbooks/site.yml index 57d6d543..f5f227f1 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -236,10 +236,6 @@ - hosts: all become: true - vars_files: - - "/opt/oci-hpc/playbooks/roles/openldap/defaults/main.yml" - - "/opt/oci-hpc/playbooks/roles/openldap/vars/el_vars.yml" - - "/opt/oci-hpc/playbooks/roles/openldap/vars/debian_vars.yml" tasks: - include_role: name: sssd From 9b67d47e67b96ae6c8814876c309a7d276d5119a Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 31 Jan 2023 09:58:08 -0700 Subject: [PATCH 060/133] Change the exit code and tme to add instance pool --- bin/resize.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/resize.py b/bin/resize.py index c88deec4..f1c6d84a 100644 --- a/bin/resize.py +++ b/bin/resize.py @@ -653,7 +653,7 @@ def updateTFState(inventory,cluster_name,size): hostnames_to_remove=[i['display_name'] for i in unreachable_instances] else: print("STDOUT: No list of nodes were specified and no unreachable nodes were found") - exit() + exit(1) else: reachable_instances,unreachable_instances=getreachable(inventory_instances,username,delay=10) hostnames_to_remove=hostnames @@ -665,7 +665,7 @@ def updateTFState(inventory,cluster_name,size): if not remove_unreachable: print("STDOUT: At least one unreachable node is in the inventory") print("STDOUT: Not doing anything") - exit() + exit(1) else: hostnames_to_remove=[i['display_name'] for i in unreachable_instances] else: @@ -692,7 +692,7 @@ def updateTFState(inventory,cluster_name,size): if error_code != 0: print("STDOUT: The nodes could not be removed. Try running this with Force") if not force: - exit() + exit(1) else: print("STDOUT: Force deleting the nodes") while len(hostnames_to_remove) > 0: @@ -724,7 +724,7 @@ def updateTFState(inventory,cluster_name,size): if args.mode == 'add': size = current_size - hostnames_to_remove_len + args.number update_size = oci.core.models.UpdateInstancePoolDetails(size=size) - ComputeManagementClientCompositeOperations.update_instance_pool_and_wait_for_state(ipa_ocid,update_size,['RUNNING']) + ComputeManagementClientCompositeOperations.update_instance_pool_and_wait_for_state(ipa_ocid,update_size,['RUNNING'],waiter_kwargs={'max_wait_seconds':3600}) updateTFState(inventory,cluster_name,size) if not no_reconfigure: add_reconfigure(comp_ocid,cn_ocid,inventory,CN) \ No newline at end of file From 153a6e4ebe2bd94740b9c6a802348c99530696c0 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 31 Jan 2023 09:58:42 -0700 Subject: [PATCH 061/133] Add a retry on getting the rackid --- playbooks/roles/slurm/tasks/compute-rack-aware.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/playbooks/roles/slurm/tasks/compute-rack-aware.yml b/playbooks/roles/slurm/tasks/compute-rack-aware.yml index 49c96bcc..e001b18c 100755 --- a/playbooks/roles/slurm/tasks/compute-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/compute-rack-aware.yml @@ -83,6 +83,9 @@ shell: 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v1/host | jq .rackId' # shell: echo $RANDOM | md5sum | head -c 20 register: rackID_script + retries: 5 + delay: 5 + until: rackID_script is not failed - name: Set RackID fact set_fact: From e547ac0dfe73220b4130494db83a58d306fbfcd3 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 31 Jan 2023 09:59:14 -0700 Subject: [PATCH 062/133] Use the RDMA subnet var in the oci-rdma-configure --- 
playbooks/roles/rdma-interface/tasks/debian.yml | 9 ++++++++- playbooks/roles/rdma-interface/tasks/el.yml | 9 ++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/rdma-interface/tasks/debian.yml b/playbooks/roles/rdma-interface/tasks/debian.yml index 069b7de2..42d28db8 100644 --- a/playbooks/roles/rdma-interface/tasks/debian.yml +++ b/playbooks/roles/rdma-interface/tasks/debian.yml @@ -18,13 +18,20 @@ return_content: yes register: i_metadata +- name: Change CIDR range for RDMA + become: true + replace: + path: /etc/oci-hpc/rdma-network.conf + regexp: 'rdma_network=192.168.0.0/255.255.0.0' + replace: 'rdma_network={{rdma_network}}/{{rdma_netmask}}' + - name: Append subnet part to /etc/oci-hpc/rdma-network.conf blockinfile: path: /etc/oci-hpc/rdma-network.conf block: | [subnet] modify_arp=true - override_netconfig_netmask=255.255.0.0 + override_netconfig_netmask={{rdma_netmask}} when: new_image.stat.exists - name: Start the OCI RDMA service diff --git a/playbooks/roles/rdma-interface/tasks/el.yml b/playbooks/roles/rdma-interface/tasks/el.yml index 2d5ee6e6..e37e2ce4 100755 --- a/playbooks/roles/rdma-interface/tasks/el.yml +++ b/playbooks/roles/rdma-interface/tasks/el.yml @@ -22,13 +22,20 @@ path: /sbin/oci-rdma-configure register: new_image +- name: Change CIDR range for RDMA + become: true + replace: + path: /etc/oci-hpc/rdma-network.conf + regexp: 'rdma_network=192.168.0.0/255.255.0.0' + replace: 'rdma_network={{rdma_network}}/{{rdma_netmask}}' + - name: Append subnet part to /etc/oci-hpc/rdma-network.conf blockinfile: path: /etc/oci-hpc/rdma-network.conf block: | [subnet] modify_arp=true - override_netconfig_netmask=255.255.0.0 + override_netconfig_netmask={{rdma_netmask}} when: new_image.stat.exists - name: Start the OCI RDMA service From dcf385779e08ee66647cd4bd2c6fc0613e05aa68 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Wed, 1 Feb 2023 12:34:24 -0800 Subject: [PATCH 063/133] modified slurm to find nodes ending with *, optimized code lneth, not running slurm and topo check if etchosts doesn't match --- bin/validation.py | 266 ++++++++++++++++------------------------------ 1 file changed, 91 insertions(+), 175 deletions(-) diff --git a/bin/validation.py b/bin/validation.py index 2fc1669f..0732aa5f 100644 --- a/bin/validation.py +++ b/bin/validation.py @@ -223,13 +223,10 @@ def getConsoleNodeName(slurm_node_name): # get number of nodes and their state using slurm def slurmGetNodes(resize_cluster_names, all_node_cluster_dict, path): - out = subprocess.run(['sinfo','-hNr','-o','\"%T %D %N\"'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + out = subprocess.run(['sinfo','-hN','-o','\"%T %D %N\"'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) lines = out.stdout.decode("utf-8") x = lines.split("\n") del x[-1] - good_node_states = set() - good_node_states.add("allocated") - good_node_states.add("idle") warning_node_dict = {} slurm_node_cluster_dict = {} for i in range(len(x)): @@ -242,7 +239,7 @@ def slurmGetNodes(resize_cluster_names, all_node_cluster_dict, path): slurm_node_cluster = all_node_cluster_dict[proper_node_name] if slurm_node_cluster in resize_cluster_names: slurm_node_cluster_dict.update({proper_node_name: slurm_node_cluster}) - if node_state not in good_node_states: + if node_state.endswith("*"): warning_node_dict.update({proper_node_name: node_state}) else: if path is None: @@ -438,26 +435,99 @@ def getResizeCluster(args, metadata): return resize_cluster_names, resize_cluster_node_dict, resize_node_cluster_dict +def 
dictEqualCheck(resize_node_cluster_dict, comp_dict, type, txt_file_name, path): + if resize_node_cluster_dict == comp_dict: + print("Number of nodes from " +type+ " is same as resize") + else: + for key in resize_node_cluster_dict.keys(): + if not key in comp_dict: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/" +txt_file_name+ ".txt", "a") + f.write(key + " is not in " +type+ " file" + "\n") + f.close() + for key in comp_dict.keys(): + if not key in resize_node_cluster_dict: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/" +txt_file_name+ ".txt", "a") + f.write(key + " is not in resize list" + "\n") + f.close() + return path + +def runChecks(args, type, name, hostFileWritten, resize_node_cluster_dict, metadata, path): + if type is not None: + if type == 'y' or type == 'Y': + if args.cluster_names is not None: + if hostFileWritten is False: + if len(resize_node_cluster_dict) == 0: + resize_cluster_names, resize_cluster_node_dict, resize_node_cluster_dict = getResizeCluster(args, metadata) + if len(resize_cluster_names) == 0: + exit() + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/host.txt", "a") + for v in resize_node_cluster_dict.keys(): + hostFileWritten = True + f.write(str(v) + "\n") + f.close() + hostfile = path+"/host.txt" + if name == "pcie": + pcie_check(hostfile, path) + if name == "gpu throttle": + gpu_throttle(hostfile, path) + if name == "/etc/hosts md5 sum": + out = subprocess.Popen(["cat "+hostfile],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + nodes_comma = ','.join(str(s) for s in x) + path = etcHostsSame(nodes_comma, path) + else: + print("Provide cluster_names file or hosts file to run " +name+ " check") + else: + hostfile = type + if path is None: + path = createDir() + changeOwner(path) + if name == "pcie": + pcie_check(hostfile, path) + if name == "gpu throttle": + gpu_throttle(hostfile, path) + if name == "/etc/hosts md5 sum": + out = subprocess.Popen(["cat "+hostfile],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + nodes_comma = ','.join(str(s) for s in x) + path = etcHostsSame(nodes_comma, path) + return hostFileWritten, path + + ############### parser = argparse.ArgumentParser(description = 'Performs these checks. \ -> Check the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, \ - inventory files. Also check whether md5 sum of /etc/hosts file on all nodes matches that on bastion. \ + inventory files. \ -> PCIe bandwidth check \ -> GPU Throttle check \ --> Standalone /etc/hosts md5 sum validation \ +-> /etc/hosts md5 sum validation \ Provide at least one argument: [-n NUM_NODES] [-p PCIE] [-g GPU_THROTTLE] [-e ETC_HOSTS] \ -Optional argument with [-n NUM_NODES] [-p PCIE] [-g GPU_THROTTLE]: [-cn CLUSTER_NAMES] --> \ +Optional argument with [-n NUM_NODES] [-p PCIE] [-g GPU_THROTTLE] [-e ETC_HOSTS]: [-cn CLUSTER_NAMES] --> \ Provide a file that lists each cluster on a separate line for which you want to validate the \ - number of nodes and/or pcie check and/or gpu throttle check. ') + number of nodes and/or pcie check and/or gpu throttle check and/or /etc/hosts md5 sum. 
\ +For all of the above, you can either provide y or Y along with -cn or you can give the hostfile path (each host on a separate line) for each argument') parser.add_argument('-n', '--num_nodes', help = "Check the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, \ - inventory files. Also check whether md5 sum of /etc/hosts file on all nodes matches that on bastion.") + inventory files.") parser.add_argument('-cn', '--cluster_names', help = "Provide a file that lists each cluster on a separate line for which you want to validate the \ number of nodes and/or pcie check and/or gpu throttle check.") parser.add_argument('-p', '--pcie', help = "Runs PCIe bandwidth check") parser.add_argument('-g', '--gpu_throttle', help = "Performs GPU throttle check") -parser.add_argument('-e', '--etc_hosts', help = "Provide a file that contains list of hosts on which to perform md5 sum check to match with bastion") +parser.add_argument('-e', '--etc_hosts', help = "Performs md5 sum check on all hosts and checks if it matches with the bastion") args = parser.parse_args() @@ -503,45 +573,13 @@ def getResizeCluster(args, metadata): oci_node_cluster_dict = ociCommand(metadata, resize_cluster_names) + path = dictEqualCheck(resize_node_cluster_dict, etc_node_cluster_dict, "/etc/hosts", "etcHostsNumNodes", path) if resize_node_cluster_dict == etc_node_cluster_dict: - print("Number of nodes in /etc/hosts on bastion is same as resize") - else: - for key in resize_node_cluster_dict.keys(): - if not key in etc_node_cluster_dict: - if path is None: - path = createDir() - changeOwner(path) - f = open(path+"/etcHostsNumNodes.txt", "a") - f.write(key + " is not in etc hosts" + "\n") - f.close() - for key in etc_node_cluster_dict.keys(): - if not key in resize_node_cluster_dict: - if path is None: - path = createDir() - changeOwner(path) - f = open(path+"/etcHostsNumNodes.txt", "a") - f.write(key + " is not in resize list" + "\n") - f.close() + path = dictEqualCheck(resize_node_cluster_dict, slurm_node_cluster_dict, "slurm", "slurmNumNodes", path) + path = dictEqualCheck(resize_node_cluster_dict, topo_node_cluster_dict, "topology.conf", "topoNumNodes", path) - if resize_node_cluster_dict == slurm_node_cluster_dict: - print("Number of nodes from slurm is same as resize") - else: - for key in resize_node_cluster_dict.keys(): - if not key in slurm_node_cluster_dict: - if path is None: - path = createDir() - changeOwner(path) - f = open(path+"/slurmNumNodes.txt", "a") - f.write(key + " is not in slurm" + "\n") - f.close() - for key in slurm_node_cluster_dict.keys(): - if not key in resize_node_cluster_dict: - if path is None: - path = createDir() - changeOwner(path) - f = open(path+"/slurmNumNodes.txt", "a") - f.write(key + " is not in resize list" + "\n") - f.close() + path = dictEqualCheck(resize_node_cluster_dict, inventory_node_cluster_dict, "inventory", "inventoryNumNodes", path) + path = dictEqualCheck(resize_node_cluster_dict, oci_node_cluster_dict, "oci cli", "ociCliNumNodes", path) if len(warning_node_dict) > 0: for key in warning_node_dict.keys(): @@ -551,134 +589,12 @@ def getResizeCluster(args, metadata): f = open(path+"/slurmWarnNodes.txt", "a") f.write(key + " is in slurm state " + warning_node_dict[key] + "\n") f.close() - - if resize_node_cluster_dict == topo_node_cluster_dict: - print("Number of nodes from topology is same as resize") - else: - for key in resize_node_cluster_dict.keys(): - if not key in topo_node_cluster_dict: - if path is None: - path = createDir() - 
changeOwner(path) - f = open(path+"/topoNumNodes.txt", "a") - f.write(key + " is not in topology.conf file" + "\n") - f.close() - for key in topo_node_cluster_dict.keys(): - if not key in resize_node_cluster_dict: - if path is None: - path = createDir() - changeOwner(path) - f = open(path+"/topoNumNodes.txt", "a") - f.write(key + " is not in resize list" + "\n") - f.close() - - if resize_node_cluster_dict == inventory_node_cluster_dict: - print("Number of nodes from inventory is same as resize") - else: - for key in resize_node_cluster_dict.keys(): - if not key in inventory_node_cluster_dict: - if path is None: - path = createDir() - changeOwner(path) - f = open(path+"/inventoryNumNodes.txt", "a") - f.write(key + " is not in inventory file" + "\n") - f.close() - for key in inventory_node_cluster_dict.keys(): - if not key in resize_node_cluster_dict: - if path is None: - path = createDir() - changeOwner(path) - f = open(path+"/inventoryNumNodes.txt", "a") - f.write(key + " is not in resize list" + "\n") - f.close() - - if resize_node_cluster_dict == oci_node_cluster_dict: - print("Number of nodes from oci cli is same as resize") - else: - for key in resize_node_cluster_dict.keys(): - if not key in oci_node_cluster_dict: - if path is None: - path = createDir() - changeOwner(path) - f = open(path+"/ociCliNumNodes.txt", "a") - f.write(key + " not found using oci cli" + "\n") - f.close() - for key in oci_node_cluster_dict.keys(): - if not key in resize_node_cluster_dict: - if path is None: - path = createDir() - changeOwner(path) - f = open(path+"/ociCliNumNodes.txt", "a") - f.write(key + " is not in resize list" + "\n") - f.close() - - node_list = list(map(' '.join, resize_cluster_node_dict.values())) - nodes_space = ' '.join(str(s) for s in node_list) - split_str = nodes_space.split() - nodes_comma = ','.join(str(s) for s in split_str) - path = etcHostsSame(nodes_comma, path) - -if args.num_nodes is None and args.etc_hosts is not None: - hostfile = args.etc_hosts - out = subprocess.Popen(["cat "+hostfile],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) - stdout,stderr = out.communicate() - x = stdout.split("\n") - del x[-1] - nodes_comma = ','.join(str(s) for s in x) - path = etcHostsSame(nodes_comma, path) - + hostFileWritten = False -if args.pcie is not None: - if args.pcie == 'y' or args.pcie == 'Y': - if args.cluster_names is not None: - if len(resize_node_cluster_dict) == 0: - resize_cluster_names, resize_cluster_node_dict, resize_node_cluster_dict = getResizeCluster(args, metadata) - if len(resize_cluster_names) == 0: - exit() - if path is None: - path = createDir() - changeOwner(path) - f = open(path+"/host.txt", "a") - for v in resize_node_cluster_dict.keys(): - hostFileWritten = True - f.write(str(v) + "\n") - f.close() - pcie_hostfile = path+"/host.txt" - pcie_check(pcie_hostfile, path) - else: - print("Provide cluster_names file or hosts file to run pcie check") - else: - pcie_hostfile = args.pcie - if path is None: - path = createDir() - changeOwner(path) - pcie_check(pcie_hostfile, path) - -if args.gpu_throttle is not None: - if args.gpu_throttle == 'y' or args.gpu_throttle == 'Y': - if args.cluster_names is not None: - if hostFileWritten is False: - if len(resize_node_cluster_dict) == 0: - resize_cluster_names, resize_cluster_node_dict, resize_node_cluster_dict = getResizeCluster(args, metadata) - if len(resize_cluster_names) == 0: - exit() - if path is None: - path = createDir() - changeOwner(path) - f = open(path+"/host.txt", "a") - for v 
in resize_node_cluster_dict.keys(): - f.write(str(v) + "\n") - f.close() - gpu_hostfile = path+"/host.txt" - gpu_throttle(gpu_hostfile, path) - else: - print("Provide cluster_names file or hosts file to run gpu throttle check") - else: - gpu_hostfile = args.gpu_throttle - if path is None: - path = createDir() - changeOwner(path) - gpu_throttle(gpu_hostfile, path) + +hostFileWritten, path = runChecks(args, args.pcie, "pcie", hostFileWritten, resize_node_cluster_dict, metadata, path) +hostFileWritten, path = runChecks(args, args.gpu_throttle, "gpu throttle",hostFileWritten, resize_node_cluster_dict, metadata, path) +hostFileWritten, path = runChecks(args, args.etc_hosts, "/etc/hosts md5 sum", hostFileWritten, resize_node_cluster_dict, metadata, path) if path is not None: print(f"Output is in folder: {path}") From 751a5b38dd00fd78d06defe48ce177b284dc27eb Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 2 Feb 2023 10:47:15 -0700 Subject: [PATCH 064/133] Fix autoscaling when a node is inactive and active --- autoscaling/crontab/autoscale_slurm.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/autoscaling/crontab/autoscale_slurm.sh b/autoscaling/crontab/autoscale_slurm.sh index 5bbee65d..0ccd87c3 100755 --- a/autoscaling/crontab/autoscale_slurm.sh +++ b/autoscaling/crontab/autoscale_slurm.sh @@ -169,6 +169,11 @@ def getClusterName(node): for output in stdout.split('\n')[:-1]: if "Switches=" in output: clusterName=output.split()[0].split('SwitchName=')[1] + break + elif "SwitchName=inactive-" in output: + continue + else: + clusterName=output.split()[0].split('SwitchName=')[1] elif len(stdout.split('\n')) == 2: clusterName=stdout.split('\n')[0].split()[0].split('SwitchName=')[1] if clusterName.startswith("inactive-"): From 85144f5fb63e9757c526f3f76f9be1074a3626e3 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 2 Feb 2023 13:14:13 -0700 Subject: [PATCH 065/133] Fix GPG key for InfluxDB --- playbooks/roles/influxdb/tasks/el.yml | 2 +- playbooks/roles/influxdb/tasks/ubuntu.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/influxdb/tasks/el.yml b/playbooks/roles/influxdb/tasks/el.yml index edc56d0e..71a7a06a 100755 --- a/playbooks/roles/influxdb/tasks/el.yml +++ b/playbooks/roles/influxdb/tasks/el.yml @@ -29,7 +29,7 @@ baseurl: https://repos.influxdata.com/rhel/{{ ansible_distribution_major_version }}/$basearch/stable enabled: 1 gpgcheck: 1 - gpgkey: https://repos.influxdata.com/influxdb.key + gpgkey: https://repos.influxdata.com/influxdata-archive_compat.key - name: Install InfluxDB vars: diff --git a/playbooks/roles/influxdb/tasks/ubuntu.yml b/playbooks/roles/influxdb/tasks/ubuntu.yml index 42919848..cabb37ec 100644 --- a/playbooks/roles/influxdb/tasks/ubuntu.yml +++ b/playbooks/roles/influxdb/tasks/ubuntu.yml @@ -37,7 +37,7 @@ become: true apt_key: state: present - url: https://repos.influxdata.com/influxdb.key + url: https://repos.influxdata.com/influxdata-archive_compat.key - name: Manage InfluxData APT repositories become: true From ce907c20149c74d2715ca97bb2353f61896536c7 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Sun, 5 Feb 2023 15:13:51 -0800 Subject: [PATCH 066/133] updated README --- README.md | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 1202187d..a992d277 100644 --- a/README.md +++ b/README.md @@ -324,19 +324,21 @@ $ max_nodes --include_cluster_names xxx yyy zzz --> where xxx, yyy, zzz are clus Use the alias "validate" to run the python script 
validation.py. You can run this script only from bastion. The script performs these checks. --> Check the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, inventory files. Also check whether md5 sum of /etc/hosts file on all nodes matches that on bastion +-> Check the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, inventory files. -> PCIe bandwidth check -> GPU Throttle check --> Standalone /etc/hosts md5 sum validation +-> Check whether md5 sum of /etc/hosts file on nodes matches that on bastion Provide at least one argument: [-n NUM_NODES] [-p PCIE] [-g GPU_THROTTLE] [-e ETC_HOSTS] -Optional argument with [-n NUM_NODES] [-p PCIE] [-g GPU_THROTTLE]: [-cn CLUSTER_NAMES] -Provide a file that lists each cluster on a separate line for which you want to validate the number of nodes and/or pcie check and/or gpu throttle check. +Optional argument with [-n NUM_NODES] [-p PCIE] [-g GPU_THROTTLE] [-e ETC_HOSTS]: [-cn CLUSTER_NAMES] +Provide a file that lists each cluster on a separate line for which you want to validate the number of nodes and/or pcie check and/or gpu throttle check and/or /etc/hosts md5 sum. + +For pcie, gpu throttle, and /etc/hosts md5 sum check, you can either provide y or Y along with -cn or you can give the hostfile path (each host on a separate line) for each argument. For number of nodes check, either provide y or give y along with -cn. Below are some examples for running this script. -validate -n y --> This will validate that the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, inventory files. It will also check whether md5 sum of /etc/hosts file on all nodes matches that on bastion. The clusters considered will be the default cluster if any and cluster(s) found in /opt/oci-hpc/autoscaling/clusters directory. The number of nodes considered will be from the resize script using the clusters we got before. +validate -n y --> This will validate that the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, inventory files. The clusters considered will be the default cluster if any and cluster(s) found in /opt/oci-hpc/autoscaling/clusters directory. The number of nodes considered will be from the resize script using the clusters we got before. validate -n y -cn --> This will validate that the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, inventory files. It will also check whether md5 sum of /etc/hosts file on all nodes matches that on bastion. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file. @@ -352,9 +354,13 @@ validate -g y -cn --> This will run the GPU throttle check. validate -g --> This will run the GPU throttle check on the hosts provided in the file given. The gpu check host file should have a host name on each line. +validate -e y --> This will run the /etc/hosts md5 sum check. The clusters considered will be the default cluster if any and cluster(s) found in /opt/oci-hpc/autoscaling/clusters directory. The number of nodes considered will be from the resize script using the clusters we got before. + +validate -e y -cn --> This will run the /etc/hosts md5 sum check. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file. 
+ +validate -e --> This will run the /etc/hosts md5 sum check on the hosts provided in the file given. The md5 sum check host file should have a host name on each line. + You can combine all the options together such as: -validate -n y -p y -g y -cn +validate -n y -p y -g y -e y -cn -If you only want to run the md5 sum check (matches that of bastion) on some hosts, use the -e option. Example: -validate -e --> Check whether md5 sum of /etc/hosts file matches that on bastion. The hosts considered will be from the file provided. The host file should have a host name on each line. From 4385a4a941da3109b08222fe2580a2e4967f64be Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Sun, 5 Feb 2023 23:16:56 -0800 Subject: [PATCH 067/133] update GPG key for influxdb in telegraf role --- playbooks/roles/telegraf/tasks/el.yml | 2 +- playbooks/roles/telegraf/tasks/ubuntu.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/telegraf/tasks/el.yml b/playbooks/roles/telegraf/tasks/el.yml index ebacb8d0..9ad6e3af 100755 --- a/playbooks/roles/telegraf/tasks/el.yml +++ b/playbooks/roles/telegraf/tasks/el.yml @@ -7,7 +7,7 @@ baseurl: https://repos.influxdata.com/rhel/{{ ansible_distribution_major_version }}/$basearch/stable enabled: 1 gpgcheck: 1 - gpgkey: https://repos.influxdata.com/influxdb.key + gpgkey: https://repos.influxdata.com/influxdata-archive_compat.key - name: Install pip vars: diff --git a/playbooks/roles/telegraf/tasks/ubuntu.yml b/playbooks/roles/telegraf/tasks/ubuntu.yml index 38ee843c..4a7fbf89 100644 --- a/playbooks/roles/telegraf/tasks/ubuntu.yml +++ b/playbooks/roles/telegraf/tasks/ubuntu.yml @@ -3,7 +3,7 @@ become: true apt_key: state: present - url: https://repos.influxdata.com/influxdb.key + url: https://repos.influxdata.com/influxdata-archive_compat.key - name: Manage InfluxData APT repositories become: true From c5b50399b17e8cf90404f5aaa0c3da48d67114fd Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 6 Feb 2023 09:18:58 -0700 Subject: [PATCH 068/133] Add check in case topology was not correct --- .../autoscaling_mon/files/monitor_slurm.sh | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/playbooks/roles/autoscaling_mon/files/monitor_slurm.sh b/playbooks/roles/autoscaling_mon/files/monitor_slurm.sh index b8550e2e..51a661fb 100644 --- a/playbooks/roles/autoscaling_mon/files/monitor_slurm.sh +++ b/playbooks/roles/autoscaling_mon/files/monitor_slurm.sh @@ -56,12 +56,23 @@ def getClusterName(node): out = subprocess.Popen(['scontrol','show','topology',node], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True) stdout,stderr = out.communicate() clusterName = None - if len(stdout.split('\n')) > 2: - for output in stdout.split('\n')[:-1]: - if "Switches=" in output: - clusterName=output.split()[0].split('SwitchName=')[1] - elif len(stdout.split('\n')) == 2: - clusterName=stdout.split('\n')[0].split()[0].split('SwitchName=')[1] + try: + if len(stdout.split('\n')) > 2: + for output in stdout.split('\n')[:-1]: + if "Switches=" in output: + clusterName=output.split()[0].split('SwitchName=')[1] + break + elif "SwitchName=inactive-" in output: + continue + else: + clusterName=output.split()[0].split('SwitchName=')[1] + elif len(stdout.split('\n')) == 2: + clusterName=stdout.split('\n')[0].split()[0].split('SwitchName=')[1] + if clusterName.startswith("inactive-"): + return "NOCLUSTERFOUND" + except: + print('No ClusterName could be found for '+node) + return "NOCLUSTERFOUND" return clusterName #def 
getCPUsDetails(job): From 20db2185e8ca60ca86ec84d925f656481b98df9b Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 7 Feb 2023 17:16:52 -0700 Subject: [PATCH 069/133] Add check if the nodes are already up in slurm --- .../roles/slurm/tasks/compute-rack-aware.yml | 54 ++++++++++++------ playbooks/roles/slurm/tasks/compute.yml | 55 ++++++++++++------- 2 files changed, 72 insertions(+), 37 deletions(-) diff --git a/playbooks/roles/slurm/tasks/compute-rack-aware.yml b/playbooks/roles/slurm/tasks/compute-rack-aware.yml index e001b18c..3d8a23f0 100755 --- a/playbooks/roles/slurm/tasks/compute-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/compute-rack-aware.yml @@ -252,23 +252,41 @@ state: restarted enabled: true -- name: Grab Node State - shell: 'sinfo -h -o "%t" -n {{ ansible_hostname }}' - register: node_state - delegate_to: 127.0.0.1 - until: node_state.stdout.find("failure") == -1 - retries: 10 - delay: 5 - -- set_fact: - node_state2={{ node_state.stdout }} - name: Update node state on bastion - become: true - command: scontrol update nodename={{ ansible_hostname }} state=RESUME - when: node_state2 != "idle" and node_state2 != "alloc" - register: result - retries: 10 - delay: 5 - until: result is not failed - delegate_to: 127.0.0.1 \ No newline at end of file + block: + - name: Grab Node State + shell: 'sinfo -h -o "%t" -n {{ ansible_hostname }}' + register: node_state + delegate_to: 127.0.0.1 + - set_fact: + node_state2: "{{ node_state.stdout }}" + - name: Update node state on bastion + become: true + command: scontrol update nodename={{ ansible_hostname }} state=RESUME + when: node_state2 != "idle" and node_state2 != "alloc" + rescue: + - name: Sleep 5 seconds + pause: + seconds: 10 + + - name: Grab Node State + shell: 'sinfo -h -o "%t" -n {{ ansible_hostname }}' + register: node_state + delegate_to: 127.0.0.1 + until: node_state.stdout.find("failure") == -1 + retries: 10 + delay: 5 + + - set_fact: + node_state2: "{{ node_state.stdout }}" + + - name: Update node state on bastion + become: true + command: scontrol update nodename={{ ansible_hostname }} state=RESUME + when: node_state2 != "idle" and node_state2 != "alloc" + register: result + retries: 10 + delay: 5 + until: result is not failed + delegate_to: 127.0.0.1 \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/compute.yml b/playbooks/roles/slurm/tasks/compute.yml index 49febd49..a994c174 100755 --- a/playbooks/roles/slurm/tasks/compute.yml +++ b/playbooks/roles/slurm/tasks/compute.yml @@ -173,23 +173,40 @@ state: restarted enabled: true -- name: Grab Node State - shell: 'sinfo -h -o "%t" -n {{ ansible_hostname }}' - register: node_state - delegate_to: 127.0.0.1 - until: node_state.stdout.find("failure") == -1 - retries: 10 - delay: 5 - -- set_fact: - node_state2={{ node_state.stdout }} - - name: Update node state on bastion - become: true - command: scontrol update nodename={{ ansible_hostname }} state=RESUME - when: node_state2 != "idle" and node_state2 != "alloc" - register: result - retries: 10 - delay: 5 - until: result is not failed - delegate_to: 127.0.0.1 \ No newline at end of file + block: + - name: Grab Node State + shell: 'sinfo -h -o "%t" -n {{ ansible_hostname }}' + register: node_state + delegate_to: 127.0.0.1 + - set_fact: + node_state2: "{{ node_state.stdout }}" + - name: Update node state on bastion + become: true + command: scontrol update nodename={{ ansible_hostname }} state=RESUME + when: node_state2 != "idle" and node_state2 != "alloc" + rescue: + - name: Sleep 5 seconds + pause: + 
seconds: 10 + + - name: Grab Node State + shell: 'sinfo -h -o "%t" -n {{ ansible_hostname }}' + register: node_state + delegate_to: 127.0.0.1 + until: node_state.stdout.find("failure") == -1 + retries: 10 + delay: 5 + + - set_fact: + node_state2: "{{ node_state.stdout }}" + + - name: Update node state on bastion + become: true + command: scontrol update nodename={{ ansible_hostname }} state=RESUME + when: node_state2 != "idle" and node_state2 != "alloc" + register: result + retries: 10 + delay: 5 + until: result is not failed + delegate_to: 127.0.0.1 \ No newline at end of file From 60467123b33cabb8781f4eff9d4d7411df235423 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Thu, 9 Feb 2023 13:01:46 -0800 Subject: [PATCH 070/133] consolidated influxdb installation, and removed some redundant code --- .../roles/autoscaling_mon/tasks/ubuntu.yml | 12 +- .../roles/influxdb/tasks/config_influxdb.yml | 69 +++++++++ playbooks/roles/influxdb/tasks/el.yml | 73 +--------- .../influxdb/tasks/el_install_influxdb.yml | 24 ++++ playbooks/roles/influxdb/tasks/ubuntu.yml | 86 +----------- .../tasks/ubuntu_install_influxdb.yml | 28 ++++ playbooks/roles/openldap/vars/debian_vars.yml | 1 - playbooks/roles/packages/tasks/centos-7.yml | 1 + playbooks/roles/packages/tasks/debian.yml | 1 + playbooks/roles/packages/tasks/el-7.yml | 1 + playbooks/roles/packages/tasks/ol-7.yml | 1 + playbooks/roles/packages/tasks/ubuntu.yml | 1 + playbooks/roles/rack-aware/tasks/el.yml | 2 +- playbooks/roles/rack-aware/tasks/ubuntu.yml | 23 ++- playbooks/roles/telegraf/tasks/common.yml | 53 +++++++ playbooks/roles/telegraf/tasks/el.yml | 127 +++++++---------- playbooks/roles/telegraf/tasks/main.yml | 10 +- playbooks/roles/telegraf/tasks/ubuntu.yml | 131 +++++++----------- playbooks/site.yml | 11 +- 19 files changed, 324 insertions(+), 331 deletions(-) create mode 100644 playbooks/roles/influxdb/tasks/config_influxdb.yml create mode 100644 playbooks/roles/influxdb/tasks/el_install_influxdb.yml create mode 100644 playbooks/roles/influxdb/tasks/ubuntu_install_influxdb.yml create mode 100644 playbooks/roles/telegraf/tasks/common.yml diff --git a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml index adb81837..0224d4bc 100644 --- a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml +++ b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml @@ -239,12 +239,12 @@ # overwrite: yes # path: files/cluster.json -- name: Install pip - vars: - package_name: - - python3-pip - include_role: - name: safe_yum +# - name: Install pip +# vars: +# package_name: +# - python3-pip +# include_role: +# name: safe_yum - name: install protobuf v3.19.4 and mysql connector become: true diff --git a/playbooks/roles/influxdb/tasks/config_influxdb.yml b/playbooks/roles/influxdb/tasks/config_influxdb.yml new file mode 100644 index 00000000..022ddcd9 --- /dev/null +++ b/playbooks/roles/influxdb/tasks/config_influxdb.yml @@ -0,0 +1,69 @@ +--- +- name: Create /etc/opt/oci-hpc/passwords/influxdb + become: true + file: + path: /etc/opt/oci-hpc/passwords/influxdb + state: directory + owner: '{{ ansible_user }}' + mode: 0770 + group: '{{ ansible_user }}' + recurse: yes + +- name: Generate password for Influx admin and save it to /etc/opt/oci-hpc/passwords + set_fact: + tmp_pwd: "{{ lookup('password', + '/etc/opt/oci-hpc/passwords/influxdb/root.txt + chars=ascii_letters,digits,hexdigits') }}" + +- name: Get influx password from /etc/opt/oci-hpc/passwords + set_fact: + influx_admin_pwd: "{{ lookup('password', + 
'/etc/opt/oci-hpc/passwords/influxdb/root.txt + chars=ascii_letters,digits,hexdigits') }}" + +# - name: Add influxdb repository +# become: true +# yum_repository: +# name: influxdb +# description: InfluxDB Repository - RHEL $releasever +# baseurl: https://repos.influxdata.com/rhel/{{ ansible_distribution_major_version }}/$basearch/stable +# enabled: 1 +# gpgcheck: 1 +# gpgkey: https://repos.influxdata.com/influxdata-archive_compat.key + +# - name: Install InfluxDB +# vars: +# package_name: +# - influxdb +# package_state: latest +# include_role: +# name: safe_yum + +- name: Start InfluxDB + become: true + service: + name: influxdb + state: started + enabled: true + + +- name: Set configuration directory path + become: true + file: + path: "{{ influxdb_configuration_dir }}" + state: directory + +- name: Set templatized InfluxDB configuration + become: true + template: + src: influxdb.conf.j2 + dest: "{{ influxdb_configuration_dir }}/influxdb.conf" + force: yes + backup: yes + owner: influxdb + group: influxdb + mode: 0744 + register: influx_config + notify: restart influxdb + + diff --git a/playbooks/roles/influxdb/tasks/el.yml b/playbooks/roles/influxdb/tasks/el.yml index 71a7a06a..d8e45e5b 100755 --- a/playbooks/roles/influxdb/tasks/el.yml +++ b/playbooks/roles/influxdb/tasks/el.yml @@ -1,70 +1,7 @@ --- -- name: Create /etc/opt/oci-hpc/passwords/influxdb - become: true - file: - path: /etc/opt/oci-hpc/passwords/influxdb - state: directory - owner: '{{ ansible_user }}' - mode: 0770 - group: '{{ ansible_user }}' - recurse: yes - -- name: Generate password for Influx admin and save it to /etc/opt/oci-hpc/passwords - set_fact: - tmp_pwd: "{{ lookup('password', - '/etc/opt/oci-hpc/passwords/influxdb/root.txt - chars=ascii_letters,digits,hexdigits') }}" - -- name: Get influx password from /etc/opt/oci-hpc/passwords - set_fact: - influx_admin_pwd: "{{ lookup('password', - '/etc/opt/oci-hpc/passwords/influxdb/root.txt - chars=ascii_letters,digits,hexdigits') }}" - -- name: Add influxdb repository - become: true - yum_repository: - name: influxdb - description: InfluxDB Repository - RHEL $releasever - baseurl: https://repos.influxdata.com/rhel/{{ ansible_distribution_major_version }}/$basearch/stable - enabled: 1 - gpgcheck: 1 - gpgkey: https://repos.influxdata.com/influxdata-archive_compat.key - -- name: Install InfluxDB - vars: - package_name: - - influxdb - - python-pip - package_state: latest - include_role: - name: safe_yum - -- name: Start InfluxDB - become: true - service: - name: influxdb - state: started - enabled: true - - -- name: Set configuration directory path - become: true - file: - path: "{{ influxdb_configuration_dir }}" - state: directory - -- name: Set templatized InfluxDB configuration - become: true - template: - src: influxdb.conf.j2 - dest: "{{ influxdb_configuration_dir }}/influxdb.conf" - force: yes - backup: yes - owner: influxdb - group: influxdb - mode: 0744 - register: influx_config - notify: restart influxdb - +- name: install influxdb + include_tasks: el_install_influxdb.yml +- name: configure influxdb on bastion + include_tasks: config_influxdb.yml + when: "'bastion' in group_names" \ No newline at end of file diff --git a/playbooks/roles/influxdb/tasks/el_install_influxdb.yml b/playbooks/roles/influxdb/tasks/el_install_influxdb.yml new file mode 100644 index 00000000..1f3c0185 --- /dev/null +++ b/playbooks/roles/influxdb/tasks/el_install_influxdb.yml @@ -0,0 +1,24 @@ +--- +- name: Add influxdb repository + become: true + yum_repository: + name: influxdb + 
description: InfluxDB Repository - RHEL $releasever + baseurl: https://repos.influxdata.com/rhel/{{ ansible_distribution_major_version }}/$basearch/stable + enabled: 1 + gpgcheck: 1 + gpgkey: https://repos.influxdata.com/influxdata-archive_compat.key + +- name: Install InfluxDB + vars: + package_name: + - influxdb + package_state: latest + include_role: + name: safe_yum + +- name: install influx pip + become: true + pip: + name: influxdb + executable: pip3 \ No newline at end of file diff --git a/playbooks/roles/influxdb/tasks/ubuntu.yml b/playbooks/roles/influxdb/tasks/ubuntu.yml index cabb37ec..a4cf3be1 100644 --- a/playbooks/roles/influxdb/tasks/ubuntu.yml +++ b/playbooks/roles/influxdb/tasks/ubuntu.yml @@ -1,83 +1,7 @@ --- -- name: Create /etc/opt/oci-hpc/passwords/influxdb - become: true - file: - path: /etc/opt/oci-hpc/passwords/influxdb - state: directory - owner: '{{ ansible_user }}' - mode: 0770 - group: '{{ ansible_user }}' - recurse: yes - -- name: Generate password for Influx admin and save it to /etc/opt/oci-hpc/passwords - set_fact: - tmp_pwd: "{{ lookup('password', - '/etc/opt/oci-hpc/passwords/influxdb/root.txt - chars=ascii_letters,digits,hexdigits') }}" - -- name: Get influx password from /etc/opt/oci-hpc/passwords - set_fact: - influx_admin_pwd: "{{ lookup('password', - '/etc/opt/oci-hpc/passwords/influxdb/root.txt - chars=ascii_letters,digits,hexdigits') }}" - -# - name: Add influxdb repository -# become: true -# apt_repository: -# repo: "deb [arch=amd64 signed-by=https://repos.influxdata.com/influxdb.key] https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable" -# state: present - # name: influxdb - # description: InfluxDB Repository - Debian - # baseurl: deb https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable - # enabled: 1 - # gpgcheck: 1 - # gpgkey: https://repos.influxdata.com/influxdb.key - -- name: Add InfluxData's key - become: true - apt_key: - state: present - url: https://repos.influxdata.com/influxdata-archive_compat.key - -- name: Manage InfluxData APT repositories - become: true - apt_repository: - repo: deb https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable - state: present - -- name: Install InfluxDB - vars: - package_name: - - influxdb - package_state: latest - include_role: - name: safe_yum - -- name: Start InfluxDB - become: true - service: - name: influxdb - state: started - enabled: true - - -- name: Set configuration directory path - become: true - file: - path: "{{ influxdb_configuration_dir }}" - state: directory - -- name: Set templatized InfluxDB configuration - become: true - template: - src: influxdb.conf.j2 - dest: "{{ influxdb_configuration_dir }}/influxdb.conf" - force: yes - backup: yes - owner: influxdb - group: influxdb - mode: 0744 - register: influx_config - notify: restart influxdb - +- name: install influxdb + include_tasks: ubuntu_install_influxdb.yml +- name: configure influxdb on bastion + include_tasks: config_influxdb.yml + when: "'bastion' in group_names" \ No newline at end of file diff --git a/playbooks/roles/influxdb/tasks/ubuntu_install_influxdb.yml b/playbooks/roles/influxdb/tasks/ubuntu_install_influxdb.yml new file mode 100644 index 00000000..ef93e456 --- /dev/null +++ b/playbooks/roles/influxdb/tasks/ubuntu_install_influxdb.yml @@ -0,0 +1,28 @@ +--- +- name: Add InfluxData's key + become: true + apt_key: + state: present + url: 
https://repos.influxdata.com/influxdata-archive_compat.key + +- name: Manage InfluxData APT repositories + become: true + apt_repository: + repo: deb https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable + state: present + +- name: Install InfluxDB + vars: + package_name: + - influxdb + package_state: latest + include_role: + name: safe_yum + +- name: install influx pip + become: true + vars: + ansible_python_interpreter: /usr/bin/python3 + pip: + name: influxdb + executable: pip3 \ No newline at end of file diff --git a/playbooks/roles/openldap/vars/debian_vars.yml b/playbooks/roles/openldap/vars/debian_vars.yml index 724309b7..bb2fc0a6 100644 --- a/playbooks/roles/openldap/vars/debian_vars.yml +++ b/playbooks/roles/openldap/vars/debian_vars.yml @@ -5,7 +5,6 @@ openldap_packages: - slapd - ldap-utils - openssl - - python3-pip - libsasl2-dev - libldap2-dev - libssl-dev diff --git a/playbooks/roles/packages/tasks/centos-7.yml b/playbooks/roles/packages/tasks/centos-7.yml index 248d372d..30a8dace 100644 --- a/playbooks/roles/packages/tasks/centos-7.yml +++ b/playbooks/roles/packages/tasks/centos-7.yml @@ -6,6 +6,7 @@ - python2-cryptography - pssh - pdsh + - python3-pip package_state: latest include_role: name: safe_yum \ No newline at end of file diff --git a/playbooks/roles/packages/tasks/debian.yml b/playbooks/roles/packages/tasks/debian.yml index bd8c4991..d3911656 100644 --- a/playbooks/roles/packages/tasks/debian.yml +++ b/playbooks/roles/packages/tasks/debian.yml @@ -8,6 +8,7 @@ - pssh - pdsh - jq + - python3-pip package_state: latest include_role: name: safe_yum diff --git a/playbooks/roles/packages/tasks/el-7.yml b/playbooks/roles/packages/tasks/el-7.yml index d3bbd6e0..fb61f6a8 100755 --- a/playbooks/roles/packages/tasks/el-7.yml +++ b/playbooks/roles/packages/tasks/el-7.yml @@ -7,6 +7,7 @@ - python3-oci-cli - pssh - pdsh + - python3-pip package_state: latest include_role: name: safe_yum diff --git a/playbooks/roles/packages/tasks/ol-7.yml b/playbooks/roles/packages/tasks/ol-7.yml index cfd59817..f0d58a2a 100644 --- a/playbooks/roles/packages/tasks/ol-7.yml +++ b/playbooks/roles/packages/tasks/ol-7.yml @@ -7,6 +7,7 @@ - python36-oci-cli - pssh - pdsh + - python3-pip package_state: latest package_repo: "epel,ol7_developer_EPEL" include_role: diff --git a/playbooks/roles/packages/tasks/ubuntu.yml b/playbooks/roles/packages/tasks/ubuntu.yml index 26f1acbb..408e6075 100644 --- a/playbooks/roles/packages/tasks/ubuntu.yml +++ b/playbooks/roles/packages/tasks/ubuntu.yml @@ -10,6 +10,7 @@ - pdsh - python3-netaddr - jq + - python3-pip package_state: latest include_role: name: safe_yum diff --git a/playbooks/roles/rack-aware/tasks/el.yml b/playbooks/roles/rack-aware/tasks/el.yml index 1e68e989..adedeaa3 100644 --- a/playbooks/roles/rack-aware/tasks/el.yml +++ b/playbooks/roles/rack-aware/tasks/el.yml @@ -22,7 +22,7 @@ owner: "{{ ansible_user }}" group: "{{ privilege_group_name }}" -- name: "Safe Yum install of latest {{package_name}}" +- name: Copy node_ordering_by_rack.py block: - name: copy node_ordering_by_rack.py copy: diff --git a/playbooks/roles/rack-aware/tasks/ubuntu.yml b/playbooks/roles/rack-aware/tasks/ubuntu.yml index 341a2ed5..b21dc0c7 100644 --- a/playbooks/roles/rack-aware/tasks/ubuntu.yml +++ b/playbooks/roles/rack-aware/tasks/ubuntu.yml @@ -1,21 +1,20 @@ -- name: Install Pip3 - vars: - package_name: - - python3-pip - package_state: latest - include_role: - name: safe_yum - ignore_errors: true +# - name: Install Pip3 +# vars: 
+# package_name: +# - python3-pip +# package_state: latest +# include_role: +# name: safe_yum +# ignore_errors: true - name: install pssh and parallel-ssh become: true + vars: + ansible_python_interpreter: /usr/bin/python3 pip: name: ['pssh', 'parallel-ssh'] executable: pip3 state: latest - with_items: - - pssh - - parallel-ssh ignore_errors: yes - name: Make sure /opt/oci-hpc/bin/ exists @@ -28,7 +27,7 @@ owner: "{{ ansible_user }}" group: "{{ privilege_group_name }}" -- name: "Safe Yum install of latest {{package_name}}" +- name: Copy node_ordering_by_rack.py block: - name: copy node_ordering_by_rack.py copy: diff --git a/playbooks/roles/telegraf/tasks/common.yml b/playbooks/roles/telegraf/tasks/common.yml new file mode 100644 index 00000000..6e531449 --- /dev/null +++ b/playbooks/roles/telegraf/tasks/common.yml @@ -0,0 +1,53 @@ +--- +- name: Create database + shell: "python3 -c \"import influxdb; influxdb.InfluxDBClient(host='{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}', port=8086).create_database('telegraph')\"" + +#- name: Create database +# influxdb_database: +# hostname: "{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}" +# database_name: "telegraf" +# run_once: true + +- name: Install telegraf + vars: + package_name: + - telegraf + package_state: latest + include_role: + name: safe_yum + +- name: copy telegraf.conf + become: true + copy: + src: "{{ item }}" + dest: /etc/telegraf/{{item}} + force: yes + backup: yes + owner: telegraf + group: telegraf + mode: 0744 + with_items: + - telegraf.conf + +- name: render conf files + become: true + template: + src: "{{ item }}.j2" + dest: /etc/telegraf/telegraf.d/{{item}} + force: yes + backup: yes + owner: telegraf + group: telegraf + mode: 0744 + with_items: + - infiniband.conf + - influxdb.conf + - net.conf + - infiniband_hw_counters.conf +- name: restart telegraf + become: true + service: + name: telegraf + state: restarted + enabled: yes + diff --git a/playbooks/roles/telegraf/tasks/el.yml b/playbooks/roles/telegraf/tasks/el.yml index 9ad6e3af..e08811df 100755 --- a/playbooks/roles/telegraf/tasks/el.yml +++ b/playbooks/roles/telegraf/tasks/el.yml @@ -1,76 +1,53 @@ ---- -- name: Add influxdb repository - become: true - yum_repository: - name: influxdb - description: InfluxDB Repository - RHEL $releasever - baseurl: https://repos.influxdata.com/rhel/{{ ansible_distribution_major_version }}/$basearch/stable - enabled: 1 - gpgcheck: 1 - gpgkey: https://repos.influxdata.com/influxdata-archive_compat.key - -- name: Install pip - vars: - package_name: - - python3-pip - include_role: - name: safe_yum - -- name: install influx pip - become: true - pip: - name: influxdb - executable: pip3 - -- name: Create database - shell: "python3 -c \"import influxdb; influxdb.InfluxDBClient(host='{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}', port=8086).create_database('telegraph')\"" - -#- name: Create database -# influxdb_database: -# hostname: "{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}" -# database_name: "telegraf" -# run_once: true - -- name: Install telegraf - vars: - package_name: - - telegraf - package_state: latest - include_role: - name: safe_yum - -- name: copy telegraf.conf - become: true - copy: - src: "{{ item }}" - dest: /etc/telegraf/{{item}} - force: yes - backup: yes - owner: telegraf - group: telegraf - mode: 0744 - with_items: - - telegraf.conf - -- name: render conf files - become: true - template: - src: "{{ item }}.j2" - dest: /etc/telegraf/telegraf.d/{{item}} - force: yes - backup: yes - owner: telegraf - 
group: telegraf - mode: 0744 - with_items: - - infiniband.conf - - influxdb.conf - - net.conf - - infiniband_hw_counters.conf -- name: restart telegraf - become: true - service: - name: telegraf - state: restarted - enabled: yes +# --- +# - name: Create database +# shell: "python3 -c \"import influxdb; influxdb.InfluxDBClient(host='{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}', port=8086).create_database('telegraph')\"" + +# #- name: Create database +# # influxdb_database: +# # hostname: "{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}" +# # database_name: "telegraf" +# # run_once: true + +# - name: Install telegraf +# vars: +# package_name: +# - telegraf +# package_state: latest +# include_role: +# name: safe_yum + +# - name: copy telegraf.conf +# become: true +# copy: +# src: "{{ item }}" +# dest: /etc/telegraf/{{item}} +# force: yes +# backup: yes +# owner: telegraf +# group: telegraf +# mode: 0744 +# with_items: +# - telegraf.conf + +# - name: render conf files +# become: true +# template: +# src: "{{ item }}.j2" +# dest: /etc/telegraf/telegraf.d/{{item}} +# force: yes +# backup: yes +# owner: telegraf +# group: telegraf +# mode: 0744 +# with_items: +# - infiniband.conf +# - influxdb.conf +# - net.conf +# - infiniband_hw_counters.conf +# - name: restart telegraf +# become: true +# service: +# name: telegraf +# state: restarted +# enabled: yes diff --git a/playbooks/roles/telegraf/tasks/main.yml b/playbooks/roles/telegraf/tasks/main.yml index e3450c91..cfc6c338 100755 --- a/playbooks/roles/telegraf/tasks/main.yml +++ b/playbooks/roles/telegraf/tasks/main.yml @@ -1,4 +1,6 @@ -- include: el.yml - when: ansible_os_family == 'RedHat' -- include: ubuntu.yml - when: ansible_os_family == 'Debian' +# - include: el.yml +# when: ansible_os_family == 'RedHat' +# - include: ubuntu.yml +# when: ansible_os_family == 'Debian' +- include: common.yml + when: ansible_os_family == 'RedHat' or ansible_os_family == 'Debian' diff --git a/playbooks/roles/telegraf/tasks/ubuntu.yml b/playbooks/roles/telegraf/tasks/ubuntu.yml index 4a7fbf89..e08811df 100644 --- a/playbooks/roles/telegraf/tasks/ubuntu.yml +++ b/playbooks/roles/telegraf/tasks/ubuntu.yml @@ -1,80 +1,53 @@ ---- -- name: Add InfluxData's key - become: true - apt_key: - state: present - url: https://repos.influxdata.com/influxdata-archive_compat.key - -- name: Manage InfluxData APT repositories - become: true - apt_repository: - repo: deb https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable - state: present - -- name: Install pip - vars: - package_name: - - python3-pip - include_role: - name: safe_yum - -- name: install influx pip - become: true - vars: - ansible_python_interpreter: /usr/bin/python3 - pip: - name: influxdb - executable: pip3 - -- name: Create database - shell: "python3 -c \"import influxdb; influxdb.InfluxDBClient(host='{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}', port=8086).create_database('telegraph')\"" - -#- name: Create database -# influxdb_database: -# hostname: "{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}" -# database_name: "telegraf" -# run_once: true - -- name: Install telegraf - vars: - package_name: - - telegraf - package_state: latest - include_role: - name: safe_yum - -- name: copy telegraf.conf - become: true - copy: - src: "{{ item }}" - dest: /etc/telegraf/{{item}} - force: yes - backup: yes - owner: telegraf - group: telegraf - mode: 0744 - with_items: - - telegraf.conf - -- name: render conf files - become: true - template: - src: 
"{{ item }}.j2" - dest: /etc/telegraf/telegraf.d/{{item}} - force: yes - backup: yes - owner: telegraf - group: telegraf - mode: 0744 - with_items: - - infiniband.conf - - influxdb.conf - - net.conf - - infiniband_hw_counters.conf -- name: restart telegraf - become: true - service: - name: telegraf - state: restarted - enabled: yes +# --- +# - name: Create database +# shell: "python3 -c \"import influxdb; influxdb.InfluxDBClient(host='{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}', port=8086).create_database('telegraph')\"" + +# #- name: Create database +# # influxdb_database: +# # hostname: "{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}" +# # database_name: "telegraf" +# # run_once: true + +# - name: Install telegraf +# vars: +# package_name: +# - telegraf +# package_state: latest +# include_role: +# name: safe_yum + +# - name: copy telegraf.conf +# become: true +# copy: +# src: "{{ item }}" +# dest: /etc/telegraf/{{item}} +# force: yes +# backup: yes +# owner: telegraf +# group: telegraf +# mode: 0744 +# with_items: +# - telegraf.conf + +# - name: render conf files +# become: true +# template: +# src: "{{ item }}.j2" +# dest: /etc/telegraf/telegraf.d/{{item}} +# force: yes +# backup: yes +# owner: telegraf +# group: telegraf +# mode: 0744 +# with_items: +# - infiniband.conf +# - influxdb.conf +# - net.conf +# - infiniband_hw_counters.conf +# - name: restart telegraf +# become: true +# service: +# name: telegraf +# state: restarted +# enabled: yes diff --git a/playbooks/site.yml b/playbooks/site.yml index f5f227f1..05f610c9 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -256,11 +256,17 @@ - include_role: name: yaml -- hosts: bastion +- hosts: all tasks: - include_role: name: influxdb when: monitoring|default(false)|bool + - include_role: + name: telegraf + when: monitoring|default(false)|bool + +- hosts: bastion + tasks: - include_role: name: grafana when: monitoring|default(false)|bool @@ -304,9 +310,6 @@ - include_role: name: spack when: spack|default(false)|bool - - include_role: - name: telegraf - when: monitoring|default(false)|bool - include_role: name: slurm when: slurm|default(false)|bool From f2bcf031b42af876cd509434a1f2d292d56b947f Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Thu, 9 Feb 2023 13:02:53 -0800 Subject: [PATCH 071/133] removing commented code --- playbooks/roles/rack-aware/tasks/ubuntu.yml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/playbooks/roles/rack-aware/tasks/ubuntu.yml b/playbooks/roles/rack-aware/tasks/ubuntu.yml index b21dc0c7..c80a4cbb 100644 --- a/playbooks/roles/rack-aware/tasks/ubuntu.yml +++ b/playbooks/roles/rack-aware/tasks/ubuntu.yml @@ -1,12 +1,3 @@ -# - name: Install Pip3 -# vars: -# package_name: -# - python3-pip -# package_state: latest -# include_role: -# name: safe_yum -# ignore_errors: true - - name: install pssh and parallel-ssh become: true vars: From e91690c1a7f5d864c82c2f8544e8dd2bcc537372 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Thu, 9 Feb 2023 15:25:29 -0800 Subject: [PATCH 072/133] updated validation script to include pcie for ubuntu --- README.md | 6 ------ bin/{pcie.sh => pcie_el.sh} | 0 bin/pcie_ubuntu.sh | 8 ++++++++ bin/validation.py | 25 +++++++++++++++++++++---- 4 files changed, 29 insertions(+), 10 deletions(-) rename bin/{pcie.sh => pcie_el.sh} (100%) create mode 100644 bin/pcie_ubuntu.sh diff --git a/README.md b/README.md index a992d277..b487add0 100644 --- a/README.md +++ b/README.md @@ -342,20 +342,14 @@ validate -n y --> This will validate that the number of nodes is 
consistent acro validate -n y -cn --> This will validate that the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, inventory files. It will also check whether md5 sum of /etc/hosts file on all nodes matches that on bastion. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file. -validate -p y --> This will run the pcie bandwidth check. The clusters considered will be the default cluster if any and cluster(s) found in /opt/oci-hpc/autoscaling/clusters directory. The number of nodes considered will be from the resize script using the clusters we got before. - validate -p y -cn --> This will run the pcie bandwidth check. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file. validate -p --> This will run the pcie bandwidth check on the hosts provided in the file given. The pcie host file should have a host name on each line. -validate -g y --> This will run the GPU throttle check. The clusters considered will be the default cluster if any and cluster(s) found in /opt/oci-hpc/autoscaling/clusters directory. The number of nodes considered will be from the resize script using the clusters we got before. - validate -g y -cn --> This will run the GPU throttle check. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file. validate -g --> This will run the GPU throttle check on the hosts provided in the file given. The gpu check host file should have a host name on each line. -validate -e y --> This will run the /etc/hosts md5 sum check. The clusters considered will be the default cluster if any and cluster(s) found in /opt/oci-hpc/autoscaling/clusters directory. The number of nodes considered will be from the resize script using the clusters we got before. - validate -e y -cn --> This will run the GPU throttle check. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file. validate -e --> This will run the /etc/hosts md5 sum check on the hosts provided in the file given. The md5 sum check host file should have a host name on each line. 
diff --git a/bin/pcie.sh b/bin/pcie_el.sh similarity index 100% rename from bin/pcie.sh rename to bin/pcie_el.sh diff --git a/bin/pcie_ubuntu.sh b/bin/pcie_ubuntu.sh new file mode 100644 index 00000000..95c5c456 --- /dev/null +++ b/bin/pcie_ubuntu.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +for dev in `/usr/bin/lspci | grep ConnectX-5 | awk '{print $1}'` +do + echo ${dev} + sudo lspci -vvv -s ${dev} | grep LnkSta: +done + diff --git a/bin/validation.py b/bin/validation.py index 0732aa5f..45b37ac2 100644 --- a/bin/validation.py +++ b/bin/validation.py @@ -11,7 +11,11 @@ # change ownership of all files to opc so that the files can be copied def changeOwner(path): - cmd = f'sudo chown -R opc:opc {path}' + out = subprocess.Popen(["whoami"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + username = stdout.split("\n") + del username[-1] + cmd = f'sudo chown -R {username[0]}:{username[0]} {path}' run_cmd(cmd) @@ -408,10 +412,23 @@ def inventoryNodes(metadata, cluster_names): def pcie_check(hostfile, path): - out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/pcie.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) - stdout,stderr = out.communicate() - out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/pcie.sh\" ; done > "+path+"/pcie-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + out = subprocess.Popen(["cat /etc/os-release | grep PRETTY_NAME="],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) stdout,stderr = out.communicate() + os_name = stdout.split("\n") + del os_name[-1] + if "Linux" in os_name[0]: + out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/pcie_el.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/pcie.sh\" ; done > "+path+"/pcie-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + elif "Ubuntu" in os_name[0]: + out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/pcie_ubuntu.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/pcie.sh\" ; done > "+path+"/pcie-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + else: + print("Cannot run pcie check as OS is not determined to be Linux or Ubuntu") + def gpu_throttle(hostfile, path): out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/gpu_throttle.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) From dfe81c7066ee6c54f5108e93f64d3d801f5ff611 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Thu, 9 Feb 2023 15:33:20 -0800 Subject: [PATCH 073/133] deleted files that became obsolete for telegraf role and removed some commented code --- .../roles/influxdb/tasks/config_influxdb.yml | 18 ------- playbooks/roles/telegraf/tasks/el.yml | 53 ------------------- playbooks/roles/telegraf/tasks/main.yml | 4 -- playbooks/roles/telegraf/tasks/ubuntu.yml | 53 ------------------- 4 files changed, 128 deletions(-) delete mode 100755 playbooks/roles/telegraf/tasks/el.yml delete mode 100644 playbooks/roles/telegraf/tasks/ubuntu.yml 
diff --git a/playbooks/roles/influxdb/tasks/config_influxdb.yml b/playbooks/roles/influxdb/tasks/config_influxdb.yml index 022ddcd9..96d0ec86 100644 --- a/playbooks/roles/influxdb/tasks/config_influxdb.yml +++ b/playbooks/roles/influxdb/tasks/config_influxdb.yml @@ -21,24 +21,6 @@ '/etc/opt/oci-hpc/passwords/influxdb/root.txt chars=ascii_letters,digits,hexdigits') }}" -# - name: Add influxdb repository -# become: true -# yum_repository: -# name: influxdb -# description: InfluxDB Repository - RHEL $releasever -# baseurl: https://repos.influxdata.com/rhel/{{ ansible_distribution_major_version }}/$basearch/stable -# enabled: 1 -# gpgcheck: 1 -# gpgkey: https://repos.influxdata.com/influxdata-archive_compat.key - -# - name: Install InfluxDB -# vars: -# package_name: -# - influxdb -# package_state: latest -# include_role: -# name: safe_yum - - name: Start InfluxDB become: true service: diff --git a/playbooks/roles/telegraf/tasks/el.yml b/playbooks/roles/telegraf/tasks/el.yml deleted file mode 100755 index e08811df..00000000 --- a/playbooks/roles/telegraf/tasks/el.yml +++ /dev/null @@ -1,53 +0,0 @@ -# --- -# - name: Create database -# shell: "python3 -c \"import influxdb; influxdb.InfluxDBClient(host='{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}', port=8086).create_database('telegraph')\"" - -# #- name: Create database -# # influxdb_database: -# # hostname: "{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}" -# # database_name: "telegraf" -# # run_once: true - -# - name: Install telegraf -# vars: -# package_name: -# - telegraf -# package_state: latest -# include_role: -# name: safe_yum - -# - name: copy telegraf.conf -# become: true -# copy: -# src: "{{ item }}" -# dest: /etc/telegraf/{{item}} -# force: yes -# backup: yes -# owner: telegraf -# group: telegraf -# mode: 0744 -# with_items: -# - telegraf.conf - -# - name: render conf files -# become: true -# template: -# src: "{{ item }}.j2" -# dest: /etc/telegraf/telegraf.d/{{item}} -# force: yes -# backup: yes -# owner: telegraf -# group: telegraf -# mode: 0744 -# with_items: -# - infiniband.conf -# - influxdb.conf -# - net.conf -# - infiniband_hw_counters.conf -# - name: restart telegraf -# become: true -# service: -# name: telegraf -# state: restarted -# enabled: yes - diff --git a/playbooks/roles/telegraf/tasks/main.yml b/playbooks/roles/telegraf/tasks/main.yml index cfc6c338..b1d4a1f1 100755 --- a/playbooks/roles/telegraf/tasks/main.yml +++ b/playbooks/roles/telegraf/tasks/main.yml @@ -1,6 +1,2 @@ -# - include: el.yml -# when: ansible_os_family == 'RedHat' -# - include: ubuntu.yml -# when: ansible_os_family == 'Debian' - include: common.yml when: ansible_os_family == 'RedHat' or ansible_os_family == 'Debian' diff --git a/playbooks/roles/telegraf/tasks/ubuntu.yml b/playbooks/roles/telegraf/tasks/ubuntu.yml deleted file mode 100644 index e08811df..00000000 --- a/playbooks/roles/telegraf/tasks/ubuntu.yml +++ /dev/null @@ -1,53 +0,0 @@ -# --- -# - name: Create database -# shell: "python3 -c \"import influxdb; influxdb.InfluxDBClient(host='{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}', port=8086).create_database('telegraph')\"" - -# #- name: Create database -# # influxdb_database: -# # hostname: "{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}" -# # database_name: "telegraf" -# # run_once: true - -# - name: Install telegraf -# vars: -# package_name: -# - telegraf -# package_state: latest -# include_role: -# name: safe_yum - -# - name: copy telegraf.conf -# become: true -# copy: -# src: "{{ item }}" -# dest: 
/etc/telegraf/{{item}} -# force: yes -# backup: yes -# owner: telegraf -# group: telegraf -# mode: 0744 -# with_items: -# - telegraf.conf - -# - name: render conf files -# become: true -# template: -# src: "{{ item }}.j2" -# dest: /etc/telegraf/telegraf.d/{{item}} -# force: yes -# backup: yes -# owner: telegraf -# group: telegraf -# mode: 0744 -# with_items: -# - infiniband.conf -# - influxdb.conf -# - net.conf -# - infiniband_hw_counters.conf -# - name: restart telegraf -# become: true -# service: -# name: telegraf -# state: restarted -# enabled: yes - From e318a770580bc9ee59e2c1b73f596cc77ebdde3b Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Mon, 13 Feb 2023 11:33:46 -0800 Subject: [PATCH 074/133] removed tags --- playbooks/roles/mpi-hostfiles/tasks/common.yml | 2 -- playbooks/roles/mysql/tasks/centos.yml | 1 - playbooks/roles/mysql/tasks/debian.yml | 1 - playbooks/roles/mysql/tasks/el.yml | 1 - playbooks/roles/nfs-client/tasks/debian.yml | 1 - playbooks/roles/nfs-client/tasks/el.yml | 1 - playbooks/roles/nfs-client/tasks/ubuntu.yml | 1 - playbooks/roles/openldap/tasks/debian.yml | 9 --------- playbooks/roles/openldap/tasks/el-7.yml | 2 -- 9 files changed, 19 deletions(-) diff --git a/playbooks/roles/mpi-hostfiles/tasks/common.yml b/playbooks/roles/mpi-hostfiles/tasks/common.yml index ec1f8330..a713c88a 100644 --- a/playbooks/roles/mpi-hostfiles/tasks/common.yml +++ b/playbooks/roles/mpi-hostfiles/tasks/common.yml @@ -12,7 +12,6 @@ mode: '0644' owner: "{{ ansible_user }}" group: "{{ ansible_user }}" - tags: hostfile delegate_to: 127.0.0.1 run_once: true when: cluster_network|bool @@ -24,7 +23,6 @@ mode: '0644' owner: "{{ ansible_user }}" group: "{{ ansible_user }}" - tags: hostfile delegate_to: 127.0.0.1 run_once: true diff --git a/playbooks/roles/mysql/tasks/centos.yml b/playbooks/roles/mysql/tasks/centos.yml index 3b4720cb..710c4a81 100644 --- a/playbooks/roles/mysql/tasks/centos.yml +++ b/playbooks/roles/mysql/tasks/centos.yml @@ -29,7 +29,6 @@ package_name: '{{ mariadb_packages }}' include_role: name: safe_yum - tags: install-only - name: Update SELinux context for {{ mysql_db_path }} diff --git a/playbooks/roles/mysql/tasks/debian.yml b/playbooks/roles/mysql/tasks/debian.yml index 98eb655c..9d4b47be 100644 --- a/playbooks/roles/mysql/tasks/debian.yml +++ b/playbooks/roles/mysql/tasks/debian.yml @@ -34,7 +34,6 @@ package_name: '{{ deb_mariadb_packages }}' include_role: name: safe_yum - tags: install-only - name: Ensure {{ mysql_db_path }} exists become: true diff --git a/playbooks/roles/mysql/tasks/el.yml b/playbooks/roles/mysql/tasks/el.yml index ea71c748..d893abb8 100644 --- a/playbooks/roles/mysql/tasks/el.yml +++ b/playbooks/roles/mysql/tasks/el.yml @@ -36,7 +36,6 @@ package_repo: ol7_MySQL80 include_role: name: safe_yum - tags: install-only - name: Update SELinux context for {{ mysql_db_path }} become: true diff --git a/playbooks/roles/nfs-client/tasks/debian.yml b/playbooks/roles/nfs-client/tasks/debian.yml index 4d8ed999..6d6a84f7 100644 --- a/playbooks/roles/nfs-client/tasks/debian.yml +++ b/playbooks/roles/nfs-client/tasks/debian.yml @@ -3,7 +3,6 @@ ansible.builtin.package: name: '{{ deb_packages }}' state: present - tags: install-only - name: create share directory become: true diff --git a/playbooks/roles/nfs-client/tasks/el.yml b/playbooks/roles/nfs-client/tasks/el.yml index cdcd5936..944d9fc2 100755 --- a/playbooks/roles/nfs-client/tasks/el.yml +++ b/playbooks/roles/nfs-client/tasks/el.yml @@ -4,7 +4,6 @@ package_name: '{{ nfs_rpm_packages }}' include_role: name: 
safe_yum - tags: install-only - name: create share directory become: true diff --git a/playbooks/roles/nfs-client/tasks/ubuntu.yml b/playbooks/roles/nfs-client/tasks/ubuntu.yml index a2af7e1c..e512a800 100644 --- a/playbooks/roles/nfs-client/tasks/ubuntu.yml +++ b/playbooks/roles/nfs-client/tasks/ubuntu.yml @@ -3,7 +3,6 @@ ansible.builtin.package: name: "{{ nfs_deb_packages }}" state: present - tags: install-only - name: create share directory become: true diff --git a/playbooks/roles/openldap/tasks/debian.yml b/playbooks/roles/openldap/tasks/debian.yml index 2c24d2e0..5234df9a 100644 --- a/playbooks/roles/openldap/tasks/debian.yml +++ b/playbooks/roles/openldap/tasks/debian.yml @@ -6,27 +6,18 @@ name: apparmor state: stopped failed_when: false - tags: - - configuration - - apparmor - name: Remove Apparmor service service: name: apparmor enabled: false failed_when: false - tags: - - configuration - - apparmor - name: Remove Apparmor package apt: name: apparmor state: absent purge: true - tags: - - configuration - - apparmor - name: Create /etc/opt/oci-hpc/passwords/openldap become: true diff --git a/playbooks/roles/openldap/tasks/el-7.yml b/playbooks/roles/openldap/tasks/el-7.yml index e8b31246..3f55faac 100644 --- a/playbooks/roles/openldap/tasks/el-7.yml +++ b/playbooks/roles/openldap/tasks/el-7.yml @@ -18,7 +18,6 @@ package_state: present include_role: name: safe_yum - tags: install-only - name: Generate openldap root password set_fact: @@ -37,7 +36,6 @@ package_name: "{{openldap_packages}}" include_role: name: safe_yum - tags: install-only - block: - name: Selinux fcontext on files From 70aaa257440b6c5cf548903d0aab663403e4b547 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 13 Feb 2023 23:20:46 -0700 Subject: [PATCH 075/133] Fix for login marketplace image without login node --- locals.tf | 2 +- marketplace.tf | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/locals.tf b/locals.tf index dc3b2d87..50faf49f 100755 --- a/locals.tf +++ b/locals.tf @@ -30,7 +30,7 @@ locals { bastion_image = var.use_standard_image ? oci_core_app_catalog_subscription.bastion_mp_image_subscription[0].listing_resource_id : local.custom_bastion_image_ocid - login_image = var.use_standard_image_login || var.use_marketplace_image_login ? oci_core_app_catalog_subscription.login_mp_image_subscription[0].listing_resource_id : local.custom_login_image_ocid + login_image = var.login_node || var.use_standard_image_login || var.use_marketplace_image_login ? oci_core_app_catalog_subscription.login_mp_image_subscription[0].listing_resource_id : local.custom_login_image_ocid cluster_network_image = var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid diff --git a/marketplace.tf b/marketplace.tf index 0c9d6bc7..7f550815 100755 --- a/marketplace.tf +++ b/marketplace.tf @@ -77,12 +77,12 @@ resource "oci_core_app_catalog_subscription" "bastion_mp_image_subscription" { } data "oci_core_app_catalog_listing_resource_versions" "login_app_catalog_listing_resource_versions" { - count = var.use_marketplace_image_login || var.use_standard_image_login ? 1 : 0 + count = var.login_node || var.use_marketplace_image_login || var.use_standard_image_login ? 1 : 0 listing_id = local.mp_login_listing_id } resource "oci_core_app_catalog_listing_resource_version_agreement" "login_mp_image_agreement" { - count = var.use_marketplace_image_login || var.use_standard_image_login ? 
1 : 0 + count = var.login_node || var.use_marketplace_image_login || var.use_standard_image_login ? 1 : 0 listing_id = local.mp_login_listing_id listing_resource_version = local.mp_login_version_id @@ -90,7 +90,7 @@ resource "oci_core_app_catalog_listing_resource_version_agreement" "login_mp_ima } resource "oci_core_app_catalog_subscription" "login_mp_image_subscription" { - count = var.use_marketplace_image_login || var.use_standard_image_login ? 1 : 0 + count = var.login_node || var.use_marketplace_image_login || var.use_standard_image_login ? 1 : 0 compartment_id = var.targetCompartment eula_link = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].eula_link listing_id = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].listing_id From d29ee51255049708c0cbb7ad410ef2a02d2c5fe2 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 14 Feb 2023 09:29:00 -0700 Subject: [PATCH 076/133] Merge branch '2.10.1' of https://github.com/oci-hpc/oci-hpc-clusternetwork-dev into 2.10.1 --- README.md | 6 -- bin/{pcie.sh => pcie_el.sh} | 0 bin/pcie_ubuntu.sh | 8 ++ bin/validation.py | 25 +++++- .../roles/autoscaling_mon/tasks/ubuntu.yml | 12 +-- .../roles/influxdb/tasks/config_influxdb.yml | 51 +++++++++++ playbooks/roles/influxdb/tasks/el.yml | 73 ++-------------- .../influxdb/tasks/el_install_influxdb.yml | 24 ++++++ playbooks/roles/influxdb/tasks/ubuntu.yml | 86 ++----------------- .../tasks/ubuntu_install_influxdb.yml | 28 ++++++ playbooks/roles/openldap/vars/debian_vars.yml | 1 - playbooks/roles/packages/tasks/centos-7.yml | 1 + playbooks/roles/packages/tasks/debian.yml | 1 + playbooks/roles/packages/tasks/el-7.yml | 1 + playbooks/roles/packages/tasks/ol-7.yml | 1 + playbooks/roles/packages/tasks/ubuntu.yml | 1 + playbooks/roles/rack-aware/tasks/el.yml | 2 +- playbooks/roles/rack-aware/tasks/ubuntu.yml | 16 +--- .../telegraf/tasks/{el.yml => common.yml} | 23 ----- playbooks/roles/telegraf/tasks/main.yml | 6 +- playbooks/roles/telegraf/tasks/ubuntu.yml | 80 ----------------- playbooks/site.yml | 11 ++- 22 files changed, 166 insertions(+), 291 deletions(-) rename bin/{pcie.sh => pcie_el.sh} (100%) create mode 100644 bin/pcie_ubuntu.sh create mode 100644 playbooks/roles/influxdb/tasks/config_influxdb.yml create mode 100644 playbooks/roles/influxdb/tasks/el_install_influxdb.yml create mode 100644 playbooks/roles/influxdb/tasks/ubuntu_install_influxdb.yml rename playbooks/roles/telegraf/tasks/{el.yml => common.yml} (67%) mode change 100755 => 100644 delete mode 100644 playbooks/roles/telegraf/tasks/ubuntu.yml diff --git a/README.md b/README.md index a992d277..b487add0 100644 --- a/README.md +++ b/README.md @@ -342,20 +342,14 @@ validate -n y --> This will validate that the number of nodes is consistent acro validate -n y -cn --> This will validate that the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, inventory files. It will also check whether md5 sum of /etc/hosts file on all nodes matches that on bastion. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file. -validate -p y --> This will run the pcie bandwidth check. The clusters considered will be the default cluster if any and cluster(s) found in /opt/oci-hpc/autoscaling/clusters directory. The number of nodes considered will be from the resize script using the clusters we got before. 
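A typical combined run (assuming the flags can be combined in a single invocation; the file paths below are examples only) could look like:

validate -n y -e y -cn /home/opc/clusters.txt
validate -p /home/opc/pcie_hosts.txt

Here clusters.txt would name the clusters to check and pcie_hosts.txt would list one host per line, as described for the individual options in this section.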
- validate -p y -cn --> This will run the pcie bandwidth check. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file. validate -p --> This will run the pcie bandwidth check on the hosts provided in the file given. The pcie host file should have a host name on each line. -validate -g y --> This will run the GPU throttle check. The clusters considered will be the default cluster if any and cluster(s) found in /opt/oci-hpc/autoscaling/clusters directory. The number of nodes considered will be from the resize script using the clusters we got before. - validate -g y -cn --> This will run the GPU throttle check. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file. validate -g --> This will run the GPU throttle check on the hosts provided in the file given. The gpu check host file should have a host name on each line. -validate -e y --> This will run the /etc/hosts md5 sum check. The clusters considered will be the default cluster if any and cluster(s) found in /opt/oci-hpc/autoscaling/clusters directory. The number of nodes considered will be from the resize script using the clusters we got before. - validate -e y -cn --> This will run the GPU throttle check. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file. validate -e --> This will run the /etc/hosts md5 sum check on the hosts provided in the file given. The md5 sum check host file should have a host name on each line. diff --git a/bin/pcie.sh b/bin/pcie_el.sh similarity index 100% rename from bin/pcie.sh rename to bin/pcie_el.sh diff --git a/bin/pcie_ubuntu.sh b/bin/pcie_ubuntu.sh new file mode 100644 index 00000000..95c5c456 --- /dev/null +++ b/bin/pcie_ubuntu.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +for dev in `/usr/bin/lspci | grep ConnectX-5 | awk '{print $1}'` +do + echo ${dev} + sudo lspci -vvv -s ${dev} | grep LnkSta: +done + diff --git a/bin/validation.py b/bin/validation.py index 0732aa5f..45b37ac2 100644 --- a/bin/validation.py +++ b/bin/validation.py @@ -11,7 +11,11 @@ # change ownership of all files to opc so that the files can be copied def changeOwner(path): - cmd = f'sudo chown -R opc:opc {path}' + out = subprocess.Popen(["whoami"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + username = stdout.split("\n") + del username[-1] + cmd = f'sudo chown -R {username[0]}:{username[0]} {path}' run_cmd(cmd) @@ -408,10 +412,23 @@ def inventoryNodes(metadata, cluster_names): def pcie_check(hostfile, path): - out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/pcie.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) - stdout,stderr = out.communicate() - out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/pcie.sh\" ; done > "+path+"/pcie-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + out = subprocess.Popen(["cat /etc/os-release | grep PRETTY_NAME="],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) stdout,stderr = out.communicate() + os_name = stdout.split("\n") + del os_name[-1] + if "Linux" in os_name[0]: + out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/pcie_el.sh 
~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/pcie.sh\" ; done > "+path+"/pcie-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + elif "Ubuntu" in os_name[0]: + out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/pcie_ubuntu.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/pcie.sh\" ; done > "+path+"/pcie-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + else: + print("Cannot run pcie check as OS is not determined to be Linux or Ubuntu") + def gpu_throttle(hostfile, path): out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/gpu_throttle.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) diff --git a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml index adb81837..0224d4bc 100644 --- a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml +++ b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml @@ -239,12 +239,12 @@ # overwrite: yes # path: files/cluster.json -- name: Install pip - vars: - package_name: - - python3-pip - include_role: - name: safe_yum +# - name: Install pip +# vars: +# package_name: +# - python3-pip +# include_role: +# name: safe_yum - name: install protobuf v3.19.4 and mysql connector become: true diff --git a/playbooks/roles/influxdb/tasks/config_influxdb.yml b/playbooks/roles/influxdb/tasks/config_influxdb.yml new file mode 100644 index 00000000..96d0ec86 --- /dev/null +++ b/playbooks/roles/influxdb/tasks/config_influxdb.yml @@ -0,0 +1,51 @@ +--- +- name: Create /etc/opt/oci-hpc/passwords/influxdb + become: true + file: + path: /etc/opt/oci-hpc/passwords/influxdb + state: directory + owner: '{{ ansible_user }}' + mode: 0770 + group: '{{ ansible_user }}' + recurse: yes + +- name: Generate password for Influx admin and save it to /etc/opt/oci-hpc/passwords + set_fact: + tmp_pwd: "{{ lookup('password', + '/etc/opt/oci-hpc/passwords/influxdb/root.txt + chars=ascii_letters,digits,hexdigits') }}" + +- name: Get influx password from /etc/opt/oci-hpc/passwords + set_fact: + influx_admin_pwd: "{{ lookup('password', + '/etc/opt/oci-hpc/passwords/influxdb/root.txt + chars=ascii_letters,digits,hexdigits') }}" + +- name: Start InfluxDB + become: true + service: + name: influxdb + state: started + enabled: true + + +- name: Set configuration directory path + become: true + file: + path: "{{ influxdb_configuration_dir }}" + state: directory + +- name: Set templatized InfluxDB configuration + become: true + template: + src: influxdb.conf.j2 + dest: "{{ influxdb_configuration_dir }}/influxdb.conf" + force: yes + backup: yes + owner: influxdb + group: influxdb + mode: 0744 + register: influx_config + notify: restart influxdb + + diff --git a/playbooks/roles/influxdb/tasks/el.yml b/playbooks/roles/influxdb/tasks/el.yml index 71a7a06a..d8e45e5b 100755 --- a/playbooks/roles/influxdb/tasks/el.yml +++ b/playbooks/roles/influxdb/tasks/el.yml @@ -1,70 +1,7 @@ --- -- name: Create /etc/opt/oci-hpc/passwords/influxdb - become: true - file: - path: /etc/opt/oci-hpc/passwords/influxdb - state: directory - owner: '{{ ansible_user }}' - 
mode: 0770 - group: '{{ ansible_user }}' - recurse: yes - -- name: Generate password for Influx admin and save it to /etc/opt/oci-hpc/passwords - set_fact: - tmp_pwd: "{{ lookup('password', - '/etc/opt/oci-hpc/passwords/influxdb/root.txt - chars=ascii_letters,digits,hexdigits') }}" - -- name: Get influx password from /etc/opt/oci-hpc/passwords - set_fact: - influx_admin_pwd: "{{ lookup('password', - '/etc/opt/oci-hpc/passwords/influxdb/root.txt - chars=ascii_letters,digits,hexdigits') }}" - -- name: Add influxdb repository - become: true - yum_repository: - name: influxdb - description: InfluxDB Repository - RHEL $releasever - baseurl: https://repos.influxdata.com/rhel/{{ ansible_distribution_major_version }}/$basearch/stable - enabled: 1 - gpgcheck: 1 - gpgkey: https://repos.influxdata.com/influxdata-archive_compat.key - -- name: Install InfluxDB - vars: - package_name: - - influxdb - - python-pip - package_state: latest - include_role: - name: safe_yum - -- name: Start InfluxDB - become: true - service: - name: influxdb - state: started - enabled: true - - -- name: Set configuration directory path - become: true - file: - path: "{{ influxdb_configuration_dir }}" - state: directory - -- name: Set templatized InfluxDB configuration - become: true - template: - src: influxdb.conf.j2 - dest: "{{ influxdb_configuration_dir }}/influxdb.conf" - force: yes - backup: yes - owner: influxdb - group: influxdb - mode: 0744 - register: influx_config - notify: restart influxdb - +- name: install influxdb + include_tasks: el_install_influxdb.yml +- name: configure influxdb on bastion + include_tasks: config_influxdb.yml + when: "'bastion' in group_names" \ No newline at end of file diff --git a/playbooks/roles/influxdb/tasks/el_install_influxdb.yml b/playbooks/roles/influxdb/tasks/el_install_influxdb.yml new file mode 100644 index 00000000..1f3c0185 --- /dev/null +++ b/playbooks/roles/influxdb/tasks/el_install_influxdb.yml @@ -0,0 +1,24 @@ +--- +- name: Add influxdb repository + become: true + yum_repository: + name: influxdb + description: InfluxDB Repository - RHEL $releasever + baseurl: https://repos.influxdata.com/rhel/{{ ansible_distribution_major_version }}/$basearch/stable + enabled: 1 + gpgcheck: 1 + gpgkey: https://repos.influxdata.com/influxdata-archive_compat.key + +- name: Install InfluxDB + vars: + package_name: + - influxdb + package_state: latest + include_role: + name: safe_yum + +- name: install influx pip + become: true + pip: + name: influxdb + executable: pip3 \ No newline at end of file diff --git a/playbooks/roles/influxdb/tasks/ubuntu.yml b/playbooks/roles/influxdb/tasks/ubuntu.yml index cabb37ec..a4cf3be1 100644 --- a/playbooks/roles/influxdb/tasks/ubuntu.yml +++ b/playbooks/roles/influxdb/tasks/ubuntu.yml @@ -1,83 +1,7 @@ --- -- name: Create /etc/opt/oci-hpc/passwords/influxdb - become: true - file: - path: /etc/opt/oci-hpc/passwords/influxdb - state: directory - owner: '{{ ansible_user }}' - mode: 0770 - group: '{{ ansible_user }}' - recurse: yes - -- name: Generate password for Influx admin and save it to /etc/opt/oci-hpc/passwords - set_fact: - tmp_pwd: "{{ lookup('password', - '/etc/opt/oci-hpc/passwords/influxdb/root.txt - chars=ascii_letters,digits,hexdigits') }}" - -- name: Get influx password from /etc/opt/oci-hpc/passwords - set_fact: - influx_admin_pwd: "{{ lookup('password', - '/etc/opt/oci-hpc/passwords/influxdb/root.txt - chars=ascii_letters,digits,hexdigits') }}" - -# - name: Add influxdb repository -# become: true -# apt_repository: -# repo: "deb [arch=amd64 
signed-by=https://repos.influxdata.com/influxdb.key] https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable" -# state: present - # name: influxdb - # description: InfluxDB Repository - Debian - # baseurl: deb https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable - # enabled: 1 - # gpgcheck: 1 - # gpgkey: https://repos.influxdata.com/influxdb.key - -- name: Add InfluxData's key - become: true - apt_key: - state: present - url: https://repos.influxdata.com/influxdata-archive_compat.key - -- name: Manage InfluxData APT repositories - become: true - apt_repository: - repo: deb https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable - state: present - -- name: Install InfluxDB - vars: - package_name: - - influxdb - package_state: latest - include_role: - name: safe_yum - -- name: Start InfluxDB - become: true - service: - name: influxdb - state: started - enabled: true - - -- name: Set configuration directory path - become: true - file: - path: "{{ influxdb_configuration_dir }}" - state: directory - -- name: Set templatized InfluxDB configuration - become: true - template: - src: influxdb.conf.j2 - dest: "{{ influxdb_configuration_dir }}/influxdb.conf" - force: yes - backup: yes - owner: influxdb - group: influxdb - mode: 0744 - register: influx_config - notify: restart influxdb - +- name: install influxdb + include_tasks: ubuntu_install_influxdb.yml +- name: configure influxdb on bastion + include_tasks: config_influxdb.yml + when: "'bastion' in group_names" \ No newline at end of file diff --git a/playbooks/roles/influxdb/tasks/ubuntu_install_influxdb.yml b/playbooks/roles/influxdb/tasks/ubuntu_install_influxdb.yml new file mode 100644 index 00000000..ef93e456 --- /dev/null +++ b/playbooks/roles/influxdb/tasks/ubuntu_install_influxdb.yml @@ -0,0 +1,28 @@ +--- +- name: Add InfluxData's key + become: true + apt_key: + state: present + url: https://repos.influxdata.com/influxdata-archive_compat.key + +- name: Manage InfluxData APT repositories + become: true + apt_repository: + repo: deb https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable + state: present + +- name: Install InfluxDB + vars: + package_name: + - influxdb + package_state: latest + include_role: + name: safe_yum + +- name: install influx pip + become: true + vars: + ansible_python_interpreter: /usr/bin/python3 + pip: + name: influxdb + executable: pip3 \ No newline at end of file diff --git a/playbooks/roles/openldap/vars/debian_vars.yml b/playbooks/roles/openldap/vars/debian_vars.yml index 724309b7..bb2fc0a6 100644 --- a/playbooks/roles/openldap/vars/debian_vars.yml +++ b/playbooks/roles/openldap/vars/debian_vars.yml @@ -5,7 +5,6 @@ openldap_packages: - slapd - ldap-utils - openssl - - python3-pip - libsasl2-dev - libldap2-dev - libssl-dev diff --git a/playbooks/roles/packages/tasks/centos-7.yml b/playbooks/roles/packages/tasks/centos-7.yml index 248d372d..30a8dace 100644 --- a/playbooks/roles/packages/tasks/centos-7.yml +++ b/playbooks/roles/packages/tasks/centos-7.yml @@ -6,6 +6,7 @@ - python2-cryptography - pssh - pdsh + - python3-pip package_state: latest include_role: name: safe_yum \ No newline at end of file diff --git a/playbooks/roles/packages/tasks/debian.yml b/playbooks/roles/packages/tasks/debian.yml index bd8c4991..d3911656 100644 --- a/playbooks/roles/packages/tasks/debian.yml +++ 
b/playbooks/roles/packages/tasks/debian.yml @@ -8,6 +8,7 @@ - pssh - pdsh - jq + - python3-pip package_state: latest include_role: name: safe_yum diff --git a/playbooks/roles/packages/tasks/el-7.yml b/playbooks/roles/packages/tasks/el-7.yml index d3bbd6e0..fb61f6a8 100755 --- a/playbooks/roles/packages/tasks/el-7.yml +++ b/playbooks/roles/packages/tasks/el-7.yml @@ -7,6 +7,7 @@ - python3-oci-cli - pssh - pdsh + - python3-pip package_state: latest include_role: name: safe_yum diff --git a/playbooks/roles/packages/tasks/ol-7.yml b/playbooks/roles/packages/tasks/ol-7.yml index cfd59817..f0d58a2a 100644 --- a/playbooks/roles/packages/tasks/ol-7.yml +++ b/playbooks/roles/packages/tasks/ol-7.yml @@ -7,6 +7,7 @@ - python36-oci-cli - pssh - pdsh + - python3-pip package_state: latest package_repo: "epel,ol7_developer_EPEL" include_role: diff --git a/playbooks/roles/packages/tasks/ubuntu.yml b/playbooks/roles/packages/tasks/ubuntu.yml index 26f1acbb..408e6075 100644 --- a/playbooks/roles/packages/tasks/ubuntu.yml +++ b/playbooks/roles/packages/tasks/ubuntu.yml @@ -10,6 +10,7 @@ - pdsh - python3-netaddr - jq + - python3-pip package_state: latest include_role: name: safe_yum diff --git a/playbooks/roles/rack-aware/tasks/el.yml b/playbooks/roles/rack-aware/tasks/el.yml index 1e68e989..adedeaa3 100644 --- a/playbooks/roles/rack-aware/tasks/el.yml +++ b/playbooks/roles/rack-aware/tasks/el.yml @@ -22,7 +22,7 @@ owner: "{{ ansible_user }}" group: "{{ privilege_group_name }}" -- name: "Safe Yum install of latest {{package_name}}" +- name: Copy node_ordering_by_rack.py block: - name: copy node_ordering_by_rack.py copy: diff --git a/playbooks/roles/rack-aware/tasks/ubuntu.yml b/playbooks/roles/rack-aware/tasks/ubuntu.yml index 341a2ed5..c80a4cbb 100644 --- a/playbooks/roles/rack-aware/tasks/ubuntu.yml +++ b/playbooks/roles/rack-aware/tasks/ubuntu.yml @@ -1,21 +1,11 @@ -- name: Install Pip3 - vars: - package_name: - - python3-pip - package_state: latest - include_role: - name: safe_yum - ignore_errors: true - - name: install pssh and parallel-ssh become: true + vars: + ansible_python_interpreter: /usr/bin/python3 pip: name: ['pssh', 'parallel-ssh'] executable: pip3 state: latest - with_items: - - pssh - - parallel-ssh ignore_errors: yes - name: Make sure /opt/oci-hpc/bin/ exists @@ -28,7 +18,7 @@ owner: "{{ ansible_user }}" group: "{{ privilege_group_name }}" -- name: "Safe Yum install of latest {{package_name}}" +- name: Copy node_ordering_by_rack.py block: - name: copy node_ordering_by_rack.py copy: diff --git a/playbooks/roles/telegraf/tasks/el.yml b/playbooks/roles/telegraf/tasks/common.yml old mode 100755 new mode 100644 similarity index 67% rename from playbooks/roles/telegraf/tasks/el.yml rename to playbooks/roles/telegraf/tasks/common.yml index 9ad6e3af..6e531449 --- a/playbooks/roles/telegraf/tasks/el.yml +++ b/playbooks/roles/telegraf/tasks/common.yml @@ -1,27 +1,4 @@ --- -- name: Add influxdb repository - become: true - yum_repository: - name: influxdb - description: InfluxDB Repository - RHEL $releasever - baseurl: https://repos.influxdata.com/rhel/{{ ansible_distribution_major_version }}/$basearch/stable - enabled: 1 - gpgcheck: 1 - gpgkey: https://repos.influxdata.com/influxdata-archive_compat.key - -- name: Install pip - vars: - package_name: - - python3-pip - include_role: - name: safe_yum - -- name: install influx pip - become: true - pip: - name: influxdb - executable: pip3 - - name: Create database shell: "python3 -c \"import influxdb; influxdb.InfluxDBClient(host='{{ 
hostvars[groups['bastion'][0]]['ansible_fqdn'] }}', port=8086).create_database('telegraph')\"" diff --git a/playbooks/roles/telegraf/tasks/main.yml b/playbooks/roles/telegraf/tasks/main.yml index e3450c91..b1d4a1f1 100755 --- a/playbooks/roles/telegraf/tasks/main.yml +++ b/playbooks/roles/telegraf/tasks/main.yml @@ -1,4 +1,2 @@ -- include: el.yml - when: ansible_os_family == 'RedHat' -- include: ubuntu.yml - when: ansible_os_family == 'Debian' +- include: common.yml + when: ansible_os_family == 'RedHat' or ansible_os_family == 'Debian' diff --git a/playbooks/roles/telegraf/tasks/ubuntu.yml b/playbooks/roles/telegraf/tasks/ubuntu.yml deleted file mode 100644 index 4a7fbf89..00000000 --- a/playbooks/roles/telegraf/tasks/ubuntu.yml +++ /dev/null @@ -1,80 +0,0 @@ ---- -- name: Add InfluxData's key - become: true - apt_key: - state: present - url: https://repos.influxdata.com/influxdata-archive_compat.key - -- name: Manage InfluxData APT repositories - become: true - apt_repository: - repo: deb https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable - state: present - -- name: Install pip - vars: - package_name: - - python3-pip - include_role: - name: safe_yum - -- name: install influx pip - become: true - vars: - ansible_python_interpreter: /usr/bin/python3 - pip: - name: influxdb - executable: pip3 - -- name: Create database - shell: "python3 -c \"import influxdb; influxdb.InfluxDBClient(host='{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}', port=8086).create_database('telegraph')\"" - -#- name: Create database -# influxdb_database: -# hostname: "{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}" -# database_name: "telegraf" -# run_once: true - -- name: Install telegraf - vars: - package_name: - - telegraf - package_state: latest - include_role: - name: safe_yum - -- name: copy telegraf.conf - become: true - copy: - src: "{{ item }}" - dest: /etc/telegraf/{{item}} - force: yes - backup: yes - owner: telegraf - group: telegraf - mode: 0744 - with_items: - - telegraf.conf - -- name: render conf files - become: true - template: - src: "{{ item }}.j2" - dest: /etc/telegraf/telegraf.d/{{item}} - force: yes - backup: yes - owner: telegraf - group: telegraf - mode: 0744 - with_items: - - infiniband.conf - - influxdb.conf - - net.conf - - infiniband_hw_counters.conf -- name: restart telegraf - become: true - service: - name: telegraf - state: restarted - enabled: yes - diff --git a/playbooks/site.yml b/playbooks/site.yml index f5f227f1..05f610c9 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -256,11 +256,17 @@ - include_role: name: yaml -- hosts: bastion +- hosts: all tasks: - include_role: name: influxdb when: monitoring|default(false)|bool + - include_role: + name: telegraf + when: monitoring|default(false)|bool + +- hosts: bastion + tasks: - include_role: name: grafana when: monitoring|default(false)|bool @@ -304,9 +310,6 @@ - include_role: name: spack when: spack|default(false)|bool - - include_role: - name: telegraf - when: monitoring|default(false)|bool - include_role: name: slurm when: slurm|default(false)|bool From deb36af17e67c75ea03693a365c8c7beb08aead9 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 14 Feb 2023 09:30:27 -0700 Subject: [PATCH 077/133] Fix marketplace issue with no login node --- locals.tf | 2 +- marketplace.tf | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/locals.tf b/locals.tf index 50faf49f..a7f83527 100755 --- a/locals.tf +++ b/locals.tf @@ -30,7 +30,7 @@ locals { 
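  # The login_node && (use_standard_image_login || use_marketplace_image_login) guard used below
  # mirrors the count condition on the login_mp_image_subscription resources in marketplace.tf,
  # so login_mp_image_subscription[0] is only dereferenced when that subscription is actually created.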
bastion_image = var.use_standard_image ? oci_core_app_catalog_subscription.bastion_mp_image_subscription[0].listing_resource_id : local.custom_bastion_image_ocid - login_image = var.login_node || var.use_standard_image_login || var.use_marketplace_image_login ? oci_core_app_catalog_subscription.login_mp_image_subscription[0].listing_resource_id : local.custom_login_image_ocid + login_image = var.login_node && ( var.use_standard_image_login || var.use_marketplace_image_login ) ? oci_core_app_catalog_subscription.login_mp_image_subscription[0].listing_resource_id : local.custom_login_image_ocid cluster_network_image = var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid diff --git a/marketplace.tf b/marketplace.tf index 7f550815..5917390b 100755 --- a/marketplace.tf +++ b/marketplace.tf @@ -77,12 +77,12 @@ resource "oci_core_app_catalog_subscription" "bastion_mp_image_subscription" { } data "oci_core_app_catalog_listing_resource_versions" "login_app_catalog_listing_resource_versions" { - count = var.login_node || var.use_marketplace_image_login || var.use_standard_image_login ? 1 : 0 + count = var.login_node && ( var.use_marketplace_image_login || var.use_standard_image_login ) ? 1 : 0 listing_id = local.mp_login_listing_id } resource "oci_core_app_catalog_listing_resource_version_agreement" "login_mp_image_agreement" { - count = var.login_node || var.use_marketplace_image_login || var.use_standard_image_login ? 1 : 0 + count = var.login_node && ( var.use_marketplace_image_login || var.use_standard_image_login ) ? 1 : 0 listing_id = local.mp_login_listing_id listing_resource_version = local.mp_login_version_id @@ -90,7 +90,7 @@ resource "oci_core_app_catalog_listing_resource_version_agreement" "login_mp_ima } resource "oci_core_app_catalog_subscription" "login_mp_image_subscription" { - count = var.login_node || var.use_marketplace_image_login || var.use_standard_image_login ? 1 : 0 + count = var.login_node && ( var.use_marketplace_image_login || var.use_standard_image_login ) ? 
1 : 0 compartment_id = var.targetCompartment eula_link = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].eula_link listing_id = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].listing_id From 0eebe7f4a193710d13274a677e937a7ce0eee6eb Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 15 Feb 2023 10:06:04 -0700 Subject: [PATCH 078/133] Point to the new marketplace images --- conf/variables.tpl | 9 ++++++--- variables.tf | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/conf/variables.tpl b/conf/variables.tpl index c5d7e9e7..5523864e 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -47,9 +47,12 @@ variable "marketplace_version_id" { "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" - "HPC_OL7" = "OL7.9-RHCK-3.10.0-OFED-5.4-3.4.0-1" - "HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.5.8.0-2022.11.15-0" - "GPU" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.4.0.0-GPU-510-2022.09.23-1" + "HPC_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-2023.01.10-0" + "HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.6.8.1-2023.01.10-0" + "HPC_OL7_old" = "OL7.9-RHCK-3.10.0-OFED-5.4-3.4.0-1" + "HPC_OL8_old" = "OracleLinux-8-RHCK-OFED-5.4-3.5.8.0-2022.11.15-0" + "GPU_old" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.4.0.0-GPU-510-2022.09.23-1" + "GPU" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.01.10-0" } } diff --git a/variables.tf b/variables.tf index 73b3cba2..1fc5f40b 100755 --- a/variables.tf +++ b/variables.tf @@ -86,9 +86,12 @@ variable "marketplace_version_id" { "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" - "HPC_OL7" = "OL7.9-RHCK-3.10.0-OFED-5.4-3.4.0-1" - "HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.5.8.0-2022.11.15-0" - "GPU" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.4.0.0-GPU-510-2022.09.23-1" + "HPC_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-2023.01.10-0" + "HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.6.8.1-2023.01.10-0" + "HPC_OL7_old" = "OL7.9-RHCK-3.10.0-OFED-5.4-3.4.0-1" + "HPC_OL8_old" = "OracleLinux-8-RHCK-OFED-5.4-3.5.8.0-2022.11.15-0" + "GPU_old" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.4.0.0-GPU-510-2022.09.23-1" + "GPU" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.01.10-0" } } From 15912c7d5c0376fbe2853d2ede900658ffbfcdcc Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 15 Feb 2023 10:06:30 -0700 Subject: [PATCH 079/133] Add some info on supported OS --- schema.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/schema.yaml b/schema.yaml index 13b0ea51..b0f81875 100755 --- a/schema.yaml +++ b/schema.yaml @@ -384,7 +384,7 @@ variables: unsupported_bastion_image: title: "Image OCID" - description: "Custom image ID for compute nodes" + description: "Custom image ID for compute nodes. Please note that only Oracle Linux 7 and Ubuntu 20.04 are supported as bastion image at this moment." type: string required: true visible: @@ -408,7 +408,7 @@ variables: custom_bastion_image: title: "Bastion Image ID" - description: "Custom image ID for bastion nodes. Please note that only Oracle Linux and Ubuntu 20.04 are supported as bastion image at this moment. " + description: "Custom image ID for bastion nodes. Please note that only Oracle Linux 7 and Ubuntu 20.04 are supported as bastion image at this moment. 
" type: oci:core:image:id dependsOn: compartmentId: ${bastion_image_compartment} @@ -706,7 +706,7 @@ variables: image: title: "Image" - description: "Custom image ID for compute nodes" + description: "Custom image ID for compute nodes. Supported OS are OL7, OL8, CentOS7 and Ubuntu 20.04" type: oci:core:image:id required: true dependsOn: @@ -740,7 +740,7 @@ variables: image_ocid: title: "Image OCID" - description: "Custom image ID for compute nodes" + description: "Custom image ID for compute nodes. Supported OS are OL7, OL8, CentOS7 and Ubuntu 20.04" type: string required: true visible: @@ -1027,7 +1027,7 @@ variables: type: boolean title: "Enable PAM" default: false - description: "Enable PAM for the Slurm cluster. When PAM is enabled, users that are not in the sudo group will not be able to SSH into the compute nodes unless they have an active job in Slurm." + description: "Enable PAM for the Slurm cluster (Supported only on OL with RHCK kernel at this time). When PAM is enabled, users that are not in the sudo group will not be able to SSH into the compute nodes unless they have an active job running in Slurm." monitoring: type: boolean From c23da8a7647111ec049d6041b95919f00913e8bf Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 15 Feb 2023 10:06:47 -0700 Subject: [PATCH 080/133] Use safe Yum for nfs-util package --- playbooks/roles/nfs-server/tasks/el.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/playbooks/roles/nfs-server/tasks/el.yml b/playbooks/roles/nfs-server/tasks/el.yml index cdc409f6..70f5d39f 100755 --- a/playbooks/roles/nfs-server/tasks/el.yml +++ b/playbooks/roles/nfs-server/tasks/el.yml @@ -1,6 +1,12 @@ --- - name: Ensure NFS utilities are installed. - package: name=nfs-utils state=present + vars: + package_name: + - nfs-utils + package_state: present + include_role: + name: safe_yum + ignore_errors: true - name: Start NFS server service: From db1a1837de46c61c2f1aa5709761dc7f52c38e23 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 15 Feb 2023 10:07:13 -0700 Subject: [PATCH 081/133] Add packages role for OL8 --- playbooks/roles/packages/tasks/main.yml | 3 +++ playbooks/roles/packages/tasks/ol-8.yml | 15 +++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 playbooks/roles/packages/tasks/ol-8.yml diff --git a/playbooks/roles/packages/tasks/main.yml b/playbooks/roles/packages/tasks/main.yml index 5774423d..275cffe6 100755 --- a/playbooks/roles/packages/tasks/main.yml +++ b/playbooks/roles/packages/tasks/main.yml @@ -1,6 +1,9 @@ - include: ol-7.yml when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' and ansible_distribution_major_version == '7' +- include: ol-8.yml + when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' and ansible_distribution_major_version == '8' + - include: centos-7.yml when: ansible_os_family == 'RedHat' and ansible_distribution == 'CentOS' and ansible_distribution_major_version == '7' diff --git a/playbooks/roles/packages/tasks/ol-8.yml b/playbooks/roles/packages/tasks/ol-8.yml new file mode 100644 index 00000000..61607e48 --- /dev/null +++ b/playbooks/roles/packages/tasks/ol-8.yml @@ -0,0 +1,15 @@ +--- +- name: Make sure python OpenSSL and parallel ssh is installed + vars: + package_name: + #- pyOpenSSL + #- python2-cryptography + - python36-oci-cli + - pssh + - pdsh + - python3-pip + package_state: latest + package_repo: "epel,ol8_developer_EPEL" + include_role: + name: safe_yum + ignore_errors: true From 
20511bfeec721abc7f4b2cc9cc3dd51b0da207e8 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 15 Feb 2023 13:51:29 -0700 Subject: [PATCH 082/133] Add sacct limits as an option --- autoscaling/tf_init/bastion_update.tf | 3 ++- autoscaling/tf_init/inventory.tpl | 1 + bastion.tf | 4 +++- conf/variables.tpl | 1 + inventory.tpl | 1 + playbooks/roles/slurm/files/cgroup.conf | 4 +++- playbooks/roles/slurm/tasks/login.yml | 4 ++-- playbooks/roles/slurm/templates/gres.conf.j2 | 2 +- playbooks/roles/slurm/templates/slurm.conf.j2 | 12 ++++++++++++ playbooks/roles/slurm/templates/slurmdbd.conf.j2 | 3 +++ playbooks/slurm_config.yml | 4 ++-- schema.yaml | 7 +++++++ variables.tf | 1 + 13 files changed, 39 insertions(+), 8 deletions(-) diff --git a/autoscaling/tf_init/bastion_update.tf b/autoscaling/tf_init/bastion_update.tf index b574bac4..f4dd0faf 100755 --- a/autoscaling/tf_init/bastion_update.tf +++ b/autoscaling/tf_init/bastion_update.tf @@ -66,7 +66,8 @@ resource "local_file" "inventory" { latency_check = var.latency_check bastion_username = var.bastion_username, compute_username = var.compute_username, - pam = var.pam + pam = var.pam, + sacct_limits = var.sacct_limits }) filename = "${local.bastion_path}/inventory" } diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index c13e6c65..c02b0548 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -65,3 +65,4 @@ latency_check=${latency_check} compute_username=${compute_username} bastion_username=${bastion_username} pam = ${pam} +sacct_limits=${sacct_limits} \ No newline at end of file diff --git a/bastion.tf b/bastion.tf index 9442aa21..65e29d05 100644 --- a/bastion.tf +++ b/bastion.tf @@ -252,6 +252,7 @@ resource "null_resource" "cluster" { privilege_group_name = var.privilege_group_name, latency_check = var.latency_check, pam = var.pam, + sacct_limits = var.sacct_limits, inst_prin = var.inst_prin, region = var.region, tenancy_ocid = var.tenancy_ocid, @@ -388,7 +389,8 @@ resource "null_resource" "cluster" { use_multiple_ads = var.use_multiple_ads, bastion_username = var.bastion_username, compute_username = var.compute_username, - pam = var.pam + pam = var.pam, + sacct_limits = var.sacct_limits }) destination = "/opt/oci-hpc/conf/variables.tf" diff --git a/conf/variables.tpl b/conf/variables.tpl index 5523864e..2744d019 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -28,6 +28,7 @@ variable "slurm" { default = ${slurm} } variable "rack_aware" { default = ${rack_aware} } variable "pyxis" { default = ${pyxis} } variable "pam" { default = ${pam} } +variable "sacct_limits" { default = ${sacct_limits} } variable "enroot" { default = ${enroot} } variable "slurm_nfs_path" { default = "${slurm_nfs_path}" } variable "spack" { default = ${spack} } diff --git a/inventory.tpl b/inventory.tpl index 4997f592..3134339a 100755 --- a/inventory.tpl +++ b/inventory.tpl @@ -71,4 +71,5 @@ tenancy_ocid = ${tenancy_ocid} inst_prin = ${inst_prin} api_fingerprint = ${api_fingerprint} api_user_ocid = ${api_user_ocid} +sacct_limits=${sacct_limits} diff --git a/playbooks/roles/slurm/files/cgroup.conf b/playbooks/roles/slurm/files/cgroup.conf index d7035a99..57b5c5a2 100755 --- a/playbooks/roles/slurm/files/cgroup.conf +++ b/playbooks/roles/slurm/files/cgroup.conf @@ -1,2 +1,4 @@ +CgroupMountpoint="/sys/fs/cgroup" CgroupAutomount=yes -ConstrainDevices=yes \ No newline at end of file +ConstrainDevices=yes +ConstrainCores=yes \ No newline at end of file diff --git 
a/playbooks/roles/slurm/tasks/login.yml b/playbooks/roles/slurm/tasks/login.yml index 2a6c74ab..48998e34 100755 --- a/playbooks/roles/slurm/tasks/login.yml +++ b/playbooks/roles/slurm/tasks/login.yml @@ -7,7 +7,7 @@ include_role: name: safe_yum -- name: Render systemd units for slurm, slurmdbd and munge +- name: Render systemd units for slurmd become: true template: src: 'systemd/{{ item }}.service' @@ -17,7 +17,7 @@ - slurmd when: ansible_os_family == 'Debian' -- name: Create systemd unit dirs +- name: Create systemd unit dirs for slurmd and munge become: true file: name: '/etc/systemd/system/{{ item }}.service.d' diff --git a/playbooks/roles/slurm/templates/gres.conf.j2 b/playbooks/roles/slurm/templates/gres.conf.j2 index a18b5bfb..1356a2ae 100644 --- a/playbooks/roles/slurm/templates/gres.conf.j2 +++ b/playbooks/roles/slurm/templates/gres.conf.j2 @@ -1,7 +1,7 @@ {% set size = hostvars[inventory_hostname]['private_subnet'] | ipaddr('size')%} {% for partition in queues %} {% for instance in partition.instance_types %} - +AutoDetect=nvml {% if instance.shape == "BM.GPU2.2"%} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-13] NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia1 Type=P100 Cores=[14-27] diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index 83bdc4d6..53856139 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -46,6 +46,18 @@ TopologyPlugin=topology/tree TreeWidth=2048 SlurmctldParameters=enable_configless +{% if sacct_limits|bool %} +AccountingStorageTRES=gres/gpu +AccountingStorageEnforce=limits,associations,qos,safe +JobCompType=jobcomp/none +TrackWckey=no +{% endif %} + + +{% if (groups['login']| length ) > 0 %} +NodeName={{ hostvars[groups['login'][0]]['ansible_fqdn'].split('.')[0] }} +{% endif %} + {% for partition in queues %} {% for instance in partition.instance_types %} {% set size = instance.private_subnet | ipaddr('size')%} diff --git a/playbooks/roles/slurm/templates/slurmdbd.conf.j2 b/playbooks/roles/slurm/templates/slurmdbd.conf.j2 index 66f49698..cfa7606f 100755 --- a/playbooks/roles/slurm/templates/slurmdbd.conf.j2 +++ b/playbooks/roles/slurm/templates/slurmdbd.conf.j2 @@ -23,3 +23,6 @@ StoragePass={{ slurmdbd_sql_pwd }} StorageUser={{ slurm_db_user }} StorageLoc={{ slurm_db_name }} +{% if sacct_limits|bool %} +TrackWckey=no +{% endif %} \ No newline at end of file diff --git a/playbooks/slurm_config.yml b/playbooks/slurm_config.yml index bc2da813..bb3f6995 100755 --- a/playbooks/slurm_config.yml +++ b/playbooks/slurm_config.yml @@ -1,9 +1,9 @@ -- hosts: bastion,slurm_backup,compute +- hosts: bastion,slurm_backup,compute,login gather_facts: true vars: destroy: false initial: true - download_path: "{{ cluster_nfs_path if cluster_nfs|bool else '/tmp' }}" + download_path: "{{ nfs_target_path if create_fss | bool else ( cluster_nfs_path if cluster_nfs|bool else '/tmp') }}" enroot_top_path: "{{ nvme_path }}/enroot/" vars_files: - "/opt/oci-hpc/conf/queues.conf" diff --git a/schema.yaml b/schema.yaml index b0f81875..b94f4796 100755 --- a/schema.yaml +++ b/schema.yaml @@ -165,6 +165,7 @@ variableGroups: - ${monitoring} - ${enroot} - ${pam} + - ${sacct_limits} - title: "Hidden" variables: @@ -1029,6 +1030,12 @@ variables: default: false description: "Enable PAM for the Slurm cluster (Supported only on OL with RHCK kernel at 
this time). When PAM is enabled, users that are not in the sudo group will not be able to SSH into the compute nodes unless they have an active job running in Slurm." + sacct_limits: + type: boolean + title: "Enable Limits for SLurm jobs" + default: false + description: "Enable Limits for the Slurm cluster When enabled, users will not be able to submit jobs of the right limits are not set" + monitoring: type: boolean title: "Install HPC Cluster Monitoring Tools" diff --git a/variables.tf b/variables.tf index 1fc5f40b..d0774658 100755 --- a/variables.tf +++ b/variables.tf @@ -174,6 +174,7 @@ variable "monitoring" { default = true } variable "enroot" { default = false } variable "pyxis" { default = false } variable "pam" { default = false } +variable "sacct_limits" { default = false } variable "unsupported" { type=bool From 4e8f2baa019952877d68fb9f32dff9b4658dfc32 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 15 Feb 2023 14:35:09 -0700 Subject: [PATCH 083/133] Add sacct_limits to slurm backup --- slurm_ha.tf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/slurm_ha.tf b/slurm_ha.tf index 388e76ec..30f77021 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -243,6 +243,7 @@ resource "null_resource" "cluster_backup" { enroot = var.enroot, pyxis = var.pyxis, pam = var.pam, + sacct_limits = var.sacct_limits, privilege_sudo = var.privilege_sudo, privilege_group_name = var.privilege_group_name, latency_check = var.latency_check, @@ -376,6 +377,7 @@ resource "null_resource" "cluster_backup" { enroot = var.enroot, pyxis = var.pyxis, pam = var.pam, + sacct_limits = var.sacct_limits, privilege_sudo = var.privilege_sudo, privilege_group_name = var.privilege_group_name, latency_check = var.latency_check, From ac01739099ca7494a9c818d2e1a14ace2b50b611 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 15 Feb 2023 14:53:16 -0700 Subject: [PATCH 084/133] Run Fix_broken on the login node as well --- playbooks/site.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/site.yml b/playbooks/site.yml index 05f610c9..46ce44c5 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -10,7 +10,7 @@ - hostname # for ubuntu, on all compute nodes, run --fix-broken install -- hosts: compute +- hosts: compute, login become: true tasks: - include_role: From 17744e9db58b85b058272fd70eb04ca8ddc163c5 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Thu, 16 Feb 2023 13:34:17 -0800 Subject: [PATCH 085/133] added safe uninstall for ubuntu packages, updated bastion.sh to handle apt update and installation, updated bastion.tf to use bash --- bastion.tf | 1 + bin/bastion.sh | 60 ++++++++++++++++++--- playbooks/roles/fix_broken/tasks/ubuntu.yml | 11 ++-- playbooks/roles/openldap/tasks/debian.yml | 11 ++-- playbooks/roles/safe_yum/tasks/ubuntu.yml | 3 +- 5 files changed, 71 insertions(+), 15 deletions(-) diff --git a/bastion.tf b/bastion.tf index 9442aa21..ef3593e7 100644 --- a/bastion.tf +++ b/bastion.tf @@ -432,6 +432,7 @@ provisioner "file" { provisioner "remote-exec" { inline = [ + "#!/bin/bash", "chmod 755 /opt/oci-hpc/autoscaling/crontab/*.sh", "chmod 600 /opt/oci-hpc/autoscaling/credentials/key.pem", "echo ${var.configure} > /tmp/configure.conf", diff --git a/bin/bastion.sh b/bin/bastion.sh index c1cadae0..40c427c8 100644 --- a/bin/bastion.sh +++ b/bin/bastion.sh @@ -74,6 +74,21 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then sleep 10s + sudo apt -y --fix-broken install + + # checking here as well to be sure that the lock file is not being held + apt_process=`ps aux | 
grep "apt update" | grep -v grep | wc -l` + apt_process=$(( apt_process -1 )) + while [ $apt_process -ge 1 ] + do + echo "wait until apt update is done" + sleep 10s + ps aux | grep "apt update" | grep -v grep + apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` + apt_process=$(( apt_process -1 )) + done + + wget -O- https://apt.releases.hashicorp.com/gpg | \ gpg --dearmor | \ sudo tee /usr/share/keyrings/hashicorp-archive-keyring.gpg @@ -81,14 +96,47 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] \ https://apt.releases.hashicorp.com $(lsb_release -cs) main" | \ sudo tee /etc/apt/sources.list.d/hashicorp.list + + sudo apt-get -y install terraform + # checking here as well to be sure that the lock file is not being held + apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` + apt_process=$(( apt_process -1 )) + while [ $apt_process -ge 1 ] + do + echo "wait until apt update is done" + sleep 10s + ps aux | grep "apt update" | grep -v grep + apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` + apt_process=$(( apt_process -1 )) + done - sudo apt-get update & - PID1=$! - wait $PID1 - - sudo apt -y --fix-broken install + sudo apt-get -y install ansible + + # checking here as well to be sure that the lock file is not being held + apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` + apt_process=$(( apt_process -1 )) + while [ $apt_process -ge 1 ] + do + echo "wait until apt update is done" + sleep 10s + ps aux | grep "apt update" | grep -v grep + apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` + apt_process=$(( apt_process -1 )) + done + + sudo apt-get -y install python python-netaddr python3 python3-pip - sudo apt-get -y install ansible python python-netaddr python3-pip terraform + # checking here as well to be sure that the lock file is not being held + apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` + apt_process=$(( apt_process -1 )) + while [ $apt_process -ge 1 ] + do + echo "wait until apt update is done" + sleep 10s + ps aux | grep "apt update" | grep -v grep + apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` + apt_process=$(( apt_process -1 )) + done pip install pip --upgrade pip install pyopenssl --upgrade diff --git a/playbooks/roles/fix_broken/tasks/ubuntu.yml b/playbooks/roles/fix_broken/tasks/ubuntu.yml index 81e27814..a522df45 100644 --- a/playbooks/roles/fix_broken/tasks/ubuntu.yml +++ b/playbooks/roles/fix_broken/tasks/ubuntu.yml @@ -27,10 +27,13 @@ until: result.stdout | int == 0 - name: Purge unattended-upgrades - apt: - name: unattended-upgrades - purge: yes - state: absent + vars: + package_name: + - unattended-upgrades + package_state: absent + package_purge: true + include_role: + name: safe_yum ignore_errors: yes - name: stop and mask timers diff --git a/playbooks/roles/openldap/tasks/debian.yml b/playbooks/roles/openldap/tasks/debian.yml index 5234df9a..93f890c1 100644 --- a/playbooks/roles/openldap/tasks/debian.yml +++ b/playbooks/roles/openldap/tasks/debian.yml @@ -14,10 +14,13 @@ failed_when: false - name: Remove Apparmor package - apt: - name: apparmor - state: absent - purge: true + vars: + package_name: + - apparmor + package_state: absent + package_purge: true + include_role: + name: safe_yum - name: Create /etc/opt/oci-hpc/passwords/openldap become: true diff --git a/playbooks/roles/safe_yum/tasks/ubuntu.yml b/playbooks/roles/safe_yum/tasks/ubuntu.yml index 63a7cb80..1eca898a 100755 --- 
a/playbooks/roles/safe_yum/tasks/ubuntu.yml +++ b/playbooks/roles/safe_yum/tasks/ubuntu.yml @@ -7,11 +7,12 @@ delay: 10 until: result.stdout | int == 0 -- name: "Installing {{package_name}}" +- name: "Installing/Removing {{package_name}}" become: true apt: name: "{{package_name}}" state: "{{package_state | default('latest')}}" + purge: "{{package_purge | default('false')}}" register: result until: result is not failed retries: 5 From ea0f64df02c99251ed82b6a09524731ac8fd9384 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Thu, 16 Feb 2023 13:37:52 -0800 Subject: [PATCH 086/133] updated remote exec default shell to bash to resolve exit status error --- bastion.tf | 2 ++ slurm_ha.tf | 3 +++ 2 files changed, 5 insertions(+) diff --git a/bastion.tf b/bastion.tf index ef3593e7..1e8309ed 100644 --- a/bastion.tf +++ b/bastion.tf @@ -74,6 +74,7 @@ resource "null_resource" "bastion" { provisioner "remote-exec" { inline = [ + "#!/bin/bash", "sudo mkdir -p /opt/oci-hpc", "sudo chown ${var.bastion_username}:${var.bastion_username} /opt/oci-hpc/", "mkdir -p /opt/oci-hpc/bin", @@ -176,6 +177,7 @@ resource "null_resource" "bastion" { provisioner "remote-exec" { inline = [ + "#!/bin/bash", "chmod 600 /home/${var.bastion_username}/.ssh/cluster.key", "cp /home/${var.bastion_username}/.ssh/cluster.key /home/${var.bastion_username}/.ssh/id_rsa", "chmod a+x /opt/oci-hpc/bin/*.sh", diff --git a/slurm_ha.tf b/slurm_ha.tf index 388e76ec..6c7db37e 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -67,6 +67,7 @@ resource "null_resource" "backup" { provisioner "remote-exec" { inline = [ + "#!/bin/bash", "sudo mkdir -p /opt/oci-hpc", "sudo chown ${var.bastion_username}:${var.bastion_username} /opt/oci-hpc/", "mkdir -p /opt/oci-hpc/bin", @@ -169,6 +170,7 @@ resource "null_resource" "backup" { provisioner "remote-exec" { inline = [ + "#!/bin/bash", "chmod 600 /home/${var.bastion_username}/.ssh/cluster.key", "cp /home/${var.bastion_username}/.ssh/cluster.key /home/${var.bastion_username}/.ssh/id_rsa", "chmod a+x /opt/oci-hpc/bin/*.sh", @@ -408,6 +410,7 @@ resource "null_resource" "cluster_backup" { provisioner "remote-exec" { inline = [ + "#!/bin/bash", "chmod 755 /opt/oci-hpc/autoscaling/crontab/*.sh", "chmod 755 /opt/oci-hpc/autoscaling/credentials/key.sh", "/opt/oci-hpc/autoscaling/credentials/key.sh /opt/oci-hpc/autoscaling/credentials/key.initial /opt/oci-hpc/autoscaling/credentials/key.pem > /opt/oci-hpc/autoscaling/credentials/key.log", From acfc29e85f37337e341831f7cc8e89b44a491784 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 16 Feb 2023 15:23:54 -0700 Subject: [PATCH 087/133] Fix typo --- schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema.yaml b/schema.yaml index b94f4796..ad9e9d1d 100755 --- a/schema.yaml +++ b/schema.yaml @@ -1032,7 +1032,7 @@ variables: sacct_limits: type: boolean - title: "Enable Limits for SLurm jobs" + title: "Enable Limits for Slurm jobs" default: false description: "Enable Limits for the Slurm cluster When enabled, users will not be able to submit jobs of the right limits are not set" From 09b7d4809fd78f0e2e7f8605b1350b4a550706d5 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 21 Feb 2023 19:04:00 -0700 Subject: [PATCH 088/133] Change the default RDMA subnet size --- schema.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/schema.yaml b/schema.yaml index ad9e9d1d..ed05331d 100755 --- a/schema.yaml +++ b/schema.yaml @@ -941,8 +941,8 @@ variables: rdma_subnet: type: string title: "RDMA subnet IP range" - 
default: "192.168.168.0/22" - description: "Must be the same size as private subnet" + default: "192.168.0.0/16" + description: "Must be at least the same size as private subnet for HPC and at least 16 times the size of the private subnet for GPUs" required: true private_subnet: type: string From c8f5e254e648b625d848379027663247e8d0f731 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 22 Feb 2023 15:15:44 -0700 Subject: [PATCH 089/133] Add generic MPI path. --- samples/gpu/nccl_run_allreduce.sbatch | 9 +++------ samples/gpu/nccl_run_allreduce.sh | 10 +++++----- samples/gpu/nccl_run_alltoall.sh | 9 ++++----- samples/gpu/qfabv1_nccl_run_allreduce.sbatch | 9 ++++----- samples/gpu/qfabv1_nccl_run_allreduce.sh | 9 ++++----- samples/gpu/qfabv1_nccl_run_alltoall.sh | 9 ++++----- 6 files changed, 24 insertions(+), 31 deletions(-) diff --git a/samples/gpu/nccl_run_allreduce.sbatch b/samples/gpu/nccl_run_allreduce.sbatch index 505d60a3..533d6cf7 100644 --- a/samples/gpu/nccl_run_allreduce.sbatch +++ b/samples/gpu/nccl_run_allreduce.sbatch @@ -26,13 +26,10 @@ cat $ORDEREDMACHINEFILE echo ORDEREDRANKMACHINEFILE cat $ORDEREDRANKMACHINEFILE +mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` +source $mpivars_path -if [ -f /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh ]; then - source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh -else - source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh -fi - +if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi export NCCL_DEBUG=WARN diff --git a/samples/gpu/nccl_run_allreduce.sh b/samples/gpu/nccl_run_allreduce.sh index 2edd75f8..fd2ae7fc 100644 --- a/samples/gpu/nccl_run_allreduce.sh +++ b/samples/gpu/nccl_run_allreduce.sh @@ -42,11 +42,11 @@ do hostfile=$hostfile; np=$np ; iter=20; - if [ -f /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh ]; then - source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh - else - source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh - fi + mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + source $mpivars_path + + if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi + first_node=`head $hostfile -n 1` shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape` if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] diff --git a/samples/gpu/nccl_run_alltoall.sh b/samples/gpu/nccl_run_alltoall.sh index 896f4a56..e1be500d 100644 --- a/samples/gpu/nccl_run_alltoall.sh +++ b/samples/gpu/nccl_run_alltoall.sh @@ -49,11 +49,10 @@ do hostfile=$hostfile; np=$np ; iter=50; - if [ -f /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh ]; then - source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh - else - source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh - fi + mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + source $mpivars_path + + if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi first_node=`head $hostfile -n 1` shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape` diff --git a/samples/gpu/qfabv1_nccl_run_allreduce.sbatch b/samples/gpu/qfabv1_nccl_run_allreduce.sbatch index 92c624a5..203f3ba6 100644 --- a/samples/gpu/qfabv1_nccl_run_allreduce.sbatch +++ b/samples/gpu/qfabv1_nccl_run_allreduce.sbatch @@ -26,11 +26,10 @@ cat $ORDEREDMACHINEFILE echo ORDEREDRANKMACHINEFILE cat $ORDEREDRANKMACHINEFILE -if [ -f /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh ]; then - source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh 
-else - source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh -fi +mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` +source $mpivars_path + +if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi #source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh #source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh diff --git a/samples/gpu/qfabv1_nccl_run_allreduce.sh b/samples/gpu/qfabv1_nccl_run_allreduce.sh index 831b184e..f5c5cad6 100644 --- a/samples/gpu/qfabv1_nccl_run_allreduce.sh +++ b/samples/gpu/qfabv1_nccl_run_allreduce.sh @@ -43,11 +43,10 @@ do hostfile=$hostfile; np=$np ; iter=20; - if [ -f /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh ]; then - source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh - else - source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh - fi + mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + source $mpivars_path + + if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi first_node=`head $hostfile -n 1` shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape` diff --git a/samples/gpu/qfabv1_nccl_run_alltoall.sh b/samples/gpu/qfabv1_nccl_run_alltoall.sh index 565bffe5..a9d217d8 100644 --- a/samples/gpu/qfabv1_nccl_run_alltoall.sh +++ b/samples/gpu/qfabv1_nccl_run_alltoall.sh @@ -51,11 +51,10 @@ do hostfile=$hostfile; np=$np ; iter=50; - if [ -f /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh ]; then - source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh - else - source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh - fi + mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + source $mpivars_path + + if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi first_node=`head $hostfile -n 1` From 6e475fa2b0558df81ed291d0689065bb1e62d03a Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 22 Feb 2023 15:16:03 -0700 Subject: [PATCH 090/133] Also copy the node_ordering_by_rack.py file --- samples/prep_sample_files.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/prep_sample_files.sh b/samples/prep_sample_files.sh index fa10ee1a..0b9d78fb 100644 --- a/samples/prep_sample_files.sh +++ b/samples/prep_sample_files.sh @@ -9,5 +9,5 @@ done; cp nccl_compile/compile.sh /home/opc/ cp gpu/*.sbatch /home/opc/ - +cp /opt/oci-hpc/bin/node_ordering_by_rack.py /home/opc/ From 103a5a2055249bfd66056de41d082201250c3e81 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 22 Feb 2023 15:16:11 -0700 Subject: [PATCH 091/133] Add NCCL Readme --- samples/NCCL_readme | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 samples/NCCL_readme diff --git a/samples/NCCL_readme b/samples/NCCL_readme new file mode 100644 index 00000000..9279fd69 --- /dev/null +++ b/samples/NCCL_readme @@ -0,0 +1,11 @@ +To Run a NCCL test, run the following commands: +chmod 775 /opt/oci-hpc/samples/prep_sample_files.sh +/opt/oci-hpc/samples/prep_sample_files.sh + +SSH to one of the compute nodes and run: ~/compile.sh + +From the bastion, you can edit the third line of /home/opc/nccl_run_allreduce.sbatch with the number of nodes that you would like to test on: +sbatch /home/opc/nccl_run_allreduce.sbatch + +Look at the last line of the log for bandwidth. 
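As an illustrative sketch only (the sbatch header itself is not reproduced in this patch, so the exact directive is an assumption): the "third line" is expected to be the Slurm node-count directive, e.g. changing "#SBATCH --nodes=2" to "#SBATCH --nodes=4" before submitting with sbatch /home/opc/nccl_run_allreduce.sbatch. The bandwidth figure is typically the "# Avg bus bandwidth" value that nccl-tests prints at the end of the job output.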
+ From 9cdb58a61ea1890287bca192f04972c7d13d2732 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 22 Feb 2023 15:16:28 -0700 Subject: [PATCH 092/133] Find a generic MPI PATH --- samples/nccl_compile/compile.sh | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/samples/nccl_compile/compile.sh b/samples/nccl_compile/compile.sh index a6d85d6b..d7453bb0 100644 --- a/samples/nccl_compile/compile.sh +++ b/samples/nccl_compile/compile.sh @@ -2,13 +2,10 @@ # Run on 1 GPU node only -if [ -f /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh ]; then - source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh - MPI_HOME=/usr/mpi/gcc/openmpi-4.1.0rc5 -else - source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh - MPI_HOME=/usr/mpi/gcc/openmpi-4.0.3rc4 -fi +mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` +source $mpivars_path + +if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi cd /home/opc From c516c0bd2bf0c5b1093a1f6cdaf8783c9af6ff3f Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 23 Feb 2023 11:53:36 -0700 Subject: [PATCH 093/133] Add login node to the topology.conf --- playbooks/roles/slurm/templates/topology.conf.j2 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/playbooks/roles/slurm/templates/topology.conf.j2 b/playbooks/roles/slurm/templates/topology.conf.j2 index 7da9d362..66654ab1 100644 --- a/playbooks/roles/slurm/templates/topology.conf.j2 +++ b/playbooks/roles/slurm/templates/topology.conf.j2 @@ -1,4 +1,7 @@ ### Topology File +{% if (groups['login']| length ) > 0 %} +SwitchName=login-node Nodes={{ hostvars[groups['login'][0]]['ansible_fqdn'].split('.')[0] }} +{% endif %} {% for partition in queues %} {% for instance in partition.instance_types %} {% set size = instance.private_subnet | ipaddr('size')%} From fa8e7087833ac6f4348d709edc4c7fda428c4583 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 23 Feb 2023 17:46:33 -0700 Subject: [PATCH 094/133] Add support for A10 VMs --- playbooks/roles/slurm/templates/gres.conf.j2 | 4 ++++ playbooks/roles/slurm/templates/slurm.conf.j2 | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/playbooks/roles/slurm/templates/gres.conf.j2 b/playbooks/roles/slurm/templates/gres.conf.j2 index 1356a2ae..4a219a85 100644 --- a/playbooks/roles/slurm/templates/gres.conf.j2 +++ b/playbooks/roles/slurm/templates/gres.conf.j2 @@ -36,6 +36,10 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name {% elif instance.shape == "BM.GPU.A10.4" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A10 Cores=[32-63] +{% elif instance.shape == "VM.GPU.A10.2" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-29] +{% elif instance.shape == "VM.GPU.A10.1" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=A10 Cores=[0-14] {% endif %} {% endfor %} {% endfor %} \ No newline at end of file diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index 53856139..7ad0c47a 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -88,6 +88,10 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boar 
NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=32 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A10:2 {% elif instance.shape == "BM.GPU.A10.4" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=32 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A10:4 +{% elif instance.shape == "VM.GPU.A10.2" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=30 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A10:2 +{% elif instance.shape == "VM.GPU.A10.1" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=15 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A10:1 {% elif instance.shape == "VM.Standard.E3.Flex" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "VM.Standard.E4.Flex" %} From 3c5add5da0e49c3079454f282567ef94ca803f54 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 23 Feb 2023 17:47:29 -0700 Subject: [PATCH 095/133] Mount NFS client on login node and regroup play --- playbooks/site.yml | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/playbooks/site.yml b/playbooks/site.yml index 46ce44c5..abee7284 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -81,7 +81,7 @@ name: fss-home when: add_nfs|bool and home_fss|bool -- hosts: bastion, slurm_backup +- hosts: bastion, slurm_backup, login become: true tasks: - include_role: @@ -164,7 +164,7 @@ - include_role: name: mysql -- hosts: slurm_backup +- hosts: slurm_backup, login become: true vars: iscsi_ip: "{{ bastion_mount_ip }}" @@ -175,17 +175,6 @@ name: iscsi when: bastion_block|default(false)|bool -- hosts: login - become: true - vars: - iscsi_ip: "{{ bastion_mount_ip }}" - tasks: - - include_role: - name: passwords - - include_role: - name: iscsi - when: login_block|default(false)|bool - - hosts: nfs become: true vars: From b4f80237e5e1c1cd75fd5bbce291bb71d3827ecc Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 23 Feb 2023 17:48:54 -0700 Subject: [PATCH 096/133] Move apt to safe_yum --- playbooks/roles/cluster-cli/tasks/debian.yml | 11 ++++---- .../roles/nvidia-container/tasks/ubuntu.yml | 16 +++++------ .../roles/nvidia-enroot/tasks/ubuntu.yml | 17 +++++------- playbooks/roles/openldap/tasks/debian.yml | 10 ++++--- playbooks/roles/safe_yum/tasks/ubuntu.yml | 17 ++++++++++++ playbooks/roles/slurm/tasks/common.yml | 10 ++++--- playbooks/roles/spack/tasks/debian.yml | 27 +++++++++---------- playbooks/roles/sssd/tasks/debian.yml | 10 ++++--- 8 files changed, 68 insertions(+), 50 deletions(-) diff --git a/playbooks/roles/cluster-cli/tasks/debian.yml b/playbooks/roles/cluster-cli/tasks/debian.yml index 7e0f9e57..c1f5e422 
100644 --- a/playbooks/roles/cluster-cli/tasks/debian.yml +++ b/playbooks/roles/cluster-cli/tasks/debian.yml @@ -1,12 +1,13 @@ --- - - name: Install required packages - apt: - name: + vars: + package_name: - python3-click - python3-ldap3 - state: present - update_cache: yes + package_state: present + package_cache: true + include_role: + name: safe_yum - name: copy cluster cli copy: diff --git a/playbooks/roles/nvidia-container/tasks/ubuntu.yml b/playbooks/roles/nvidia-container/tasks/ubuntu.yml index 9bbc1537..49fe6d8a 100644 --- a/playbooks/roles/nvidia-container/tasks/ubuntu.yml +++ b/playbooks/roles/nvidia-container/tasks/ubuntu.yml @@ -13,15 +13,15 @@ owner: root group: root + - name: install packages - apt: - name: libnvidia-container-tools{{ libnvidia_container_tools_package_version | ternary("="+libnvidia_container_tools_package_version, "") }} - state: "{{ libnvidia_container_tools_package_state }}" - update_cache: yes - register: result - until: result is not failed - retries: 5 - delay: 5 + vars: + package_name: + - libnvidia-container-tools{{ libnvidia_container_tools_package_version | ternary("="+libnvidia_container_tools_package_version, "") }} + package_state: "{{ libnvidia_container_tools_package_state }}" + package_cache: true + include_role: + name: safe_yum - name: Install nvidia-container-toolkit vars: diff --git a/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml b/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml index 00828be1..c527a9f6 100644 --- a/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml +++ b/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml @@ -6,16 +6,13 @@ dpkg_arch: "{{ 'amd64' if ansible_architecture == 'x86_64' else ansible_architecture }}" - name: install required packages - apt: - deb: '{{ item }}' - with_items: - - "https://github.com/NVIDIA/enroot/releases/download/v3.4.0/enroot_3.4.0-1_{{ dpkg_arch }}.deb" - - "https://github.com/NVIDIA/enroot/releases/download/v3.4.0/enroot+caps_3.4.0-1_{{ dpkg_arch }}.deb" - register: result - until: result is not failed - retries: 5 - delay: 5 - + vars: + deb_name: + - "https://github.com/NVIDIA/enroot/releases/download/v3.4.0/enroot_3.4.0-1_{{ dpkg_arch }}.deb" + - "https://github.com/NVIDIA/enroot/releases/download/v3.4.0/enroot+caps_3.4.0-1_{{ dpkg_arch }}.deb" + package_state: present + include_role: + name: safe_yum - name: set kernel.unprivileged_userns_clone using sysctl ansible.posix.sysctl: name: kernel.unprivileged_userns_clone diff --git a/playbooks/roles/openldap/tasks/debian.yml b/playbooks/roles/openldap/tasks/debian.yml index 93f890c1..49215f6c 100644 --- a/playbooks/roles/openldap/tasks/debian.yml +++ b/playbooks/roles/openldap/tasks/debian.yml @@ -45,10 +45,12 @@ chars=ascii_letters,digits,hexdigits') }}" - name: Install the openldap and required Packages for Ubuntu - apt: - name: "{{ openldap_packages }}" - state: present - update_cache: yes + vars: + package_name: "{{ openldap_packages }}" + package_state: present + package_cache: true + include_role: + name: safe_yum - name: Hash OpenLDAP root password command: slappasswd -h {SSHA} -s {{ openldap_root_pwd }} diff --git a/playbooks/roles/safe_yum/tasks/ubuntu.yml b/playbooks/roles/safe_yum/tasks/ubuntu.yml index 1eca898a..6ad15a88 100755 --- a/playbooks/roles/safe_yum/tasks/ubuntu.yml +++ b/playbooks/roles/safe_yum/tasks/ubuntu.yml @@ -13,10 +13,27 @@ name: "{{package_name}}" state: "{{package_state | default('latest')}}" purge: "{{package_purge | default('false')}}" + update_cache: "{{package_cache | default('false')}}" register: result until: result 
is not failed retries: 5 delay: 5 + when: not deb_name is defined + +- name: "Installing/Removing {{package_name}}" + become: true + apt: + deb: "{{item}}" + state: "{{package_state | default('latest')}}" + purge: "{{package_purge | default('false')}}" + update_cache: "{{package_cache | default('false')}}" + register: result + until: result is not failed + retries: 5 + delay: 5 + when: deb_name is defined + with_items: "{{deb_name}}" + - name: Ensure apt process is completed become: true diff --git a/playbooks/roles/slurm/tasks/common.yml b/playbooks/roles/slurm/tasks/common.yml index 53afa054..24287a59 100755 --- a/playbooks/roles/slurm/tasks/common.yml +++ b/playbooks/roles/slurm/tasks/common.yml @@ -92,10 +92,12 @@ run_once: true - name: Install .deb - become: true - apt: - deb: "{{ download_path }}/slurm_rpms/slurm-22.05.4-1_amd64.deb" - state: present + vars: + deb_name: + - "{{ download_path }}/slurm_rpms/slurm-22.05.4-1_amd64.deb" + package_state: present + include_role: + name: safe_yum when: ansible_os_family == 'Debian' - name: install SLURM common packages RedHat diff --git a/playbooks/roles/spack/tasks/debian.yml b/playbooks/roles/spack/tasks/debian.yml index 45006236..841fafc7 100644 --- a/playbooks/roles/spack/tasks/debian.yml +++ b/playbooks/roles/spack/tasks/debian.yml @@ -1,24 +1,21 @@ --- - name: install GIT - apt: - name: git - state: latest - become: true + vars: + package_name: + - git + package_state: latest + include_role: + name: safe_yum when: cluster_nfs - register: result - until: result is not failed - retries: 5 - delay: 5 - name: Development Tools" - apt: - name: build-essential - become: true + vars: + package_name: + - build-essential + package_state: latest + include_role: + name: safe_yum when: cluster_nfs - register: result - until: result is not failed - retries: 5 - delay: 5 - name: Clone SPACK git: diff --git a/playbooks/roles/sssd/tasks/debian.yml b/playbooks/roles/sssd/tasks/debian.yml index b5f8ec9d..9e0c1d71 100644 --- a/playbooks/roles/sssd/tasks/debian.yml +++ b/playbooks/roles/sssd/tasks/debian.yml @@ -46,10 +46,12 @@ replace: 'password [success=1 user_unknown=ignore default=die] pam_ldap.so try_first_pass' - name: Install the openldap and required Packages for Ubuntu - apt: - name: "{{ openldap_packages }}" - state: present - update_cache: yes + vars: + package_name: "{{ openldap_packages }}" + package_state: present + package_cache: true + include_role: + name: safe_yum - name: Update sshd configuration lineinfile: From 811958ca2154c4258fab912ade79a1c251794d20 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 23 Feb 2023 17:49:21 -0700 Subject: [PATCH 097/133] add sudo priviledge for moving file --- playbooks/roles/rack-aware/tasks/el.yml | 2 ++ playbooks/roles/rack-aware/tasks/ubuntu.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/playbooks/roles/rack-aware/tasks/el.yml b/playbooks/roles/rack-aware/tasks/el.yml index adedeaa3..56bab6c8 100644 --- a/playbooks/roles/rack-aware/tasks/el.yml +++ b/playbooks/roles/rack-aware/tasks/el.yml @@ -25,6 +25,7 @@ - name: Copy node_ordering_by_rack.py block: - name: copy node_ordering_by_rack.py + become: true copy: src: node_ordering_by_rack.py dest: /opt/oci-hpc/bin/ @@ -33,6 +34,7 @@ mode: '0755' rescue: - name: copy node_ordering_by_rack.py + become: true copy: src: node_ordering_by_rack.py dest: /opt/oci-hpc/bin/ diff --git a/playbooks/roles/rack-aware/tasks/ubuntu.yml b/playbooks/roles/rack-aware/tasks/ubuntu.yml index c80a4cbb..1d0cc93f 100644 --- 
a/playbooks/roles/rack-aware/tasks/ubuntu.yml +++ b/playbooks/roles/rack-aware/tasks/ubuntu.yml @@ -21,6 +21,7 @@ - name: Copy node_ordering_by_rack.py block: - name: copy node_ordering_by_rack.py + become: true copy: src: node_ordering_by_rack.py dest: /opt/oci-hpc/bin/ @@ -29,6 +30,7 @@ mode: '0755' rescue: - name: copy node_ordering_by_rack.py + become: true copy: src: node_ordering_by_rack.py dest: /opt/oci-hpc/bin/ From 057671328ca5f9277dfca117a71bf32aa746ccc7 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 23 Feb 2023 17:49:52 -0700 Subject: [PATCH 098/133] Use function to avoid code duplication --- bin/bastion.sh | 108 +++++++++++++++++-------------------------------- 1 file changed, 37 insertions(+), 71 deletions(-) diff --git a/bin/bastion.sh b/bin/bastion.sh index 40c427c8..e0aa30e3 100644 --- a/bin/bastion.sh +++ b/bin/bastion.sh @@ -38,6 +38,19 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 93C4A3FD7BB9C367 fi + # checking here as well to be sure that the lock file is not being held + function fix_apt { + apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` + apt_process=$(( apt_process -1 )) + while [ $apt_process -ge 1 ] + do + echo "wait until apt update is done" + sleep 10s + ps aux | grep "apt update" | grep -v grep + apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` + apt_process=$(( apt_process -1 )) + done + } sudo sed -i 's/"1"/"0"/g' /etc/apt/apt.conf.d/20auto-upgrades sudo apt purge -y --auto-remove unattended-upgrades sudo systemctl disable apt-daily-upgrade.timer @@ -45,48 +58,17 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then sudo systemctl disable apt-daily.timer sudo systemctl mask apt-daily.service - apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` - apt_process=$(( apt_process -1 )) - while [ $apt_process -ge 1 ] - do - echo "wait until apt update is done" - sleep 10s - ps aux | grep "apt update" | grep -v grep - apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` - apt_process=$(( apt_process -1 )) - done + sleep 10s sudo apt-mark hold linux-oracle linux-headers-oracle linux-image-oracle - # checking here as well to be sure that the lock file is not being held - apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` - apt_process=$(( apt_process -1 )) - while [ $apt_process -ge 1 ] - do - echo "wait until apt update is done" - sleep 10s - ps aux | grep "apt update" | grep -v grep - apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` - apt_process=$(( apt_process -1 )) - done - + fix_apt sleep 10s - sudo apt -y --fix-broken install - # checking here as well to be sure that the lock file is not being held - apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` - apt_process=$(( apt_process -1 )) - while [ $apt_process -ge 1 ] - do - echo "wait until apt update is done" - sleep 10s - ps aux | grep "apt update" | grep -v grep - apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` - apt_process=$(( apt_process -1 )) - done + fix_apt wget -O- https://apt.releases.hashicorp.com/gpg | \ @@ -98,45 +80,29 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then sudo tee /etc/apt/sources.list.d/hashicorp.list sudo apt-get -y install terraform - # checking here as well to be sure that the lock file is not being held - apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` - apt_process=$(( apt_process -1 )) - while [ $apt_process -ge 1 ] - do - echo "wait until apt update is 
done" - sleep 10s - ps aux | grep "apt update" | grep -v grep - apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` - apt_process=$(( apt_process -1 )) - done - + output=$? + if [ $output -ne 0 ] + then + fix_apt + sudo apt-get -y install terraform + fi + fix_apt sudo apt-get -y install ansible - - # checking here as well to be sure that the lock file is not being held - apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` - apt_process=$(( apt_process -1 )) - while [ $apt_process -ge 1 ] - do - echo "wait until apt update is done" - sleep 10s - ps aux | grep "apt update" | grep -v grep - apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` - apt_process=$(( apt_process -1 )) - done - + output=$? + if [ $output -ne 0 ] + then + fix_apt + sudo apt-get -y install ansible + fi + fix_apt sudo apt-get -y install python python-netaddr python3 python3-pip - - # checking here as well to be sure that the lock file is not being held - apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` - apt_process=$(( apt_process -1 )) - while [ $apt_process -ge 1 ] - do - echo "wait until apt update is done" - sleep 10s - ps aux | grep "apt update" | grep -v grep - apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` - apt_process=$(( apt_process -1 )) - done + output=$? + if [ $output -ne 0 ] + then + fix_apt + sudo apt-get -y install python python-netaddr python3 python3-pip + fi + fix_apt pip install pip --upgrade pip install pyopenssl --upgrade From 663496ff0a651f5458fb06abd0e2e3f4dec65d22 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Fri, 24 Feb 2023 16:28:27 -0700 Subject: [PATCH 099/133] Fix Slurm bug with login and backup node --- playbooks/roles/slurm/tasks/compute-rack-aware.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/slurm/tasks/compute-rack-aware.yml b/playbooks/roles/slurm/tasks/compute-rack-aware.yml index 3d8a23f0..6da70b8f 100755 --- a/playbooks/roles/slurm/tasks/compute-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/compute-rack-aware.yml @@ -140,7 +140,7 @@ - name: Get hostlist if switch exists vars: - new_line: "{% for node in ( play_hosts | difference(groups['bastion']) | difference(groups['(groups['login'])']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" + new_line: "{% for node in ( play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) | difference(groups['login']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" command: "scontrol show hostlistsorted {{ item.stdout_lines | union (new_line[:-1].split(',') | list )| join(',') }}" register: rack_hostlist1 delegate_to: 127.0.0.1 From e9ffd64e4f857c1d96217073ce6c70951993834d Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Fri, 24 Feb 2023 22:43:33 -0700 Subject: [PATCH 100/133] Fix the terraform install for Ubuntu --- bin/bastion.sh | 55 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/bin/bastion.sh b/bin/bastion.sh index e0aa30e3..8d6a83f7 100644 --- a/bin/bastion.sh +++ b/bin/bastion.sh @@ -33,11 +33,6 @@ if [ $ID == "ol" ] || [ $ID == "centos" ] ; then sudo yum install -y terraform elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then - if [ $ID == "debian" ] && [ $VERSION_ID == "9" ] ; then - echo deb http://ppa.launchpad.net/ansible/ansible/ubuntu trusty main | 
sudo tee -a /etc/apt/sources.list - sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 93C4A3FD7BB9C367 - fi - # checking here as well to be sure that the lock file is not being held function fix_apt { apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` @@ -51,6 +46,14 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then apt_process=$(( apt_process -1 )) done } + fix_apt + + if [ $ID == "debian" ] && [ $VERSION_ID == "9" ] ; then + echo deb http://ppa.launchpad.net/ansible/ansible/ubuntu trusty main | sudo tee -a /etc/apt/sources.list + sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 93C4A3FD7BB9C367 + fi + + sudo sed -i 's/"1"/"0"/g' /etc/apt/apt.conf.d/20auto-upgrades sudo apt purge -y --auto-remove unattended-upgrades sudo systemctl disable apt-daily-upgrade.timer @@ -70,23 +73,6 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then fix_apt - - wget -O- https://apt.releases.hashicorp.com/gpg | \ - gpg --dearmor | \ - sudo tee /usr/share/keyrings/hashicorp-archive-keyring.gpg - - echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] \ - https://apt.releases.hashicorp.com $(lsb_release -cs) main" | \ - sudo tee /etc/apt/sources.list.d/hashicorp.list - - sudo apt-get -y install terraform - output=$? - if [ $output -ne 0 ] - then - fix_apt - sudo apt-get -y install terraform - fi - fix_apt sudo apt-get -y install ansible output=$? if [ $output -ne 0 ] @@ -113,6 +99,31 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then # install oci module pip install oci + wget -O- https://apt.releases.hashicorp.com/gpg | \ + gpg --dearmor | \ + sudo tee /usr/share/keyrings/hashicorp-archive-keyring.gpg + + echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] \ + https://apt.releases.hashicorp.com $(lsb_release -cs) main" | \ + sudo tee /etc/apt/sources.list.d/hashicorp.list + + sudo apt update && sudo apt install terraform + output=$? 
+ if [ $output -ne 0 ] + then + fix_apt + echo "Terraform second try" + wget -O- https://apt.releases.hashicorp.com/gpg | \ + gpg --dearmor | \ + sudo tee /usr/share/keyrings/hashicorp-archive-keyring.gpg + + echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] \ + https://apt.releases.hashicorp.com $(lsb_release -cs) main" | \ + sudo tee /etc/apt/sources.list.d/hashicorp.list + + sudo apt update && sudo apt install terraform + fi + fix_apt fi ansible-galaxy collection install ansible.netcommon:=2.5.1 --force > /dev/null From 3082336e5e88f4d262a64218fc4a25c78f1985f2 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 27 Feb 2023 13:57:12 -0700 Subject: [PATCH 101/133] Add login node to /etc/hosts role --- playbooks/new_nodes.yml | 2 +- playbooks/resize_add.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index d24c173b..8c24a80a 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -49,7 +49,7 @@ name: rdma-interface when: cluster_network|bool -- hosts: bastion,slurm_backup,compute +- hosts: bastion,slurm_backup,login,compute become: true vars: destroy: false diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 296d017c..11ed903e 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -47,7 +47,7 @@ name: rdma-interface when: cluster_network|bool -- hosts: bastion,slurm_backup,compute +- hosts: bastion,slurm_backup,login,compute become: true vars: destroy: false From ad46a54617ed60bd3a38896e90c0fe86d2204a37 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Mon, 27 Feb 2023 16:52:11 -0800 Subject: [PATCH 102/133] add MPI_HOME to compile NCCL tests --- samples/nccl_compile/compile.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/nccl_compile/compile.sh b/samples/nccl_compile/compile.sh index d7453bb0..df8f46b8 100644 --- a/samples/nccl_compile/compile.sh +++ b/samples/nccl_compile/compile.sh @@ -7,6 +7,7 @@ source $mpivars_path if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi +MPI_HOME=${mpivars_path%%/bin*} cd /home/opc git clone https://github.com/NVIDIA/nccl-tests.git From beea1deac41676ec8482cfaddbc284708abd6fa2 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 28 Feb 2023 17:26:31 -0700 Subject: [PATCH 103/133] Fix Typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b487add0..eb1771e2 100644 --- a/README.md +++ b/README.md @@ -265,7 +265,7 @@ Example: The name of the cluster must be queueName-clusterNumber-instanceType_keyword -The keyword will need to match the one from /opt/oci-hpc/conf/queues.conf to be regirstered in Slurm +The keyword will need to match the one from /opt/oci-hpc/conf/queues.conf to be registered in Slurm ### Cluster Deletion: ``` From adc0ba38f006e4ed468904a7d1d82c54500d0582 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 28 Feb 2023 18:21:49 -0700 Subject: [PATCH 104/133] Fix RDMA issues with subnet size --- autoscaling/tf_init/bastion_update.tf | 2 ++ autoscaling/tf_init/inventory.tpl | 4 ++-- bastion.tf | 1 + conf/variables.tpl | 1 + slurm_ha.tf | 1 + variables.tf | 2 +- 6 files changed, 8 insertions(+), 3 deletions(-) diff --git a/autoscaling/tf_init/bastion_update.tf b/autoscaling/tf_init/bastion_update.tf index f4dd0faf..316af6ff 100755 --- a/autoscaling/tf_init/bastion_update.tf +++ b/autoscaling/tf_init/bastion_update.tf @@ -27,6 +27,8 @@ resource "local_file" "inventory" { compute = 
var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = var.public_subnet, private_subnet = var.private_subnet, + rdma_network = cidrhost(var.rdma_subnet, 0), + rdma_netmask = cidrnetmask(var.rdma_subnet), nfs = var.use_scratch_nfs ? local.cluster_instances_names[0] : "", scratch_nfs = var.use_scratch_nfs, cluster_nfs = var.use_cluster_nfs, diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index c02b0548..9d2c062d 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -21,8 +21,8 @@ compute [all:vars] ansible_connection=ssh ansible_user=${compute_username} -rdma_network=192.168.128.0 -rdma_netmask=255.255.240.0 +rdma_network=${rdma_network} +rdma_netmask=${rdma_netmask} public_subnet=${public_subnet} private_subnet=${private_subnet} nvme_path=/mnt/localdisk/ diff --git a/bastion.tf b/bastion.tf index 578d7b97..172d630f 100644 --- a/bastion.tf +++ b/bastion.tf @@ -343,6 +343,7 @@ resource "null_resource" "cluster" { public_subnet_id = local.bastion_subnet_id, private_subnet = data.oci_core_subnet.private_subnet.cidr_block, private_subnet_id = local.subnet_id, + rdma_subnet = var.rdma_subnet, nfs = var.node_count > 0 ? local.cluster_instances_names[0] : "", scratch_nfs = var.use_scratch_nfs && var.node_count > 0, scratch_nfs_path = var.scratch_nfs_path, diff --git a/conf/variables.tpl b/conf/variables.tpl index 2744d019..9a100245 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -24,6 +24,7 @@ variable "public_subnet_id" { default = "${public_subnet_id}"} variable "public_subnet" {default = "${public_subnet}"} variable "private_subnet_id" { default = "##PRIVATE_SUBNET_ID##"} variable "private_subnet" {default = "##PRIVATE_SUBNET##"} +variable "rdma_subnet" { default = "${rdma_subnet}" } variable "slurm" { default = ${slurm} } variable "rack_aware" { default = ${rack_aware} } variable "pyxis" { default = ${pyxis} } diff --git a/slurm_ha.tf b/slurm_ha.tf index 04f64f1b..dfa9b507 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -337,6 +337,7 @@ resource "null_resource" "cluster_backup" { public_subnet_id = local.bastion_subnet_id, private_subnet = data.oci_core_subnet.private_subnet.cidr_block, private_subnet_id = local.subnet_id, + rdma_subnet = var.rdma_subnet, nfs = var.node_count > 0 && var.use_scratch_nfs ? 
local.cluster_instances_names[0] : "", scratch_nfs = var.use_scratch_nfs && var.node_count > 0, scratch_nfs_path = var.scratch_nfs_path, diff --git a/variables.tf b/variables.tf index d0774658..8c3862ab 100755 --- a/variables.tf +++ b/variables.tf @@ -44,7 +44,7 @@ variable "private_subnet_id" { default = ""} variable "vcn_subnet" { default = "172.16.0.0/21" } variable "public_subnet" { default = "172.16.0.0/24" } variable "additional_subnet" { default = "172.16.1.0/24" } -variable "rdma_subnet" { default = "192.168.168.0/22" } +variable "rdma_subnet" { default = "192.168.0.0/16" } variable "private_subnet" { default = "172.16.4.0/22" } variable "ssh_cidr" { default = "0.0.0.0/0" } variable "slurm" { default = false } From 9f033a25a1776f9a0d4db86c8bf3289d7bc0fb7e Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 28 Feb 2023 18:22:14 -0700 Subject: [PATCH 105/133] mysql-connector needs v8.0.31 for python36 --- playbooks/roles/autoscaling_mon/files/initial.sh | 2 +- playbooks/roles/autoscaling_mon/tasks/el.yml | 2 +- playbooks/roles/autoscaling_mon/tasks/ubuntu.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/playbooks/roles/autoscaling_mon/files/initial.sh b/playbooks/roles/autoscaling_mon/files/initial.sh index 77d5198a..b4c8bc87 100644 --- a/playbooks/roles/autoscaling_mon/files/initial.sh +++ b/playbooks/roles/autoscaling_mon/files/initial.sh @@ -3,7 +3,7 @@ sudo yum install -y grafana-7.5.0-1.x86_64.rpm sudo yum install -y https://dev.mysql.com/get/mysql80-community-release-el7-3.noarch.rpm sudo yum install -y mysql-shell sudo pip3 install protobuf==3.19.4 -sudo pip3 install mysql-connector-python +sudo pip3 install mysql-connector-python==8.0.31 sudo systemctl daemon-reload sudo systemctl start grafana-server sudo systemctl status grafana-server diff --git a/playbooks/roles/autoscaling_mon/tasks/el.yml b/playbooks/roles/autoscaling_mon/tasks/el.yml index 9d9946d4..5de05e2d 100755 --- a/playbooks/roles/autoscaling_mon/tasks/el.yml +++ b/playbooks/roles/autoscaling_mon/tasks/el.yml @@ -213,6 +213,6 @@ - name: install protobuf v3.19.4 and mysql connector become: true pip: - name: [protobuf==3.19.4,mysql-connector-python] + name: [protobuf==3.19.4,mysql-connector-python==8.0.31] executable: pip3 ignore_errors: yes \ No newline at end of file diff --git a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml index 0224d4bc..11db136f 100644 --- a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml +++ b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml @@ -251,6 +251,6 @@ vars: ansible_python_interpreter: /usr/bin/python3 pip: - name: [protobuf==3.19.4,mysql-connector-python] + name: [protobuf==3.19.4,mysql-connector-python==8.0.31] executable: pip3 ignore_errors: yes \ No newline at end of file From 346e7e8f026e60f02675eaeea5bb1092ddc1b31b Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 1 Mar 2023 11:48:57 -0700 Subject: [PATCH 106/133] Add check to not run the resize as root --- bin/resize.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bin/resize.sh b/bin/resize.sh index ba956ce2..2d436216 100755 --- a/bin/resize.sh +++ b/bin/resize.sh @@ -9,6 +9,13 @@ autoscaling_folder=$folder/../autoscaling monitoring_folder=$folder/../monitoring logs_folder=$folder/../logs +currentuser=`whoami` +if [ "$currentuser" == "root" ] +then + echo "Run this script as opc or ubuntu and not as root" + exit +fi + if [ $# -eq 0 ] then python3 $folder/resize.py --help From f579256d79d9293d2c99475ac2f3a86dcb921bdc Mon 
Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Wed, 1 Mar 2023 12:39:09 -0800 Subject: [PATCH 107/133] updated nccl compile and nccl sbatch to work on ubuntu as well. corrected pcie in validation script. --- bin/validation.py | 4 ++-- samples/gpu/nccl_run_allreduce.sbatch | 23 +++++++++++++++++++---- samples/nccl_compile/compile.sh | 20 ++++++++++++++++---- 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/bin/validation.py b/bin/validation.py index 45b37ac2..8c28b2a1 100644 --- a/bin/validation.py +++ b/bin/validation.py @@ -419,12 +419,12 @@ def pcie_check(hostfile, path): if "Linux" in os_name[0]: out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/pcie_el.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) stdout,stderr = out.communicate() - out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/pcie.sh\" ; done > "+path+"/pcie-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/pcie_el.sh\" ; done > "+path+"/pcie-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) stdout,stderr = out.communicate() elif "Ubuntu" in os_name[0]: out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/pcie_ubuntu.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) stdout,stderr = out.communicate() - out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/pcie.sh\" ; done > "+path+"/pcie-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/pcie_ubuntu.sh\" ; done > "+path+"/pcie-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) stdout,stderr = out.communicate() else: print("Cannot run pcie check as OS is not determined to be Linux or Ubuntu") diff --git a/samples/gpu/nccl_run_allreduce.sbatch b/samples/gpu/nccl_run_allreduce.sbatch index 533d6cf7..bbcfa484 100644 --- a/samples/gpu/nccl_run_allreduce.sbatch +++ b/samples/gpu/nccl_run_allreduce.sbatch @@ -19,7 +19,15 @@ scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE echo MACHINEFILE cat $MACHINEFILE -python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null +source /etc/os-release +if [ $ID == "ol" ] || [ $ID == "centos" ] ; then + python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null + USER=opc +elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then + python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null + USER=ubuntu +fi + echo ORDEREDMACHINEFILE cat $ORDEREDMACHINEFILE @@ -27,9 +35,15 @@ echo ORDEREDRANKMACHINEFILE cat $ORDEREDRANKMACHINEFILE mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` -source $mpivars_path -if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi +if [[ "$mpivars_path" == "" ]]; then + mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh` +fi + +if [[ "$mpivars_path" == "" ]]; then + echo "Could not find MPIPATH"; exit; fi + +source $mpivars_path export NCCL_DEBUG=WARN @@ -48,6 +62,7 @@ then fi mpirun --mca pml ucx \ --bind-to numa \ + --mca coll ^hcoll \ -x NCCL_DEBUG=WARN \ -x NCCL_IB_SL=0 \ -x NCCL_IB_TC=41 \ @@ -59,6 +74,6 @@ fi -x NCCL_IB_GID_INDEX=3 \ -x NCCL_ALGO=Ring \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ - --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) 
--rankfile $ORDEREDRANKMACHINEFILE /home/opc/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 + --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /home/$USER/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 diff --git a/samples/nccl_compile/compile.sh b/samples/nccl_compile/compile.sh index df8f46b8..dbf37e8a 100644 --- a/samples/nccl_compile/compile.sh +++ b/samples/nccl_compile/compile.sh @@ -1,15 +1,27 @@ -#!/bin/bash +#!/bin/bash # Run on 1 GPU node only mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` -source $mpivars_path -if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi +if [[ "$mpivars_path" == "" ]]; then + mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh` +fi + +if [[ "$mpivars_path" == "" ]]; then + echo "Could not find MPIPATH"; exit; fi +source $mpivars_path MPI_HOME=${mpivars_path%%/bin*} -cd /home/opc +source /etc/os-release +if [ $ID == "ol" ] || [ $ID == "centos" ] ; then + cd /home/opc +elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then + cd /home/ubuntu +fi + + git clone https://github.com/NVIDIA/nccl-tests.git cd nccl-tests/ make MPI=1 MPI_HOME=$MPI_HOME CUDA_HOME=/usr/local/cuda From b6d03274b7f2a3c12973da5e620a4de663081edf Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 1 Mar 2023 15:05:08 -0700 Subject: [PATCH 108/133] Add check to not run scripts as root --- bin/cleanup.sh | 6 ++++++ bin/create_cluster.sh | 7 +++++++ bin/delete_cluster.sh | 7 +++++++ bin/resize.sh | 3 +-- 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/bin/cleanup.sh b/bin/cleanup.sh index 3df85c4f..54725e88 100755 --- a/bin/cleanup.sh +++ b/bin/cleanup.sh @@ -6,6 +6,12 @@ folder=`dirname $scripts` playbooks_path=$folder/../playbooks/ inventory_path=$folder/../autoscaling/clusters/$1 +if [ $EUID -eq 0 ] +then + echo "Run this script as opc or ubuntu and not as root" + exit +fi + ssh_options="-i ~/.ssh/id_rsa -o StrictHostKeyChecking=no" iplist=`cat $inventory_path/inventory | awk '{print $2}' | sed 's/ansible_host=//'` if [[ "$2" == "FORCE" ]] diff --git a/bin/create_cluster.sh b/bin/create_cluster.sh index e88a2e69..0cdba575 100755 --- a/bin/create_cluster.sh +++ b/bin/create_cluster.sh @@ -17,6 +17,13 @@ then else debug=0 fi + +if [ $EUID -eq 0 ] +then + echo "Run this script as opc or ubuntu and not as root" + exit +fi + date=`date '+%Y%m%d%H%M'` scripts=`realpath $0` folder=`dirname $scripts` diff --git a/bin/delete_cluster.sh b/bin/delete_cluster.sh index f771e1da..7328a206 100755 --- a/bin/delete_cluster.sh +++ b/bin/delete_cluster.sh @@ -5,6 +5,13 @@ then echo "No arguments supplied" exit fi + +if [ $EUID -eq 0 ] +then + echo "Run this script as opc or ubuntu and not as root" + exit +fi + date=`date -u '+%Y%m%d%H%M'` start=`date -u +%s` start_timestamp=`date -u +'%F %T'` diff --git a/bin/resize.sh b/bin/resize.sh index 2d436216..d2082db8 100755 --- a/bin/resize.sh +++ b/bin/resize.sh @@ -9,8 +9,7 @@ autoscaling_folder=$folder/../autoscaling monitoring_folder=$folder/../monitoring logs_folder=$folder/../logs -currentuser=`whoami` -if [ "$currentuser" == "root" ] +if [ $EUID -eq 0 ] then echo "Run this script as opc or ubuntu and not as root" exit From c36e8dac298b06648a9d3ec54a5cc7649a789099 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 6 Mar 2023 08:49:14 -0700 Subject: [PATCH 109/133] Change to latest TF provider --- autoscaling/tf_init/versions.tf | 2 +- versions.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/autoscaling/tf_init/versions.tf b/autoscaling/tf_init/versions.tf index 44a6c867..c5c30733 100755 --- a/autoscaling/tf_init/versions.tf +++ b/autoscaling/tf_init/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "4.99.0" + version = "4.110.0" } } } \ No newline at end of file diff --git a/versions.tf b/versions.tf index 44a6c867..c5c30733 100755 --- a/versions.tf +++ b/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "4.99.0" + version = "4.110.0" } } } \ No newline at end of file From 8abe82d416bcf9b655dc6e430559dc6ecbd19386 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 6 Mar 2023 08:49:28 -0700 Subject: [PATCH 110/133] Add autodectect only on GPU nodes --- playbooks/roles/slurm/templates/gres.conf.j2 | 51 ++++++++++---------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/playbooks/roles/slurm/templates/gres.conf.j2 b/playbooks/roles/slurm/templates/gres.conf.j2 index 4a219a85..f241cdd9 100644 --- a/playbooks/roles/slurm/templates/gres.conf.j2 +++ b/playbooks/roles/slurm/templates/gres.conf.j2 @@ -1,45 +1,44 @@ {% set size = hostvars[inventory_hostname]['private_subnet'] | ipaddr('size')%} {% for partition in queues %} {% for instance in partition.instance_types %} -AutoDetect=nvml {% if instance.shape == "BM.GPU2.2"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-13] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia1 Type=P100 Cores=[14-27] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-13] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia1 Type=P100 Cores=[14-27] AutoDetect=nvml {% elif instance.shape == "VM.GPU2.1"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-11] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-11] AutoDetect=nvml {% elif instance.shape == "VM.GPU3.1"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=V100 Cores=[0-5] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=V100 Cores=[0-5] AutoDetect=nvml {% elif instance.shape == "VM.GPU3.2"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=V100 Cores=[0-11] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=V100 Cores=[0-11] AutoDetect=nvml {% elif instance.shape == "VM.GPU3.4"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-3] Type=V100 Cores=[0-23] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-3] Type=V100 Cores=[0-23] AutoDetect=nvml {% elif instance.shape == "BM.GPU3.8"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-3] Type=V100 Cores=[0-25] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-7] Type=V100 Cores=[26-51] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] 
Name=gpu File=/dev/nvidia[0-3] Type=V100 Cores=[0-25] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-7] Type=V100 Cores=[26-51] AutoDetect=nvml {% elif instance.shape == "BM.GPU4.8"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] AutoDetect=nvml {% elif instance.shape == "BM.GPU.B4.8"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] AutoDetect=nvml {% elif instance.shape == "BM.GPU.A100-v2.8"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] 
AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] AutoDetect=nvml {% elif instance.shape == "BM.GPU.T1.2" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] AutoDetect=nvml {% elif instance.shape == "BM.GPU.A10.4" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A10 Cores=[32-63] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A10 Cores=[32-63] AutoDetect=nvml {% elif instance.shape == "VM.GPU.A10.2" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-29] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-29] AutoDetect=nvml {% elif instance.shape == "VM.GPU.A10.1" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=A10 Cores=[0-14] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=A10 Cores=[0-14] AutoDetect=nvml {% endif %} {% endfor %} {% endfor %} \ No newline at end of file From 4c9c2fe3eedcab28d58c0bd3d60c76e3cf836fdd Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 7 Mar 2023 14:43:59 -0700 Subject: [PATCH 111/133] Do not check if nodes to delete are unreachable --- bin/resize.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bin/resize.py b/bin/resize.py index f1c6d84a..50401061 100644 --- a/bin/resize.py +++ b/bin/resize.py @@ -655,7 +655,11 @@ def updateTFState(inventory,cluster_name,size): print("STDOUT: No list of nodes were specified and no unreachable nodes were found") exit(1) else: - reachable_instances,unreachable_instances=getreachable(inventory_instances,username,delay=10) + inventory_instances_to_test = [] + for instance_to_test in inventory_instances: + if not instance_to_test['display_name'] in hostnames: + inventory_instances_to_test.append(instance_to_test) + reachable_instances,unreachable_instances=getreachable(inventory_instances_to_test,username,delay=10) hostnames_to_remove=hostnames if len(unreachable_instances): print("STDOUT: At least one unreachable node is in the inventory and was not mentionned with OCI hostname to be removed. 
Trying anyway") From 0f324780f4d81ae591aeb2db88c41d6ed66e871a Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Thu, 9 Mar 2023 15:02:03 -0800 Subject: [PATCH 112/133] update /etc/hosts on backup slurm controller and login --- playbooks/resize_remove.yml | 2 +- playbooks/roles/etc-hosts/tasks/common.yml | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/playbooks/resize_remove.yml b/playbooks/resize_remove.yml index e3dcecab..50a694e9 100755 --- a/playbooks/resize_remove.yml +++ b/playbooks/resize_remove.yml @@ -1,4 +1,4 @@ -- hosts: bastion, slurm_backup, compute +- hosts: bastion, slurm_backup, compute, login become: true gather_facts: true vars: diff --git a/playbooks/roles/etc-hosts/tasks/common.yml b/playbooks/roles/etc-hosts/tasks/common.yml index 7fffd1bf..4a128bc0 100644 --- a/playbooks/roles/etc-hosts/tasks/common.yml +++ b/playbooks/roles/etc-hosts/tasks/common.yml @@ -52,13 +52,13 @@ run_once: true when: not destroy|bool and groups['compute']|length > 0 -- name: move /etc/hosts on backup slurm +- name: move /etc/hosts on backup slurm and login node become: true copy: dest: /etc/hosts src: /etc/hosts force: yes - when: ( not destroy|bool ) and ('slurm_backup' in group_names) + when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names)) - name: move /etc/hosts on all compute nodes become: true @@ -66,7 +66,7 @@ dest: /etc/hosts src: /tmp/hosts.etc.{{ cluster_name }} force: yes - when: ( not destroy|bool ) and (not 'bastion' in group_names) and (not 'slurm_backup' in group_names) + when: ( not destroy|bool ) and (not 'bastion' in group_names) and (not 'slurm_backup' in group_names) and (not 'login' in group_names) - name: remove cluster from etc-host become: true @@ -86,4 +86,14 @@ state: absent delegate_to: "{{ groups['slurm_backup'][0] }}" run_once: true - when: destroy|bool and (groups['slurm_backup']|length > 0)|bool \ No newline at end of file + when: destroy|bool and (groups['slurm_backup']|length > 0)|bool + +- name: remove cluster from etc-host on login + become: true + blockinfile: + dest: /etc/hosts + marker: "# {mark} ANSIBLE MANAGED BLOCK {{ cluster_name }}" + state: absent + delegate_to: "{{ groups['login'][0] }}" + run_once: true + when: destroy|bool and (groups['login']|length > 0)|bool \ No newline at end of file From 0a9325fe6d57bf90cd9680620867f8f91aec9ced Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 10 Mar 2023 11:30:00 -0800 Subject: [PATCH 113/133] update login and backup when removing nodes --- playbooks/resize_remove_unreachable.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/resize_remove_unreachable.yml b/playbooks/resize_remove_unreachable.yml index 1760ab7d..835bd18d 100644 --- a/playbooks/resize_remove_unreachable.yml +++ b/playbooks/resize_remove_unreachable.yml @@ -1,4 +1,4 @@ -- hosts: bastion, compute +- hosts: bastion, compute, backup, login become: true gather_facts: true vars: From 202538634b802904086f0942e5bbc64e5c11db6a Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 13 Mar 2023 13:38:23 -0600 Subject: [PATCH 114/133] Fix Enroot when the localdisk is not mounted. 
--- .../nvidia-enroot/tasks/oraclelinux-7.yml | 25 +++++++++++++------ .../roles/nvidia-enroot/tasks/ubuntu.yml | 25 +++++++++++++------ playbooks/roles/slurm/tasks/common_pyxis.yml | 11 ++++++++ playbooks/roles/slurm/templates/prolog.sh.j2 | 2 +- 4 files changed, 46 insertions(+), 17 deletions(-) diff --git a/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml b/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml index bb584fdd..55367740 100644 --- a/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml +++ b/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml @@ -36,13 +36,22 @@ - name: execute enroot-check_*.run command: bash -c "/tmp/enroot-check_*.run --verify" + - name: + set_fact: + - enroot_top_path_checked: "/etc/enroot/" + when: " not 'nvme0n1' in hostvars[inventory_hostname].ansible_devices" + + - name: + set_fact: + - enroot_top_path_checked: "{{enroot_top_path}}" + when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" - name: update ENROOT_RUNTIME_PATH lineinfile: dest: /etc/enroot/enroot.conf state: present regexp: '^#ENROOT_RUNTIME_PATH.*' - line: 'ENROOT_RUNTIME_PATH {{enroot_top_path}}/enroot_runtime/user-$(id -u)' + line: 'ENROOT_RUNTIME_PATH {{enroot_top_path_checked}}/enroot_runtime/user-$(id -u)' backup: yes - name: update ENROOT_DATA_PATH @@ -50,7 +59,7 @@ dest: /etc/enroot/enroot.conf state: present regexp: '^#ENROOT_DATA_PATH.*' - line: 'ENROOT_DATA_PATH {{enroot_top_path}}/enroot_data/user-$(id -u)' + line: 'ENROOT_DATA_PATH {{enroot_top_path_checked}}/enroot_data/user-$(id -u)' backup: yes - name: update ENROOT_CACHE_PATH @@ -58,7 +67,7 @@ dest: /etc/enroot/enroot.conf state: present regexp: '^#ENROOT_CACHE_PATH.*' - line: 'ENROOT_CACHE_PATH {{enroot_top_path}}/enroot_cache' + line: 'ENROOT_CACHE_PATH {{enroot_top_path_checked}}/enroot_cache' backup: yes - name: update ENROOT_TEMP_PATH @@ -66,7 +75,7 @@ dest: /etc/enroot/enroot.conf state: present regexp: '^#ENROOT_TEMP_PATH.*' - line: 'ENROOT_TEMP_PATH {{enroot_top_path}}/enroot_tmp' + line: 'ENROOT_TEMP_PATH {{enroot_top_path_checked}}/enroot_tmp' backup: yes - name: update ENROOT_SQUASH_OPTIONS @@ -86,10 +95,10 @@ backup: yes - - name: set permissions on {{enroot_top_path}} + - name: set permissions on {{enroot_top_path_checked}} become: true file: - path: "{{enroot_top_path}}" + path: "{{enroot_top_path_checked}}" state: directory owner: opc mode: 0777 @@ -97,9 +106,9 @@ recurse: no - - name: Make sure all {{enroot_top_path}} directories exist + - name: Make sure all {{enroot_top_path_checked}} directories exist file: - path: "{{enroot_top_path}}/{{item}}" + path: "{{enroot_top_path_checked}}/{{item}}" state: directory mode: '0775' owner: opc diff --git a/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml b/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml index c527a9f6..e1a94e10 100644 --- a/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml +++ b/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml @@ -32,13 +32,22 @@ - name: execute enroot-check_*.run command: bash -c "/tmp/enroot-check_*.run --verify" + - name: + set_fact: + - enroot_top_path_checked: "/etc/enroot/" + when: " not 'nvme0n1' in hostvars[inventory_hostname].ansible_devices" + + - name: + set_fact: + - enroot_top_path_checked: "{{enroot_top_path}}" + when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" - name: update ENROOT_RUNTIME_PATH lineinfile: dest: /etc/enroot/enroot.conf state: present regexp: '^#ENROOT_RUNTIME_PATH.*' - line: 'ENROOT_RUNTIME_PATH {{enroot_top_path}}/enroot_runtime/user-$(id -u)' + line: 
'ENROOT_RUNTIME_PATH {{enroot_top_path_checked}}/enroot_runtime/user-$(id -u)' backup: yes - name: update ENROOT_DATA_PATH @@ -46,7 +55,7 @@ dest: /etc/enroot/enroot.conf state: present regexp: '^#ENROOT_DATA_PATH.*' - line: 'ENROOT_DATA_PATH {{enroot_top_path}}/enroot_data/user-$(id -u)' + line: 'ENROOT_DATA_PATH {{enroot_top_path_checked}}/enroot_data/user-$(id -u)' backup: yes - name: update ENROOT_CACHE_PATH @@ -54,7 +63,7 @@ dest: /etc/enroot/enroot.conf state: present regexp: '^#ENROOT_CACHE_PATH.*' - line: 'ENROOT_CACHE_PATH {{enroot_top_path}}/enroot_cache' + line: 'ENROOT_CACHE_PATH {{enroot_top_path_checked}}/enroot_cache' backup: yes - name: update ENROOT_TEMP_PATH @@ -62,7 +71,7 @@ dest: /etc/enroot/enroot.conf state: present regexp: '^#ENROOT_TEMP_PATH.*' - line: 'ENROOT_TEMP_PATH {{enroot_top_path}}/enroot_tmp' + line: 'ENROOT_TEMP_PATH {{enroot_top_path_checked}}/enroot_tmp' backup: yes - name: update ENROOT_SQUASH_OPTIONS @@ -82,10 +91,10 @@ backup: yes - - name: set permissions on {{enroot_top_path}} + - name: set permissions on {{enroot_top_path_checked}} become: true file: - path: "{{enroot_top_path}}" + path: "{{enroot_top_path_checked}}" state: directory owner: "{{ ansible_user }}" mode: 0777 @@ -93,9 +102,9 @@ recurse: no - - name: Make sure all {{enroot_top_path}} directories exist + - name: Make sure all {{enroot_top_path_checked}} directories exist file: - path: "{{enroot_top_path}}/{{item}}" + path: "{{enroot_top_path_checked}}/{{item}}" state: directory mode: '0775' owner: "{{ ansible_user }}" diff --git a/playbooks/roles/slurm/tasks/common_pyxis.yml b/playbooks/roles/slurm/tasks/common_pyxis.yml index 3d17f750..e3881da3 100644 --- a/playbooks/roles/slurm/tasks/common_pyxis.yml +++ b/playbooks/roles/slurm/tasks/common_pyxis.yml @@ -1,4 +1,15 @@ --- + +- name: + set_fact: + - enroot_top_path_checked: "/etc/enroot/" + when: " not 'nvme0n1' in hostvars[inventory_hostname].ansible_devices" + +- name: + set_fact: + - enroot_top_path_checked: "{{enroot_top_path}}" + when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" + - name: copy files become: true become_method: sudo diff --git a/playbooks/roles/slurm/templates/prolog.sh.j2 b/playbooks/roles/slurm/templates/prolog.sh.j2 index 0c799685..25f23573 100644 --- a/playbooks/roles/slurm/templates/prolog.sh.j2 +++ b/playbooks/roles/slurm/templates/prolog.sh.j2 @@ -1,5 +1,5 @@ #!/bin/sh -runtime_path="$(sudo -u "$SLURM_JOB_USER" sh -c 'echo "{{enroot_top_path}}/enroot_runtime/user-$(id -u)"')" +runtime_path="$(sudo -u "$SLURM_JOB_USER" sh -c 'echo "{{enroot_top_path_checked}}/enroot_runtime/user-$(id -u)"')" mkdir -p "$runtime_path" chown "$SLURM_JOB_USER:$(id -g "$SLURM_JOB_USER")" "$runtime_path" #chmod 777 -R /tmp From 502a3c89ab8f853e017640b885c9a3cc84ff0959 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 13 Mar 2023 13:38:37 -0600 Subject: [PATCH 115/133] Only allow Pyxis if Enroot is selected. --- schema.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/schema.yaml b/schema.yaml index ed05331d..f199a686 100755 --- a/schema.yaml +++ b/schema.yaml @@ -159,11 +159,11 @@ variableGroups: - ${slurm_nfs} - ${slurm_ha} - ${rack_aware} - - ${pyxis} - ${queue} - ${spack} - ${monitoring} - ${enroot} + - ${pyxis} - ${pam} - ${sacct_limits} @@ -992,6 +992,8 @@ variables: title: "Install Nvidia Pyxis plugin for Slurm" default: false description: "Install Pyxis. Pyxis is a plugin that integrates Enroot with Slurm." 
+ visible: + - ${enroot} rack_aware: type: boolean From a71a30e4dc305e55e9e88a54bc0e5b7f29727840 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Tue, 14 Mar 2023 11:39:50 -0700 Subject: [PATCH 116/133] corrected to slurm_backup --- playbooks/resize_remove_unreachable.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/resize_remove_unreachable.yml b/playbooks/resize_remove_unreachable.yml index 835bd18d..4a5b95e7 100644 --- a/playbooks/resize_remove_unreachable.yml +++ b/playbooks/resize_remove_unreachable.yml @@ -1,4 +1,4 @@ -- hosts: bastion, compute, backup, login +- hosts: bastion, compute, slurm_backup, login become: true gather_facts: true vars: From 679b26764717bcfada46af048d2203ea328f79ad Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Wed, 15 Mar 2023 09:26:41 -0700 Subject: [PATCH 117/133] topology role created. updated to have topology.conf in sync with bastion on backup --- playbooks/destroy.yml | 8 +++- playbooks/new_nodes.yml | 5 +++ playbooks/resize_add.yml | 7 ++- playbooks/resize_remove.yml | 7 ++- playbooks/resize_remove_unreachable.yml | 7 ++- playbooks/roles/slurm/tasks/el7.yml | 12 ++--- playbooks/roles/slurm/tasks/el8.yml | 12 ++--- playbooks/roles/slurm/tasks/ubuntu.yml | 8 ++-- playbooks/roles/topology/.travis.yml | 29 ++++++++++++ playbooks/roles/topology/README.md | 38 ++++++++++++++++ playbooks/roles/topology/defaults/main.yml | 3 ++ playbooks/roles/topology/handlers/main.yml | 2 + playbooks/roles/topology/meta/main.yml | 52 ++++++++++++++++++++++ playbooks/roles/topology/tasks/common.yml | 12 +++++ playbooks/roles/topology/tasks/main.yml | 3 ++ playbooks/roles/topology/tests/inventory | 2 + playbooks/roles/topology/tests/test.yml | 5 +++ playbooks/roles/topology/vars/main.yml | 2 + playbooks/slurm_config_as.yml | 6 ++- 19 files changed, 198 insertions(+), 22 deletions(-) create mode 100644 playbooks/roles/topology/.travis.yml create mode 100644 playbooks/roles/topology/README.md create mode 100644 playbooks/roles/topology/defaults/main.yml create mode 100644 playbooks/roles/topology/handlers/main.yml create mode 100644 playbooks/roles/topology/meta/main.yml create mode 100644 playbooks/roles/topology/tasks/common.yml create mode 100644 playbooks/roles/topology/tasks/main.yml create mode 100644 playbooks/roles/topology/tests/inventory create mode 100644 playbooks/roles/topology/tests/test.yml create mode 100644 playbooks/roles/topology/vars/main.yml diff --git a/playbooks/destroy.yml b/playbooks/destroy.yml index 2b5ba8cf..30c9843f 100755 --- a/playbooks/destroy.yml +++ b/playbooks/destroy.yml @@ -9,10 +9,14 @@ - include_role: name: slurm when: slurm|default(false)|bool -- hosts: bastion +- hosts: bastion, slurm_backup, login become: true vars: destroy: true initial: false roles: - - etc-hosts \ No newline at end of file + - etc-hosts +- hosts: slurm_backup + become: true + roles: + - topology \ No newline at end of file diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index 8c24a80a..8879928d 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -194,3 +194,8 @@ - include_role: name: telegraf when: monitoring|default(false)|bool + +- hosts: slurm_backup + become: true + roles: + - topology \ No newline at end of file diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 11ed903e..4d0347fe 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -195,4 +195,9 @@ when: slurm|default(false)|bool - include_role: name: telegraf - when: monitoring|default(false)|bool \ No 
newline at end of file + when: monitoring|default(false)|bool + +- hosts: slurm_backup + become: true + roles: + - topology \ No newline at end of file diff --git a/playbooks/resize_remove.yml b/playbooks/resize_remove.yml index 50a694e9..44638478 100755 --- a/playbooks/resize_remove.yml +++ b/playbooks/resize_remove.yml @@ -19,4 +19,9 @@ tasks: - include_role: name: slurm - when: slurm|default(false)|bool \ No newline at end of file + when: slurm|default(false)|bool + +- hosts: slurm_backup + become: true + roles: + - topology \ No newline at end of file diff --git a/playbooks/resize_remove_unreachable.yml b/playbooks/resize_remove_unreachable.yml index 4a5b95e7..c392b668 100644 --- a/playbooks/resize_remove_unreachable.yml +++ b/playbooks/resize_remove_unreachable.yml @@ -9,4 +9,9 @@ - "/opt/oci-hpc/conf/queues.conf" tasks: - include_role: - name: destroy_unreachable \ No newline at end of file + name: destroy_unreachable + +- hosts: slurm_backup + become: true + roles: + - topology \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/el7.yml b/playbooks/roles/slurm/tasks/el7.yml index cd49e199..117771f7 100755 --- a/playbooks/roles/slurm/tasks/el7.yml +++ b/playbooks/roles/slurm/tasks/el7.yml @@ -19,12 +19,6 @@ include_tasks: server.yml when: ('bastion' in group_names) and (not destroy|bool) and (initial| bool) -- name: run backup server directives - vars: - slurm_repos: "epel,ol7_developer_EPEL" - include_tasks: backup_server.yml - when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool) - - name: run compute directives vars: slurm_repos: "epel,ol7_developer_EPEL" @@ -37,6 +31,12 @@ include_tasks: login.yml when: ('login' in group_names) and (not destroy|bool) and (initial| bool) +- name: run backup server directives + vars: + slurm_repos: "epel,ol7_developer_EPEL" + include_tasks: backup_server.yml + when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool) + - name: cleanup include_tasks: cleanup.yml when: ('compute' in group_names) and (not destroy|bool) diff --git a/playbooks/roles/slurm/tasks/el8.yml b/playbooks/roles/slurm/tasks/el8.yml index 5ce029b1..c1cc4c8b 100755 --- a/playbooks/roles/slurm/tasks/el8.yml +++ b/playbooks/roles/slurm/tasks/el8.yml @@ -19,12 +19,6 @@ include_tasks: server.yml when: ('bastion' in group_names) and (not destroy|bool) and (initial| bool) -- name: run backup server directives - vars: - slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" - include_tasks: backup_server.yml - when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool) - - name: run compute directives vars: slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" @@ -37,6 +31,12 @@ include_tasks: login.yml when: ('login' in group_names) and (not destroy|bool) and (initial| bool) +- name: run backup server directives + vars: + slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" + include_tasks: backup_server.yml + when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool) + - name: cleanup include_tasks: cleanup.yml when: ('compute' in group_names) and (not destroy|bool) diff --git a/playbooks/roles/slurm/tasks/ubuntu.yml b/playbooks/roles/slurm/tasks/ubuntu.yml index b54ae961..c65729f7 100644 --- a/playbooks/roles/slurm/tasks/ubuntu.yml +++ b/playbooks/roles/slurm/tasks/ubuntu.yml @@ -6,10 +6,6 @@ include_tasks: server.yml when: ('bastion' in group_names) and (not destroy|bool) and (initial| bool) -- name: run backup server directives - include_tasks: backup_server.yml - 
when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool) - - name: run compute directives include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" when: ('compute' in group_names) and (not destroy|bool) @@ -17,6 +13,10 @@ - name: run login server directives include_tasks: login.yml when: ('login' in group_names) and (not destroy|bool) and (initial| bool) + +- name: run backup server directives + include_tasks: backup_server.yml + when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool) - name: cleanup include_tasks: cleanup.yml diff --git a/playbooks/roles/topology/.travis.yml b/playbooks/roles/topology/.travis.yml new file mode 100644 index 00000000..36bbf620 --- /dev/null +++ b/playbooks/roles/topology/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/playbooks/roles/topology/README.md b/playbooks/roles/topology/README.md new file mode 100644 index 00000000..225dd44b --- /dev/null +++ b/playbooks/roles/topology/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. + +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. + +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). 
diff --git a/playbooks/roles/topology/defaults/main.yml b/playbooks/roles/topology/defaults/main.yml new file mode 100644 index 00000000..6a87fb15 --- /dev/null +++ b/playbooks/roles/topology/defaults/main.yml @@ -0,0 +1,3 @@ +--- +# defaults file for topology +slurm_conf_path: '/etc/slurm' \ No newline at end of file diff --git a/playbooks/roles/topology/handlers/main.yml b/playbooks/roles/topology/handlers/main.yml new file mode 100644 index 00000000..1c4f290d --- /dev/null +++ b/playbooks/roles/topology/handlers/main.yml @@ -0,0 +1,2 @@ +--- +# handlers file for topology diff --git a/playbooks/roles/topology/meta/main.yml b/playbooks/roles/topology/meta/main.yml new file mode 100644 index 00000000..c572acc9 --- /dev/null +++ b/playbooks/roles/topology/meta/main.yml @@ -0,0 +1,52 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.1 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. + # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. 
diff --git a/playbooks/roles/topology/tasks/common.yml b/playbooks/roles/topology/tasks/common.yml new file mode 100644 index 00000000..8e09347d --- /dev/null +++ b/playbooks/roles/topology/tasks/common.yml @@ -0,0 +1,12 @@ +--- + +- name: move topology.conf on backup servers + become: true + copy: + dest: '{{ slurm_conf_path }}/topology.conf' + src: '{{ slurm_conf_path }}/topology.conf' + force: yes + register: topology_copied + until: topology_copied is not failed + retries: 10 + delay: 5 \ No newline at end of file diff --git a/playbooks/roles/topology/tasks/main.yml b/playbooks/roles/topology/tasks/main.yml new file mode 100644 index 00000000..4c856795 --- /dev/null +++ b/playbooks/roles/topology/tasks/main.yml @@ -0,0 +1,3 @@ +--- +# tasks file for topology +- include: common.yml \ No newline at end of file diff --git a/playbooks/roles/topology/tests/inventory b/playbooks/roles/topology/tests/inventory new file mode 100644 index 00000000..878877b0 --- /dev/null +++ b/playbooks/roles/topology/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/playbooks/roles/topology/tests/test.yml b/playbooks/roles/topology/tests/test.yml new file mode 100644 index 00000000..56f21fdc --- /dev/null +++ b/playbooks/roles/topology/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - topology diff --git a/playbooks/roles/topology/vars/main.yml b/playbooks/roles/topology/vars/main.yml new file mode 100644 index 00000000..8382cde7 --- /dev/null +++ b/playbooks/roles/topology/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for topology diff --git a/playbooks/slurm_config_as.yml b/playbooks/slurm_config_as.yml index 4ccebf99..d3e4e3c2 100755 --- a/playbooks/slurm_config_as.yml +++ b/playbooks/slurm_config_as.yml @@ -15,4 +15,8 @@ tasks: - include_role: name: slurm - when: slurm|default(false)|bool \ No newline at end of file + when: slurm|default(false)|bool +- hosts: slurm_backup + become: true + roles: + - topology \ No newline at end of file From e28a6a186169ff0e5910ae1dbe06301b17482351 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 15 Mar 2023 10:48:14 -0600 Subject: [PATCH 118/133] Fix error when removing 1 node of singlerack clust --- playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml b/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml index 822745e8..96aa2ffa 100644 --- a/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml +++ b/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml @@ -148,7 +148,7 @@ state: absent with_items: "{{unreachable_slurm_nodes}}" ignore_unreachable: yes - when: ( not upperswitchnames[item] is match("inactive-.*") ) and ( ( racks_on_switch_dict[item] | difference(switchnames[item]) | length ) == 0 ) and ( upperswitchnames[item] | length ) > 1 + when: ( not upperswitchnames[item] is match("inactive-.*") ) and ( ( racks_on_switch_dict[item] | difference(switchnames[item]) | length ) == 0 ) and ( upperswitchnames[item] | length ) > 1 and ( nodes_on_switch[item] | length ) < 2 run_once: true delegate_to: 127.0.0.1 From d5f646727221e804d1f93350a33956e05ef69b6d Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Wed, 15 Mar 2023 10:18:17 -0700 Subject: [PATCH 119/133] removed auto generated text --- playbooks/roles/topology/.travis.yml | 30 +-------------- playbooks/roles/topology/README.md | 38 ------------------ playbooks/roles/topology/meta/main.yml | 53 
+------------------------- 3 files changed, 2 insertions(+), 119 deletions(-) diff --git a/playbooks/roles/topology/.travis.yml b/playbooks/roles/topology/.travis.yml index 36bbf620..73b314ff 100644 --- a/playbooks/roles/topology/.travis.yml +++ b/playbooks/roles/topology/.travis.yml @@ -1,29 +1 @@ ---- -language: python -python: "2.7" - -# Use the new container infrastructure -sudo: false - -# Install ansible -addons: - apt: - packages: - - python-pip - -install: - # Install ansible - - pip install ansible - - # Check ansible version - - ansible --version - - # Create ansible.cfg with correct roles_path - - printf '[defaults]\nroles_path=../' >ansible.cfg - -script: - # Basic role syntax check - - ansible-playbook tests/test.yml -i tests/inventory --syntax-check - -notifications: - webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file +--- \ No newline at end of file diff --git a/playbooks/roles/topology/README.md b/playbooks/roles/topology/README.md index 225dd44b..e69de29b 100644 --- a/playbooks/roles/topology/README.md +++ b/playbooks/roles/topology/README.md @@ -1,38 +0,0 @@ -Role Name -========= - -A brief description of the role goes here. - -Requirements ------------- - -Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. - -Role Variables --------------- - -A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. - -Dependencies ------------- - -A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. - -Example Playbook ----------------- - -Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: - - - hosts: servers - roles: - - { role: username.rolename, x: 42 } - -License -------- - -BSD - -Author Information ------------------- - -An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/playbooks/roles/topology/meta/main.yml b/playbooks/roles/topology/meta/main.yml index c572acc9..73b314ff 100644 --- a/playbooks/roles/topology/meta/main.yml +++ b/playbooks/roles/topology/meta/main.yml @@ -1,52 +1 @@ -galaxy_info: - author: your name - description: your role description - company: your company (optional) - - # If the issue tracker for your role is not on github, uncomment the - # next line and provide a value - # issue_tracker_url: http://example.com/issue/tracker - - # Choose a valid license ID from https://spdx.org - some suggested licenses: - # - BSD-3-Clause (default) - # - MIT - # - GPL-2.0-or-later - # - GPL-3.0-only - # - Apache-2.0 - # - CC-BY-4.0 - license: license (GPL-2.0-or-later, MIT, etc) - - min_ansible_version: 2.1 - - # If this a Container Enabled role, provide the minimum Ansible Container version. - # min_ansible_container_version: - - # - # Provide a list of supported platforms, and for each platform a list of versions. - # If you don't wish to enumerate all versions for a particular platform, use 'all'. 
- # To view available platforms and versions (or releases), visit: - # https://galaxy.ansible.com/api/v1/platforms/ - # - # platforms: - # - name: Fedora - # versions: - # - all - # - 25 - # - name: SomePlatform - # versions: - # - all - # - 1.0 - # - 7 - # - 99.99 - - galaxy_tags: [] - # List tags for your role here, one per line. A tag is a keyword that describes - # and categorizes the role. Users find roles by searching for tags. Be sure to - # remove the '[]' above, if you add tags to this list. - # - # NOTE: A tag is limited to a single word comprised of alphanumeric characters. - # Maximum 20 tags per role. - -dependencies: [] - # List your role dependencies here, one per line. Be sure to remove the '[]' above, - # if you add dependencies to this list. +--- \ No newline at end of file From 0194888d14159b1b75dc441f19e5feb1244abd88 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 17 Mar 2023 15:26:43 -0700 Subject: [PATCH 120/133] removed topology role. added moving of topology.conf to backup from slurm role --- playbooks/destroy.yml | 8 ++------ playbooks/new_nodes.yml | 9 ++------- playbooks/resize_add.yml | 7 +------ playbooks/resize_remove.yml | 9 ++------- playbooks/resize_remove_unreachable.yml | 7 +------ .../destroy_unreachable/tasks/slurm-rack-aware.yml | 12 ++++++++++++ playbooks/roles/destroy_unreachable/tasks/slurm.yml | 12 ++++++++++++ playbooks/roles/slurm/tasks/el7.yml | 4 ++++ playbooks/roles/slurm/tasks/el8.yml | 4 ++++ .../common.yml => slurm/tasks/move-topology.yml} | 0 playbooks/roles/slurm/tasks/ubuntu.yml | 4 ++++ playbooks/roles/topology/.travis.yml | 1 - playbooks/roles/topology/README.md | 0 playbooks/roles/topology/defaults/main.yml | 3 --- playbooks/roles/topology/handlers/main.yml | 2 -- playbooks/roles/topology/meta/main.yml | 1 - playbooks/roles/topology/tasks/main.yml | 3 --- playbooks/roles/topology/tests/inventory | 2 -- playbooks/roles/topology/tests/test.yml | 5 ----- playbooks/roles/topology/vars/main.yml | 2 -- playbooks/slurm_config_as.yml | 8 ++------ 21 files changed, 46 insertions(+), 57 deletions(-) rename playbooks/roles/{topology/tasks/common.yml => slurm/tasks/move-topology.yml} (100%) delete mode 100644 playbooks/roles/topology/.travis.yml delete mode 100644 playbooks/roles/topology/README.md delete mode 100644 playbooks/roles/topology/defaults/main.yml delete mode 100644 playbooks/roles/topology/handlers/main.yml delete mode 100644 playbooks/roles/topology/meta/main.yml delete mode 100644 playbooks/roles/topology/tasks/main.yml delete mode 100644 playbooks/roles/topology/tests/inventory delete mode 100644 playbooks/roles/topology/tests/test.yml delete mode 100644 playbooks/roles/topology/vars/main.yml diff --git a/playbooks/destroy.yml b/playbooks/destroy.yml index 30c9843f..520b756d 100755 --- a/playbooks/destroy.yml +++ b/playbooks/destroy.yml @@ -1,4 +1,4 @@ -- hosts: compute +- hosts: compute, slurm_backup become: true vars: destroy: true @@ -15,8 +15,4 @@ destroy: true initial: false roles: - - etc-hosts -- hosts: slurm_backup - become: true - roles: - - topology \ No newline at end of file + - etc-hosts \ No newline at end of file diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index 8879928d..c54b519f 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -173,7 +173,7 @@ name: latency_check when: cluster_network|bool and not 'GPU' in shape -- hosts: compute +- hosts: compute, slurm_backup vars: destroy: false initial: false @@ -193,9 +193,4 @@ when: slurm|default(false)|bool - include_role: 
name: telegraf - when: monitoring|default(false)|bool - -- hosts: slurm_backup - become: true - roles: - - topology \ No newline at end of file + when: monitoring|default(false)|bool \ No newline at end of file diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 4d0347fe..11ed903e 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -195,9 +195,4 @@ when: slurm|default(false)|bool - include_role: name: telegraf - when: monitoring|default(false)|bool - -- hosts: slurm_backup - become: true - roles: - - topology \ No newline at end of file + when: monitoring|default(false)|bool \ No newline at end of file diff --git a/playbooks/resize_remove.yml b/playbooks/resize_remove.yml index 44638478..c75ea9fc 100755 --- a/playbooks/resize_remove.yml +++ b/playbooks/resize_remove.yml @@ -9,7 +9,7 @@ - include_role: name: etc-hosts -- hosts: compute_to_destroy +- hosts: compute_to_destroy, slurm_backup become: true vars: destroy: true @@ -19,9 +19,4 @@ tasks: - include_role: name: slurm - when: slurm|default(false)|bool - -- hosts: slurm_backup - become: true - roles: - - topology \ No newline at end of file + when: slurm|default(false)|bool \ No newline at end of file diff --git a/playbooks/resize_remove_unreachable.yml b/playbooks/resize_remove_unreachable.yml index c392b668..4a5b95e7 100644 --- a/playbooks/resize_remove_unreachable.yml +++ b/playbooks/resize_remove_unreachable.yml @@ -9,9 +9,4 @@ - "/opt/oci-hpc/conf/queues.conf" tasks: - include_role: - name: destroy_unreachable - -- hosts: slurm_backup - become: true - roles: - - topology \ No newline at end of file + name: destroy_unreachable \ No newline at end of file diff --git a/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml b/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml index 96aa2ffa..4471c98c 100644 --- a/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml +++ b/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml @@ -255,6 +255,18 @@ delegate_to: 127.0.0.1 when: ('bastion' in group_names) +- name: move topology.conf on backup servers + become: true + copy: + dest: '{{ slurm_conf_path }}/topology.conf' + src: '{{ slurm_conf_path }}/topology.conf' + force: yes + register: topology_copied + until: topology_copied is not failed + retries: 10 + delay: 5 + when: ('slurm_backup' in group_names) + - name: Reconfigure Slurm for topology become: true command: "scontrol reconfigure" diff --git a/playbooks/roles/destroy_unreachable/tasks/slurm.yml b/playbooks/roles/destroy_unreachable/tasks/slurm.yml index ada27290..e06e77a3 100644 --- a/playbooks/roles/destroy_unreachable/tasks/slurm.yml +++ b/playbooks/roles/destroy_unreachable/tasks/slurm.yml @@ -145,6 +145,18 @@ delegate_to: 127.0.0.1 when: ('bastion' in group_names) +- name: move topology.conf on backup servers + become: true + copy: + dest: '{{ slurm_conf_path }}/topology.conf' + src: '{{ slurm_conf_path }}/topology.conf' + force: yes + register: topology_copied + until: topology_copied is not failed + retries: 10 + delay: 5 + when: ('slurm_backup' in group_names) + - name: Reconfigure Slurm for topology become: true command: "scontrol reconfigure" diff --git a/playbooks/roles/slurm/tasks/el7.yml b/playbooks/roles/slurm/tasks/el7.yml index 117771f7..b2be275f 100755 --- a/playbooks/roles/slurm/tasks/el7.yml +++ b/playbooks/roles/slurm/tasks/el7.yml @@ -44,3 +44,7 @@ - name: destroy include_tasks: destroy{{rack_aware_playbook_suffix}}.yml when: ('compute' in group_names or 'compute_to_destroy' in 
group_names) and (destroy|bool) + +- name: move topology.conf on backup slurm controller + include_tasks: move-topology.yml + when: ('slurm_backup' in group_names) and (not initial| bool) diff --git a/playbooks/roles/slurm/tasks/el8.yml b/playbooks/roles/slurm/tasks/el8.yml index c1cc4c8b..d4b2cbbb 100755 --- a/playbooks/roles/slurm/tasks/el8.yml +++ b/playbooks/roles/slurm/tasks/el8.yml @@ -44,3 +44,7 @@ - name: destroy include_tasks: destroy{{rack_aware_playbook_suffix}}.yml when: ('compute' in group_names or 'compute_to_destroy' in group_names) and (destroy|bool) + +- name: move topology.conf on backup slurm controller + include_tasks: move-topology.yml + when: ('slurm_backup' in group_names) and (not initial| bool) diff --git a/playbooks/roles/topology/tasks/common.yml b/playbooks/roles/slurm/tasks/move-topology.yml similarity index 100% rename from playbooks/roles/topology/tasks/common.yml rename to playbooks/roles/slurm/tasks/move-topology.yml diff --git a/playbooks/roles/slurm/tasks/ubuntu.yml b/playbooks/roles/slurm/tasks/ubuntu.yml index c65729f7..96a8843e 100644 --- a/playbooks/roles/slurm/tasks/ubuntu.yml +++ b/playbooks/roles/slurm/tasks/ubuntu.yml @@ -25,3 +25,7 @@ - name: destroy include_tasks: destroy{{rack_aware_playbook_suffix}}.yml when: ('compute' in group_names or 'compute_to_destroy' in group_names) and (destroy|bool) + +- name: move topology.conf on backup slurm controller + include_tasks: move-topology.yml + when: ('slurm_backup' in group_names) and (not initial| bool) diff --git a/playbooks/roles/topology/.travis.yml b/playbooks/roles/topology/.travis.yml deleted file mode 100644 index 73b314ff..00000000 --- a/playbooks/roles/topology/.travis.yml +++ /dev/null @@ -1 +0,0 @@ ---- \ No newline at end of file diff --git a/playbooks/roles/topology/README.md b/playbooks/roles/topology/README.md deleted file mode 100644 index e69de29b..00000000 diff --git a/playbooks/roles/topology/defaults/main.yml b/playbooks/roles/topology/defaults/main.yml deleted file mode 100644 index 6a87fb15..00000000 --- a/playbooks/roles/topology/defaults/main.yml +++ /dev/null @@ -1,3 +0,0 @@ ---- -# defaults file for topology -slurm_conf_path: '/etc/slurm' \ No newline at end of file diff --git a/playbooks/roles/topology/handlers/main.yml b/playbooks/roles/topology/handlers/main.yml deleted file mode 100644 index 1c4f290d..00000000 --- a/playbooks/roles/topology/handlers/main.yml +++ /dev/null @@ -1,2 +0,0 @@ ---- -# handlers file for topology diff --git a/playbooks/roles/topology/meta/main.yml b/playbooks/roles/topology/meta/main.yml deleted file mode 100644 index 73b314ff..00000000 --- a/playbooks/roles/topology/meta/main.yml +++ /dev/null @@ -1 +0,0 @@ ---- \ No newline at end of file diff --git a/playbooks/roles/topology/tasks/main.yml b/playbooks/roles/topology/tasks/main.yml deleted file mode 100644 index 4c856795..00000000 --- a/playbooks/roles/topology/tasks/main.yml +++ /dev/null @@ -1,3 +0,0 @@ ---- -# tasks file for topology -- include: common.yml \ No newline at end of file diff --git a/playbooks/roles/topology/tests/inventory b/playbooks/roles/topology/tests/inventory deleted file mode 100644 index 878877b0..00000000 --- a/playbooks/roles/topology/tests/inventory +++ /dev/null @@ -1,2 +0,0 @@ -localhost - diff --git a/playbooks/roles/topology/tests/test.yml b/playbooks/roles/topology/tests/test.yml deleted file mode 100644 index 56f21fdc..00000000 --- a/playbooks/roles/topology/tests/test.yml +++ /dev/null @@ -1,5 +0,0 @@ ---- -- hosts: localhost - remote_user: root - roles: - - 
topology diff --git a/playbooks/roles/topology/vars/main.yml b/playbooks/roles/topology/vars/main.yml deleted file mode 100644 index 8382cde7..00000000 --- a/playbooks/roles/topology/vars/main.yml +++ /dev/null @@ -1,2 +0,0 @@ ---- -# vars file for topology diff --git a/playbooks/slurm_config_as.yml b/playbooks/slurm_config_as.yml index d3e4e3c2..f92067be 100755 --- a/playbooks/slurm_config_as.yml +++ b/playbooks/slurm_config_as.yml @@ -3,7 +3,7 @@ tasks: - debug: msg: "Gathering facts" -- hosts: compute +- hosts: compute, slurm_backup gather_facts: true vars: destroy: false @@ -15,8 +15,4 @@ tasks: - include_role: name: slurm - when: slurm|default(false)|bool -- hosts: slurm_backup - become: true - roles: - - topology \ No newline at end of file + when: slurm|default(false)|bool \ No newline at end of file From 0102fbfbdf4c615a0d869d1dda45569b449dd73f Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 20 Mar 2023 11:37:01 -0600 Subject: [PATCH 121/133] Add a comment that nodes are space separated --- bin/resize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/resize.py b/bin/resize.py index 50401061..acc7b43a 100644 --- a/bin/resize.py +++ b/bin/resize.py @@ -518,7 +518,7 @@ def updateTFState(inventory,cluster_name,size): parser.add_argument('--cluster_name', help='Name of the cluster to resize. Defaults to the name included in the bastion') parser.add_argument('mode', help='Mode type. add/remove node options, implicitly configures newly added nodes. Also implicitly reconfigure/restart services like Slurm to recognize new nodes. Similarly for remove option, terminates nodes and implicitly reconfigure/restart services like Slurm on rest of the cluster nodes to remove reference to deleted nodes.',choices=['add','remove','remove_unreachable','list','reconfigure'],default='list',nargs='?') parser.add_argument('number', type=int, help="Number of nodes to add or delete if a list of hostnames is not defined",nargs='?') -parser.add_argument('--nodes', help="List of nodes to delete",nargs='+') +parser.add_argument('--nodes', help="List of nodes to delete (Space Separated)",nargs='+') parser.add_argument('--no_reconfigure', help='If present. Does not rerun the playbooks',action='store_true',default=False) parser.add_argument('--user_logging', help='If present. Use the default settings in ~/.oci/config to connect to the API. Default is using instance_principal',action='store_true',default=False) parser.add_argument('--force', help='If present. 
Nodes will be removed even if the destroy playbook failed',action='store_true',default=False) From 38c64984d7eaa8e277de10603abf001d542961b5 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 20 Mar 2023 11:37:43 -0600 Subject: [PATCH 122/133] Update tf provider to 4.112 --- autoscaling/tf_init/versions.tf | 2 +- versions.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/autoscaling/tf_init/versions.tf b/autoscaling/tf_init/versions.tf index c5c30733..5a50f491 100755 --- a/autoscaling/tf_init/versions.tf +++ b/autoscaling/tf_init/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "4.110.0" + version = "4.2.0" } } } \ No newline at end of file diff --git a/versions.tf b/versions.tf index c5c30733..458fd9db 100755 --- a/versions.tf +++ b/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "4.110.0" + version = "4.112.0" } } } \ No newline at end of file From acdbec0fd42a840ca88a6194efba3530fb0f7d97 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 20 Mar 2023 15:55:33 -0600 Subject: [PATCH 123/133] Fix enroot tasks --- autoscaling/tf_init/versions.tf | 2 +- playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml | 4 ++-- playbooks/roles/nvidia-enroot/tasks/ubuntu.yml | 4 ++-- versions.tf | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/autoscaling/tf_init/versions.tf b/autoscaling/tf_init/versions.tf index c5c30733..5a50f491 100755 --- a/autoscaling/tf_init/versions.tf +++ b/autoscaling/tf_init/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "4.110.0" + version = "4.2.0" } } } \ No newline at end of file diff --git a/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml b/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml index 55367740..41e0e56b 100644 --- a/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml +++ b/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml @@ -38,12 +38,12 @@ - name: set_fact: - - enroot_top_path_checked: "/etc/enroot/" + enroot_top_path_checked: "/etc/enroot/" when: " not 'nvme0n1' in hostvars[inventory_hostname].ansible_devices" - name: set_fact: - - enroot_top_path_checked: "{{enroot_top_path}}" + enroot_top_path_checked: "{{enroot_top_path}}" when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" - name: update ENROOT_RUNTIME_PATH diff --git a/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml b/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml index e1a94e10..cdbcaa00 100644 --- a/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml +++ b/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml @@ -34,12 +34,12 @@ - name: set_fact: - - enroot_top_path_checked: "/etc/enroot/" + enroot_top_path_checked: "/etc/enroot/" when: " not 'nvme0n1' in hostvars[inventory_hostname].ansible_devices" - name: set_fact: - - enroot_top_path_checked: "{{enroot_top_path}}" + enroot_top_path_checked: "{{enroot_top_path}}" when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" - name: update ENROOT_RUNTIME_PATH diff --git a/versions.tf b/versions.tf index c5c30733..458fd9db 100755 --- a/versions.tf +++ b/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "4.110.0" + version = "4.112.0" } } } \ No newline at end of file From 1d60bde8f5ae55404f556ebae1f8730c4fb7b003 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 20 Mar 2023 15:58:12 -0600 Subject: [PATCH 124/133] Fix set_fact error --- 
playbooks/roles/slurm/tasks/common_pyxis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/slurm/tasks/common_pyxis.yml b/playbooks/roles/slurm/tasks/common_pyxis.yml index e3881da3..596b1286 100644 --- a/playbooks/roles/slurm/tasks/common_pyxis.yml +++ b/playbooks/roles/slurm/tasks/common_pyxis.yml @@ -2,12 +2,12 @@ - name: set_fact: - - enroot_top_path_checked: "/etc/enroot/" + enroot_top_path_checked: "/etc/enroot/" when: " not 'nvme0n1' in hostvars[inventory_hostname].ansible_devices" - name: set_fact: - - enroot_top_path_checked: "{{enroot_top_path}}" + enroot_top_path_checked: "{{enroot_top_path}}" when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" - name: copy files From 662491e4fc5a8397c1f926fcfa469b8b8a590a2a Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 20 Mar 2023 16:07:19 -0600 Subject: [PATCH 125/133] Fix the Grafana version to avoid errors --- playbooks/roles/grafana/tasks/el.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/grafana/tasks/el.yml b/playbooks/roles/grafana/tasks/el.yml index 7f38fe90..7172bf96 100755 --- a/playbooks/roles/grafana/tasks/el.yml +++ b/playbooks/roles/grafana/tasks/el.yml @@ -15,8 +15,8 @@ - name: install grafana vars: package_name: - - grafana - package_state: latest + - grafana-8.5.21-1 + package_state: present include_role: name: safe_yum From b195d49ee13289026a252e33dec9bbde15b271b1 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 20 Mar 2023 16:56:59 -0600 Subject: [PATCH 126/133] Fix terraform version to 4.112 --- autoscaling/tf_init/versions.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoscaling/tf_init/versions.tf b/autoscaling/tf_init/versions.tf index 5a50f491..458fd9db 100755 --- a/autoscaling/tf_init/versions.tf +++ b/autoscaling/tf_init/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "4.2.0" + version = "4.112.0" } } } \ No newline at end of file From d26442c40e4514076beaad117f2fd5bfab930e7d Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Tue, 21 Mar 2023 12:47:04 -0700 Subject: [PATCH 127/133] node ordering without pssh as well --- .../rack-aware/files/node_ordering_by_rack.py | 62 ++++++++++++------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/playbooks/roles/rack-aware/files/node_ordering_by_rack.py b/playbooks/roles/rack-aware/files/node_ordering_by_rack.py index 6f2446ac..f874595f 100644 --- a/playbooks/roles/rack-aware/files/node_ordering_by_rack.py +++ b/playbooks/roles/rack-aware/files/node_ordering_by_rack.py @@ -1,11 +1,8 @@ #!/usr/bin/env python3 -from pssh.clients import ParallelSSHClient import json -import sys, getopt import os import argparse -from operator import itemgetter -from collections import OrderedDict +import subprocess def write_ordered_hostfile(ordered_hosts=[],hostfile=None): #ordered_hostfile="ordered_hostfile" @@ -43,28 +40,47 @@ def write_ordered_rankfile(ordered_hosts=[],hostfile=None): #with open('/etc/opt/oci-hpc/hostfile.tcp', 'r') as f: hosts = f.read().splitlines() -client = ParallelSSHClient(hosts) -output = client.run_command('curl http://169.254.169.254/opc/v1/host/') -#print(output) r = {} -for host_out in output: - j = json.loads(bytearray(''.join(list(host_out.stdout)).encode())) - #print(j) - if j['rackId'] in r: - r[j['rackId']].append( host_out.host ) - else: - r[j['rackId']] = [ host_out.host ] - - friendly_name_to_system_hostname = {} -hostname_output = 
client.run_command('/usr/bin/hostname') -#print(hostname_output) -for host_out in hostname_output: - #j = bytearray(''.join(list(host_out.stdout)).encode()) - j = bytearray(''.join(list(host_out.stdout)).encode()) - friendly_name_to_system_hostname[host_out.host] = j.decode(encoding='ascii') - #print(j.decode(encoding='ascii')+" "+host_out.host) +try: + from pssh.clients import ParallelSSHClient + client = ParallelSSHClient(hosts) + output = client.run_command('curl http://169.254.169.254/opc/v1/host/') + #print(output) + for host_out in output: + j = json.loads(bytearray(''.join(list(host_out.stdout)).encode())) + #print(j) + if j['rackId'] in r: + r[j['rackId']].append( host_out.host ) + else: + r[j['rackId']] = [ host_out.host ] + hostname_output = client.run_command('/usr/bin/hostname') + #print(hostname_output) + for host_out in hostname_output: + #j = bytearray(''.join(list(host_out.stdout)).encode()) + j = bytearray(''.join(list(host_out.stdout)).encode()) + friendly_name_to_system_hostname[host_out.host] = j.decode(encoding='ascii') + #print(j.decode(encoding='ascii')+" "+host_out.host) +except ImportError: + try: + for h in hosts: + out = subprocess.run(["ssh "+h+" \"curl -s http://169.254.169.254/opc/v1/host/\""],stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, universal_newlines=True, check=True) + x = out.stdout.splitlines() + del x[-1] + del x[0] + rackId_str = x[1].split(":")[1].replace('"','') + rackId = rackId_str.replace(' ','') + if rackId in r: + r[rackId].append( h ) + else: + r[rackId] = [ h ] + for h in hosts: + out = subprocess.run(["ssh "+h+" /usr/bin/hostname"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, universal_newlines=True, check=True) + x = out.stdout.splitlines() + friendly_name_to_system_hostname[h] = x[0] + except subprocess.CalledProcessError as e_process_error: + exit(f"Error code: {e_process_error.returncode} Output: {e_process_error.output}") ordered_hosts = [] From a9295aac0da1581c5fd7b7636a4800767d0e0726 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 22 Mar 2023 20:07:39 -0600 Subject: [PATCH 128/133] Set the Grafana version to 8.5.21 --- playbooks/roles/autoscaling_mon/tasks/el.yml | 4 ++-- playbooks/roles/autoscaling_mon/tasks/ubuntu.yml | 4 ++-- playbooks/roles/grafana/tasks/ubuntu.yml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/playbooks/roles/autoscaling_mon/tasks/el.yml b/playbooks/roles/autoscaling_mon/tasks/el.yml index 5de05e2d..c14ccd72 100755 --- a/playbooks/roles/autoscaling_mon/tasks/el.yml +++ b/playbooks/roles/autoscaling_mon/tasks/el.yml @@ -53,8 +53,8 @@ - name: install grafana vars: package_name: - - grafana - package_state: latest + - grafana-8.5.21-1 + package_state: present include_role: name: safe_yum diff --git a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml index 11db136f..4f46e0a8 100644 --- a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml +++ b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml @@ -65,8 +65,8 @@ - name: install grafana vars: package_name: - - grafana - package_state: latest + - grafana-8.5.21-1 + package_state: present include_role: name: safe_yum diff --git a/playbooks/roles/grafana/tasks/ubuntu.yml b/playbooks/roles/grafana/tasks/ubuntu.yml index f3e5fc2d..af9fa526 100644 --- a/playbooks/roles/grafana/tasks/ubuntu.yml +++ b/playbooks/roles/grafana/tasks/ubuntu.yml @@ -27,8 +27,8 @@ - name: install grafana vars: package_name: - - grafana - package_state: latest + - grafana-8.5.21-1 + 
package_state: present include_role: name: safe_yum From 0800e6ba712ac867dd313c1a4bb2a5ff8169b44f Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 22 Mar 2023 20:07:55 -0600 Subject: [PATCH 129/133] Fix small issue with destroy race condition --- .../roles/slurm/tasks/destroy-rack-aware.yml | 39 ++++++++++--------- playbooks/roles/slurm/tasks/destroy.yml | 3 +- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/playbooks/roles/slurm/tasks/destroy-rack-aware.yml b/playbooks/roles/slurm/tasks/destroy-rack-aware.yml index 6c53f912..fb4604d3 100755 --- a/playbooks/roles/slurm/tasks/destroy-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/destroy-rack-aware.yml @@ -57,6 +57,24 @@ set_fact: nodes_to_remove="{{nodes_to_remove_temp_results.results | map(attribute='ansible_facts.nodes_to_remove_temp') | list}}" run_once: true +- name: Get new inactive_nodes list + command: "scontrol show hostlistsorted {{inactive_list | union(nodes_to_remove) | join(',')}}" + register: new_inactive_list + run_once: true + delegate_to: 127.0.0.1 + +- name: Adding nodes to inactive + vars: + - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" + become: true + lineinfile: + path: "{{ slurm_conf_path }}/topology.conf" + regexp: "SwitchName=inactive-{{queue}}-{{keyword}}\\sNodes.*" + line: "SwitchName=inactive-{{queue}}-{{keyword}} Nodes={{new_inactive_list.stdout }}" + state: present + run_once: true + delegate_to: 127.0.0.1 + - name: Run the script to get the RackID shell: 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v1/host | jq .rackId' # shell: echo $RANDOM | md5sum | head -c 20 @@ -160,26 +178,9 @@ delegate_to: 127.0.0.1 when: racks_left_list | list | length > 0 -- name: Get new inactive_nodes list - command: "scontrol show hostlistsorted {{inactive_list | union(nodes_to_remove) | join(',')}}" - register: new_inactive_list - run_once: true - delegate_to: 127.0.0.1 - -- name: Adding nodes to inactive - vars: - - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" - become: true - lineinfile: - path: "{{ slurm_conf_path }}/topology.conf" - regexp: "SwitchName=inactive-{{queue}}-{{keyword}}\\sNodes.*" - line: "SwitchName=inactive-{{queue}}-{{keyword}} Nodes={{new_inactive_list.stdout }}" - state: present - run_once: true - delegate_to: 127.0.0.1 - - name: Reconfigure Slurm for topology become: true command: "scontrol reconfigure" delegate_to: 127.0.0.1 - run_once: true \ No newline at end of file + run_once: true + ignore_errors: yes \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/destroy.yml b/playbooks/roles/slurm/tasks/destroy.yml index c030419c..1a3906c2 100755 --- a/playbooks/roles/slurm/tasks/destroy.yml +++ b/playbooks/roles/slurm/tasks/destroy.yml @@ -120,4 +120,5 @@ become: true command: "scontrol reconfigure" delegate_to: 127.0.0.1 - run_once: true \ No newline at end of file + run_once: true + ignore_errors: true \ No newline at end of file From ad74fe414d246e0674efdfa5164cef7a17d06b83 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 22 Mar 2023 20:52:53 -0600 Subject: [PATCH 130/133] Avoid race condition during multiple deletions --- autoscaling/crontab/autoscale_slurm.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/autoscaling/crontab/autoscale_slurm.sh b/autoscaling/crontab/autoscale_slurm.sh index 0ccd87c3..7e2a0aa7 100755 --- a/autoscaling/crontab/autoscale_slurm.sh +++ b/autoscaling/crontab/autoscale_slurm.sh @@ -357,7 +357,7 @@ try: cluster_name=cluster[0] print ("Deleting cluster "+cluster_name) subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name]) - time.sleep(1) + time.sleep(5) for cluster_name in nodes_to_destroy.keys(): print ("Resizing cluster "+cluster_name) @@ -379,7 +379,6 @@ try: subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes) if len(unreachable_nodes) > 0: subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes) - time.sleep(1) for index,cluster in enumerate(cluster_to_build): From 51f65b51a92469c14f087e1b728dd1e31599d424 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 23 Mar 2023 15:25:42 -0600 Subject: [PATCH 131/133] Add a warning about Pyxis and autoscaling --- schema.yaml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/schema.yaml b/schema.yaml index f199a686..ffa538ec 100755 --- a/schema.yaml +++ b/schema.yaml @@ -984,15 +984,16 @@ variables: default: false required: true description: "Add a second master of the same shape as the bastion as a back-up controller node. We recommend using a FSS to save the state and share between masters" - visible: - - ${slurm} + visible: ${slurm} pyxis: type: boolean title: "Install Nvidia Pyxis plugin for Slurm" default: false - description: "Install Pyxis. Pyxis is a plugin that integrates Enroot with Slurm." + description: "Install Pyxis. Pyxis is a plugin that integrates Enroot with Slurm. 
(Warning: using Pyxis with autoscaling is causing an issue that prevents jobs from being scheduled on nodes to be spun up)" visible: + and: + - ${slurm} - ${enroot} rack_aware: @@ -1001,8 +1002,7 @@ variables: default: false required: true description: "Slurm topology can define rack aware topologies to prioritize nodes on same racks per job.\n This is a LA feature and your tenancy needs to be whitelisted" - visible: - - ${slurm} + visible: ${slurm} queue: @@ -1011,8 +1011,7 @@ variables: default: "compute" required: true description: "Add the permanent cluster to a specific queue, compute is the default queue" - visible: - - ${slurm} + visible: ${slurm} spack: type: boolean From 08d4cf66bfd9c3194f8db885a1b5099d984983a3 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 23 Mar 2023 15:55:55 -0600 Subject: [PATCH 132/133] Fix topology for slurm_backup and login --- playbooks/roles/slurm/tasks/destroy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/slurm/tasks/destroy.yml b/playbooks/roles/slurm/tasks/destroy.yml index 1a3906c2..406b3b5b 100755 --- a/playbooks/roles/slurm/tasks/destroy.yml +++ b/playbooks/roles/slurm/tasks/destroy.yml @@ -55,7 +55,7 @@ - name: Get hostnames set_fact: nodes_to_add_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['bastion']) }}" + with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) | difference(groups['login']) }}" run_once: true register: nodes_to_add_temp_results From 87caac86b9d806ba8a833247048fcb70ce1cec94 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Fri, 24 Mar 2023 16:17:59 -0600 Subject: [PATCH 133/133] Add Slurm condition in schema.yml --- schema.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/schema.yaml b/schema.yaml index ffa538ec..1b22dc25 100755 --- a/schema.yaml +++ b/schema.yaml @@ -1024,18 +1024,21 @@ variables: title: "Install Nvidia Enroot for containerized GPU workloads" default: false description: "Install Enroot, Nvidia Container Toolkit, and docker." + visible: ${slurm} pam: type: boolean title: "Enable PAM" default: false description: "Enable PAM for the Slurm cluster (Supported only on OL with RHCK kernel at this time). When PAM is enabled, users that are not in the sudo group will not be able to SSH into the compute nodes unless they have an active job running in Slurm." + visible: ${slurm} sacct_limits: type: boolean title: "Enable Limits for Slurm jobs" default: false description: "Enable Limits for the Slurm cluster When enabled, users will not be able to submit jobs of the right limits are not set" + visible: ${slurm} monitoring: type: boolean