From fc71b8ca1933257c3a91b3a2fe9acdfc9872c654 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Wed, 24 May 2023 16:50:33 -0700 Subject: [PATCH 01/20] update to latest oci-cn-auth version for new nodes --- playbooks/new_nodes.yml | 6 ++++ playbooks/resize_add.yml | 6 ++++ playbooks/roles/oci-cn-auth/defaults/main.yml | 2 ++ playbooks/roles/oci-cn-auth/tasks/common.yml | 34 +++++++++++++++++++ playbooks/roles/oci-cn-auth/tasks/main.yml | 1 + playbooks/roles/safe_yum/tasks/ubuntu.yml | 2 +- 6 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 playbooks/roles/oci-cn-auth/defaults/main.yml create mode 100644 playbooks/roles/oci-cn-auth/tasks/common.yml create mode 100644 playbooks/roles/oci-cn-auth/tasks/main.yml diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index b971b9d5..f55f04bf 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -42,6 +42,12 @@ name: localdisk when: localdisk | default(true) | bool +- hosts: compute + tasks: + - include_role: + name: oci-cn-auth + when: cluster_network|bool + - hosts: compute become: true gather_facts: true diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index c0288eb7..f0a99ad5 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -40,6 +40,12 @@ name: localdisk when: localdisk | default(true) | bool +- hosts: compute_to_add + tasks: + - include_role: + name: oci-cn-auth + when: cluster_network|bool + - hosts: compute_to_add become: true gather_facts: true diff --git a/playbooks/roles/oci-cn-auth/defaults/main.yml b/playbooks/roles/oci-cn-auth/defaults/main.yml new file mode 100644 index 00000000..49371d08 --- /dev/null +++ b/playbooks/roles/oci-cn-auth/defaults/main.yml @@ -0,0 +1,2 @@ +current_version: 2.1.4-compute +download_link: https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/F7gihhVuJbrnsV8KjAMA7XblkZYRBYJ2xAH2FPmaIJrgtYcuy5wJRWAQXMfw9hLD/n/hpc/b/source/o/ \ No newline at end of file diff --git 
a/playbooks/roles/oci-cn-auth/tasks/common.yml b/playbooks/roles/oci-cn-auth/tasks/common.yml new file mode 100644 index 00000000..02578f2e --- /dev/null +++ b/playbooks/roles/oci-cn-auth/tasks/common.yml @@ -0,0 +1,34 @@ +--- +- name: Check the oci-cn-auth version + shell: cat /opt/oci-hpc/oci-cn-auth/.version-oci_cn_auth + register: version + +- name: Download current oci-cn-auth .deb if it is not the current version + get_url: + url: "{{download_link}}oci-cn-auth_{{current_version}}_all.deb" + dest: "/tmp/" + when: ansible_os_family == 'Debian' and version.stdout != current_version + +- name: Install current oci-cn-auth .deb if it is not the current version + vars: + deb_name: + - "/tmp/oci-cn-auth_{{current_version}}_all.deb" + package_state: present + include_role: + name: safe_yum + when: ansible_os_family == 'Debian' and version.stdout != current_version + +- name: Download current oci-cn-auth .rpm if it is not the current version + get_url: + url: "{{download_link}}oci-cn-auth-{{current_version}}.el{{ansible_distribution_major_version}}.noarch.rpm" + dest: "/tmp/" + when: ansible_os_family == 'RedHat' and version.stdout != current_version + +- name: Install current oci-cn-auth .rpm if it is not the current version + vars: + package_name: + - "/tmp/oci-cn-auth-{{current_version}}.el{{ansible_distribution_major_version}}.noarch.rpm" + package_state: present + include_role: + name: safe_yum + when: ansible_os_family == 'RedHat' and version.stdout != current_version \ No newline at end of file diff --git a/playbooks/roles/oci-cn-auth/tasks/main.yml b/playbooks/roles/oci-cn-auth/tasks/main.yml new file mode 100644 index 00000000..d9f88ea9 --- /dev/null +++ b/playbooks/roles/oci-cn-auth/tasks/main.yml @@ -0,0 +1 @@ +- include: common.yml \ No newline at end of file diff --git a/playbooks/roles/safe_yum/tasks/ubuntu.yml b/playbooks/roles/safe_yum/tasks/ubuntu.yml index 6ad15a88..bb8d47b4 100755 --- a/playbooks/roles/safe_yum/tasks/ubuntu.yml +++ 
b/playbooks/roles/safe_yum/tasks/ubuntu.yml @@ -20,7 +20,7 @@ delay: 5 when: not deb_name is defined -- name: "Installing/Removing {{package_name}}" +- name: "Installing/Removing {{deb_name}}" become: true apt: deb: "{{item}}" From 5cf8de530b31051c30f31d9214b104b487f8df72 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Wed, 24 May 2023 17:58:47 -0700 Subject: [PATCH 02/20] only update oci-cn-auth if the current version is lower --- playbooks/roles/oci-cn-auth/defaults/main.yml | 2 +- playbooks/roles/oci-cn-auth/tasks/common.yml | 34 ------------------- playbooks/roles/oci-cn-auth/tasks/el.yml | 19 +++++++++++ playbooks/roles/oci-cn-auth/tasks/main.yml | 7 +++- playbooks/roles/oci-cn-auth/tasks/ubuntu.yml | 19 +++++++++++ 5 files changed, 45 insertions(+), 36 deletions(-) delete mode 100644 playbooks/roles/oci-cn-auth/tasks/common.yml create mode 100644 playbooks/roles/oci-cn-auth/tasks/el.yml create mode 100644 playbooks/roles/oci-cn-auth/tasks/ubuntu.yml diff --git a/playbooks/roles/oci-cn-auth/defaults/main.yml b/playbooks/roles/oci-cn-auth/defaults/main.yml index 49371d08..3fcb32ac 100644 --- a/playbooks/roles/oci-cn-auth/defaults/main.yml +++ b/playbooks/roles/oci-cn-auth/defaults/main.yml @@ -1,2 +1,2 @@ -current_version: 2.1.4-compute +version: 2.1.4 download_link: https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/F7gihhVuJbrnsV8KjAMA7XblkZYRBYJ2xAH2FPmaIJrgtYcuy5wJRWAQXMfw9hLD/n/hpc/b/source/o/ \ No newline at end of file diff --git a/playbooks/roles/oci-cn-auth/tasks/common.yml b/playbooks/roles/oci-cn-auth/tasks/common.yml deleted file mode 100644 index 02578f2e..00000000 --- a/playbooks/roles/oci-cn-auth/tasks/common.yml +++ /dev/null @@ -1,34 +0,0 @@ ---- -- name: Check the oci-cn-auth version - shell: cat /opt/oci-hpc/oci-cn-auth/.version-oci_cn_auth - register: version - -- name: Download current oci-cn-auth .deb if it is not the current version - get_url: - url: "{{download_link}}oci-cn-auth_{{current_version}}_all.deb" - dest: "/tmp/" - when: 
ansible_os_family == 'Debian' and version.stdout != current_version - -- name: Install current oci-cn-auth .deb if it is not the current version - vars: - deb_name: - - "/tmp/oci-cn-auth_{{current_version}}_all.deb" - package_state: present - include_role: - name: safe_yum - when: ansible_os_family == 'Debian' and version.stdout != current_version - -- name: Download current oci-cn-auth .rpm if it is not the current version - get_url: - url: "{{download_link}}oci-cn-auth-{{current_version}}.el{{ansible_distribution_major_version}}.noarch.rpm" - dest: "/tmp/" - when: ansible_os_family == 'RedHat' and version.stdout != current_version - -- name: Install current oci-cn-auth .rpm if it is not the current version - vars: - package_name: - - "/tmp/oci-cn-auth-{{current_version}}.el{{ansible_distribution_major_version}}.noarch.rpm" - package_state: present - include_role: - name: safe_yum - when: ansible_os_family == 'RedHat' and version.stdout != current_version \ No newline at end of file diff --git a/playbooks/roles/oci-cn-auth/tasks/el.yml b/playbooks/roles/oci-cn-auth/tasks/el.yml new file mode 100644 index 00000000..3b84d17a --- /dev/null +++ b/playbooks/roles/oci-cn-auth/tasks/el.yml @@ -0,0 +1,19 @@ +--- +- name: Check the oci-cn-auth version + shell: cat /opt/oci-hpc/oci-cn-auth/.version-oci_cn_auth | awk -F- '{print $1}' + register: current_version + +- name: Download oci-cn-auth .rpm if the current version is lower + get_url: + url: "{{download_link}}oci-cn-auth-{{version}}-compute.el{{ansible_distribution_major_version}}.noarch.rpm" + dest: "/tmp/" + when: current_version.stdout < version + +- name: Install oci-cn-auth .rpm if the current version is lower + vars: + package_name: + - "/tmp/oci-cn-auth-{{version}}-compute.el{{ansible_distribution_major_version}}.noarch.rpm" + package_state: present + include_role: + name: safe_yum + when: current_version.stdout < version \ No newline at end of file diff --git a/playbooks/roles/oci-cn-auth/tasks/main.yml 
b/playbooks/roles/oci-cn-auth/tasks/main.yml index d9f88ea9..8705dde4 100644 --- a/playbooks/roles/oci-cn-auth/tasks/main.yml +++ b/playbooks/roles/oci-cn-auth/tasks/main.yml @@ -1 +1,6 @@ -- include: common.yml \ No newline at end of file +--- +- include: el.yml + when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' + +- include: ubuntu.yml + when: ansible_os_family == 'Debian' and ansible_distribution == 'Ubuntu' \ No newline at end of file diff --git a/playbooks/roles/oci-cn-auth/tasks/ubuntu.yml b/playbooks/roles/oci-cn-auth/tasks/ubuntu.yml new file mode 100644 index 00000000..797aa382 --- /dev/null +++ b/playbooks/roles/oci-cn-auth/tasks/ubuntu.yml @@ -0,0 +1,19 @@ +--- +- name: Check the oci-cn-auth version + shell: cat /opt/oci-hpc/oci-cn-auth/.version-oci_cn_auth | awk -F- '{print $1}' + register: current_version + +- name: Download oci-cn-auth .deb if the current version is lower + get_url: + url: "{{download_link}}oci-cn-auth_{{version}}-compute_all.deb" + dest: "/tmp/" + when: current_version.stdout < version + +- name: Install oci-cn-auth .deb if the current version is lower + vars: + deb_name: + - "/tmp/oci-cn-auth_{{version}}-compute_all.deb" + package_state: present + include_role: + name: safe_yum + when: current_version.stdout < version \ No newline at end of file From a0ef0c00403c213e124041d24dcb943498ec57fc Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Wed, 24 May 2023 22:51:29 -0700 Subject: [PATCH 03/20] updated new_nodes.yml and resize_add.yml --- playbooks/new_nodes.yml | 7 ++----- playbooks/resize_add.yml | 7 ++----- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index f55f04bf..cda52ba4 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -43,15 +43,12 @@ when: localdisk | default(true) | bool - hosts: compute + become: true + gather_facts: true tasks: - include_role: name: oci-cn-auth when: cluster_network|bool - -- hosts: 
compute - become: true - gather_facts: true - tasks: - include_role: name: rdma-interface when: cluster_network|bool diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index f0a99ad5..10942468 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -41,15 +41,12 @@ when: localdisk | default(true) | bool - hosts: compute_to_add + become: true + gather_facts: true tasks: - include_role: name: oci-cn-auth when: cluster_network|bool - -- hosts: compute_to_add - become: true - gather_facts: true - tasks: - include_role: name: rdma-interface when: cluster_network|bool From ae2b2092969921a91a32b3b6a9ae974f0d82e6eb Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Tue, 6 Jun 2023 11:44:52 -0700 Subject: [PATCH 04/20] update files for PAM to work on Ubuntu --- playbooks/roles/slurm/tasks/compute_pam.yml | 63 +++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/playbooks/roles/slurm/tasks/compute_pam.yml b/playbooks/roles/slurm/tasks/compute_pam.yml index 0e4a29ff..876e1b5a 100644 --- a/playbooks/roles/slurm/tasks/compute_pam.yml +++ b/playbooks/roles/slurm/tasks/compute_pam.yml @@ -8,12 +8,75 @@ +:wheel:ALL +:opc:ALL -:ALL:ALL + when: ansible_os_family == 'RedHat' + +- name: Edit /etc/security/access.conf + become: true + blockinfile: + dest: /etc/security/access.conf + block: | + +:root:ALL + +:wheel:ALL + +:ubuntu:ALL + -:ALL:ALL + when: ansible_distribution == 'Ubuntu' - name: Copy sshd file become: true copy: src: sshd dest: /etc/pam.d/sshd + when: ansible_os_family == 'RedHat' + +- name: Add required pam account permission + become: true + lineinfile: + path: /etc/pam.d/sshd + line: "account required /usr/local/lib/security/pam_slurm_adopt.so" + state: present + backup: yes + insertafter: EOF + when: ansible_distribution == 'Ubuntu' + +- name: Comment pam_systemd.so in /etc/pam.d/common-session + become: true + lineinfile: + path: /etc/pam.d/common-session + regexp: 'pam_systemd.so$' + line: "#session optional 
pam_systemd.so" + state: present + backup: yes + when: ansible_distribution == 'Ubuntu' + +- name: Comment pam_systemd.so in /etc/pam.d/runuser-l + become: true + lineinfile: + path: /etc/pam.d/runuser-l + regexp: 'pam_systemd.so$' + line: "#-session optional pam_systemd.so" + state: present + backup: yes + when: ansible_distribution == 'Ubuntu' + +- name: Comment pam_systemd.so in /etc/pam.d/systemd-user + become: true + lineinfile: + path: /etc/pam.d/systemd-user + regexp: 'pam_systemd.so$' + line: "#session optional pam_systemd.so" + state: present + backup: yes + when: ansible_distribution == 'Ubuntu' + +- name: Comment pam_access.so in /etc/pam.d/common-auth + become: true + lineinfile: + path: /etc/pam.d/common-auth + regexp: 'pam_access.so$' + line: "#account required pam_access.so" + state: present + backup: yes + when: ansible_distribution == 'Ubuntu' - name: Stop logind become: true From 77e3c97d526a3c652148684bdcf11db2306b603b Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Tue, 6 Jun 2023 12:20:00 -0700 Subject: [PATCH 05/20] update comments in schema and fix typo in README --- README.md | 2 +- schema.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index eb1771e2..38e0da78 100644 --- a/README.md +++ b/README.md @@ -350,7 +350,7 @@ validate -g y -cn --> This will run the GPU throttle check. validate -g --> This will run the GPU throttle check on the hosts provided in the file given. The gpu check host file should have a host name on each line. -validate -e y -cn --> This will run the GPU throttle check. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file. +validate -e y -cn --> This will run the /etc/hosts md5 sum check. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file. 
validate -e --> This will run the /etc/hosts md5 sum check on the hosts provided in the file given. The md5 sum check host file should have a host name on each line. diff --git a/schema.yaml b/schema.yaml index a13ae931..5df82cfd 100755 --- a/schema.yaml +++ b/schema.yaml @@ -236,7 +236,7 @@ variables: ldap: type: boolean title: "Configure LDAP authentication from bastion" - description: "When selected nodes will be configured to use LDAP authentication. User and group management can be performed using cluster commands. (Not working on Ubuntu, yet...)" + description: "When selected nodes will be configured to use LDAP authentication. User and group management can be performed using cluster commands." default: true cluster_name: title: "Name of the cluster" @@ -1032,7 +1032,7 @@ variables: type: boolean title: "Enable PAM" default: false - description: "Enable PAM for the Slurm cluster (Supported only on OL with RHCK kernel at this time). When PAM is enabled, users that are not in the sudo group will not be able to SSH into the compute nodes unless they have an active job running in Slurm." + description: "Enable PAM for the Slurm cluster (Supported on OL with RHCK kernel and Ubuntu at this time). When PAM is enabled, users that are not in the sudo group will not be able to SSH into the compute nodes unless they have an active job running in Slurm." visible: ${slurm} sacct_limits: From 880f128807c37ea484e1c6612a38d361b71cbe0c Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 7 Jun 2023 10:36:18 -0600 Subject: [PATCH 06/20] Add oci-cn-auth role before RDMA role. 
--- playbooks/site.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/playbooks/site.yml b/playbooks/site.yml index 5cc30597..7f3dc2c5 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -48,6 +48,9 @@ become: true gather_facts: true tasks: + - include_role: + name: oci-cn-auth + when: cluster_network|bool - include_role: name: rdma-interface when: cluster_network|bool From 92628e58d154f33f08d2d79f0437680370168406 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 7 Jun 2023 10:36:43 -0600 Subject: [PATCH 07/20] Change default to login node and rackaware --- schema.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/schema.yaml b/schema.yaml index 5df82cfd..f010d449 100755 --- a/schema.yaml +++ b/schema.yaml @@ -261,14 +261,14 @@ variables: dependsOn: compartmentId: ${targetCompartment} required: true - default: VM.Standard2.4 + default: VM.Standard.E4.Flex bastion_ocpus: title: "Cores" type: integer description: Number of OCPU's for flex shape minimum: 1 maximum: 64 - default: 2 + default: 4 visible: and: - or: @@ -334,7 +334,7 @@ variables: description: Number of memory for flex shape. Minimum 1GB per core. 
minimum: 1 maximum: 1024 - default: 16 + default: 32 visible: and: - and: @@ -1001,7 +1001,7 @@ variables: rack_aware: type: boolean title: "Create Rack aware topology" - default: false + default: true required: true description: "Slurm topology can define rack aware topologies to prioritize nodes on same racks per job.\n This is a LA feature and your tenancy needs to be whitelisted" visible: ${slurm} @@ -1271,7 +1271,7 @@ variables: login_node: type: boolean title: "Login Node" - default: false + default: true description: "Create an additional login node for users" login_ad: @@ -1292,7 +1292,7 @@ variables: dependsOn: compartmentId: ${targetCompartment} required: true - default: VM.Standard2.4 + default: VM.Standard.E4.Flex visible: ${login_node} login_ocpus: @@ -1300,7 +1300,7 @@ variables: description: Number of OCPU's for flex shape minimum: 1 maximum: 64 - default: 2 + default: 16 visible: and: - or: @@ -1369,7 +1369,7 @@ variables: description: Number of memory for flex shape. Minimum 1GB per core. minimum: 1 maximum: 1024 - default: 16 + default: 128 visible: and: - and: @@ -1399,7 +1399,7 @@ variables: required: true minimum: 50 title: "Size of the boot volume in GB" - default: 50 + default: 250 visible: ${login_node} login_block: @@ -1426,7 +1426,7 @@ variables: - "0. Lower performance" - "10. Balanced performance" - "20. High Performance" - default: "10. Balanced performance" + default: "20. 
High Performance" visible: and: - and: From cffd0cde141b14175c2b81c722da24a73efee647 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 7 Jun 2023 16:19:39 -0600 Subject: [PATCH 08/20] Change Default instance-pool shape --- schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema.yaml b/schema.yaml index f010d449..f2443aec 100755 --- a/schema.yaml +++ b/schema.yaml @@ -522,7 +522,7 @@ variables: instance_pool_shape: title: "Shape of the Compute Nodes" required: true - default: "VM.Standard2.4" + default: "VM.Standard.E4.Flex" type: oci:core:instanceshape:name dependsOn: compartmentId: ${targetCompartment} From 1aa0b04bda4a0a965a2dfde32b3b753d6388d635 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Thu, 8 Jun 2023 10:37:31 -0700 Subject: [PATCH 09/20] add --mca coll ^hcoll by default as it's required for all OFED 5.4 and newer images --- samples/gpu/nccl_run_allreduce.sh | 1 + samples/gpu/nccl_run_alltoall.sh | 1 + samples/gpu/qfabv1_nccl_run_allreduce.sbatch | 2 ++ samples/gpu/qfabv1_nccl_run_allreduce.sh | 2 ++ samples/gpu/qfabv1_nccl_run_alltoall.sh | 2 ++ 5 files changed, 8 insertions(+) diff --git a/samples/gpu/nccl_run_allreduce.sh b/samples/gpu/nccl_run_allreduce.sh index 850a7900..7548ede3 100644 --- a/samples/gpu/nccl_run_allreduce.sh +++ b/samples/gpu/nccl_run_allreduce.sh @@ -68,6 +68,7 @@ fi # final version mpirun --mca pml ucx \ --bind-to numa \ + --mca coll ^hcoll \ -x NCCL_DEBUG=WARN \ -x NCCL_IB_SL=0 \ -x NCCL_IB_TC=41 \ diff --git a/samples/gpu/nccl_run_alltoall.sh b/samples/gpu/nccl_run_alltoall.sh index 23a37cbf..cc8c7bec 100644 --- a/samples/gpu/nccl_run_alltoall.sh +++ b/samples/gpu/nccl_run_alltoall.sh @@ -75,6 +75,7 @@ fi # final version mpirun --mca pml ucx \ --bind-to numa \ + --mca coll ^hcoll \ -x NCCL_MAX_P2P_NCHANNELS=16 \ -x NCCL_DEBUG=WARN \ -x NCCL_IB_SL=0 \ diff --git a/samples/gpu/qfabv1_nccl_run_allreduce.sbatch b/samples/gpu/qfabv1_nccl_run_allreduce.sbatch index b78684a2..fbc0eb29 100644 --- 
a/samples/gpu/qfabv1_nccl_run_allreduce.sbatch +++ b/samples/gpu/qfabv1_nccl_run_allreduce.sbatch @@ -53,8 +53,10 @@ then var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16" fi +# you need --mca coll ^hcoll when using an image that has OFED 5.4 or newer mpirun --mca pml ucx \ --bind-to numa \ + --mca coll ^hcoll \ -x NCCL_DEBUG=WARN \ -x NCCL_IB_SL=0 \ -x NCCL_IB_TC=41 \ diff --git a/samples/gpu/qfabv1_nccl_run_allreduce.sh b/samples/gpu/qfabv1_nccl_run_allreduce.sh index 28b3afdb..82062b34 100644 --- a/samples/gpu/qfabv1_nccl_run_allreduce.sh +++ b/samples/gpu/qfabv1_nccl_run_allreduce.sh @@ -66,9 +66,11 @@ then var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16" fi +# you need --mca coll ^hcoll when using an image that has OFED 5.4 or newer # final version mpirun --mca pml ucx \ --bind-to numa \ + --mca coll ^hcoll \ -x NCCL_DEBUG=WARN \ -x NCCL_IB_SL=0 \ -x NCCL_IB_TC=41 \ diff --git a/samples/gpu/qfabv1_nccl_run_alltoall.sh b/samples/gpu/qfabv1_nccl_run_alltoall.sh index dd7975f4..1419d68a 100644 --- a/samples/gpu/qfabv1_nccl_run_alltoall.sh +++ b/samples/gpu/qfabv1_nccl_run_alltoall.sh @@ -78,8 +78,10 @@ fi # Use NCCL_IB_QPS_PER_CONNECTION=4 for QFAB1.0, should get around 15GB/s NCCL Bus BW. 
# Use -x NCCL_MAX_P2P_NCHANNELS=16 until NCCL 2.12 release which has a fix to allow NCCL_MAX_P2P_NCHANNELS=32 for nodes with 16 RDMA NICss # final version + # you need --mca coll ^hcoll when using an image that has OFED 5.4 or newer mpirun --mca pml ucx \ --bind-to numa \ + --mca coll ^hcoll \ -x NCCL_MAX_P2P_NCHANNELS=16 \ -x NCCL_DEBUG=WARN \ -x NCCL_IB_SL=0 \ From e6d22b651ad2f1f6ea6c9a0e4055c7c826630716 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Thu, 8 Jun 2023 10:38:59 -0700 Subject: [PATCH 10/20] add comment --- samples/gpu/nccl_run_alltoall.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/gpu/nccl_run_alltoall.sh b/samples/gpu/nccl_run_alltoall.sh index cc8c7bec..7591cc36 100644 --- a/samples/gpu/nccl_run_alltoall.sh +++ b/samples/gpu/nccl_run_alltoall.sh @@ -73,6 +73,7 @@ fi # Use -x NCCL_MAX_P2P_NCHANNELS=16 until NCCL 2.12 release which has a fix to allow NCCL_MAX_P2P_NCHANNELS=32 for nodes with 16 RDMA NICss # final version + # you need --mca coll ^hcoll when using an image that has OFED 5.4 or newer mpirun --mca pml ucx \ --bind-to numa \ --mca coll ^hcoll \ From 0f66271bdf0e859eebeccb917a22a1af549525be Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Thu, 8 Jun 2023 17:18:39 -0700 Subject: [PATCH 11/20] changed grafana version to 8.5.21 for ubuntu and for OL changed the location to get the 8.5.21-1 package --- autoscaling/tf_init/inventory.tpl | 1 + conf/variables.tpl | 1 + playbooks/new_nodes.yml | 3 ++ playbooks/resize_add.yml | 3 ++ playbooks/roles/autoscaling_mon/tasks/el.yml | 30 +----------- .../roles/autoscaling_mon/tasks/ubuntu.yml | 46 +------------------ playbooks/roles/grafana/tasks/el.yml | 2 +- playbooks/roles/grafana/tasks/ubuntu.yml | 19 ++++---- 8 files changed, 21 insertions(+), 84 deletions(-) diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index 4b7811ba..9ae1cd02 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -60,6 +60,7 @@ 
log_vol=${log_vol} ldap=${ldap} queue=${queue} instance_type=${instance_type} +monitoring=${monitoring} hyperthreading=${hyperthreading} privilege_sudo=${privilege_sudo} privilege_group_name=${privilege_group_name} diff --git a/conf/variables.tpl b/conf/variables.tpl index bffb4b65..1072dbaa 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -116,6 +116,7 @@ variable "hyperthreading" { default = ##HT## } variable "unsupported" { default = ${unsupported} } variable "image_ocid" { default = "##IMAGE##" } variable "ldap" { default = ${ldap} } +variable "monitoring" { default = ${monitoring} } variable "autoscaling_monitoring" { default = ${autoscaling_monitoring} } diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index cda52ba4..4ddab322 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -203,6 +203,9 @@ - include_role: name: slurm when: slurm|default(false)|bool + - include_role: + name: influxdb + when: monitoring|default(false)|bool - include_role: name: telegraf when: monitoring|default(false)|bool \ No newline at end of file diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 10942468..2f9149fb 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -205,6 +205,9 @@ - include_role: name: slurm when: slurm|default(false)|bool + - include_role: + name: influxdb + when: monitoring|default(false)|bool - include_role: name: telegraf when: monitoring|default(false)|bool \ No newline at end of file diff --git a/playbooks/roles/autoscaling_mon/tasks/el.yml b/playbooks/roles/autoscaling_mon/tasks/el.yml index c14ccd72..5c37fdfc 100755 --- a/playbooks/roles/autoscaling_mon/tasks/el.yml +++ b/playbooks/roles/autoscaling_mon/tasks/el.yml @@ -51,35 +51,9 @@ sslcacert: /etc/pki/tls/certs/ca-bundle.crt - name: install grafana - vars: - package_name: - - grafana-8.5.21-1 - package_state: present include_role: - name: safe_yum - -- name: start grafana - become: true - service: - name: grafana-server - 
state: restarted - enabled: true - -- name: Ensure grafana key directory exists - file: - path: "/etc/opt/oci-hpc/passwords/grafana" - state: directory - delegate_to: localhost - -- name: Check api key list - uri: - url: "{{ grafana_api_url }}/api/auth/keys" - user: "{{ grafana_security.admin_user }}" - password: "{{ grafana_security.admin_password }}" - force_basic_auth: true - return_content: true - no_log: false - register: existing_api_keys + name: grafana + when: not monitoring|default(false)|bool - name: install mysql-shell and connector vars: diff --git a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml index 4f46e0a8..64020bc0 100644 --- a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml +++ b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml @@ -50,52 +50,10 @@ # sslverify: 1 # sslcacert: /etc/pki/tls/certs/ca-bundle.crt -- name: Add grafana key - become: true - apt_key: - state: present - url: https://packages.grafana.com/gpg.key - -- name: Manage grafana APT repositories - become: true - apt_repository: - repo: deb https://packages.grafana.com/oss/deb stable main - state: present - - name: install grafana - vars: - package_name: - - grafana-8.5.21-1 - package_state: present include_role: - name: safe_yum - -- name: start grafana - become: true - service: - name: grafana-server - state: restarted - enabled: true - -- name: Ensure grafana key directory exists - file: - path: "/etc/opt/oci-hpc/passwords/grafana" - state: directory - delegate_to: localhost - -- name: Check api key list - uri: - url: "{{ grafana_api_url }}/api/auth/keys" - method: GET - user: "{{ grafana_security.admin_user }}" - password: "{{ grafana_security.admin_password }}" - force_basic_auth: true - return_content: true - no_log: false - register: existing_api_keys - retries: 5 - delay: 5 - until: existing_api_keys is not failed + name: grafana + when: not monitoring|default(false)|bool # - name: Import mysql-2022 key # become: true diff 
--git a/playbooks/roles/grafana/tasks/el.yml b/playbooks/roles/grafana/tasks/el.yml index 7172bf96..d90ef113 100755 --- a/playbooks/roles/grafana/tasks/el.yml +++ b/playbooks/roles/grafana/tasks/el.yml @@ -15,7 +15,7 @@ - name: install grafana vars: package_name: - - grafana-8.5.21-1 + - https://dl.grafana.com/oss/release/grafana-8.5.21-1.x86_64.rpm package_state: present include_role: name: safe_yum diff --git a/playbooks/roles/grafana/tasks/ubuntu.yml b/playbooks/roles/grafana/tasks/ubuntu.yml index af9fa526..45b9ac83 100644 --- a/playbooks/roles/grafana/tasks/ubuntu.yml +++ b/playbooks/roles/grafana/tasks/ubuntu.yml @@ -18,20 +18,17 @@ state: present url: https://packages.grafana.com/gpg.key -- name: Manage grafana APT repositories +- name: Download grafana 8.5.21 package + get_url: + url: https://dl.grafana.com/oss/release/grafana_8.5.21_amd64.deb + dest: /tmp/grafana_8.5.21_amd64.deb + +- name: Install grafana 8.5.21 package become: true - apt_repository: - repo: deb https://packages.grafana.com/oss/deb stable main + ansible.builtin.apt: + deb: /tmp/grafana_8.5.21_amd64.deb state: present -- name: install grafana - vars: - package_name: - - grafana-8.5.21-1 - package_state: present - include_role: - name: safe_yum - - name: start grafana become: true service: From 8860afbac4ff63449ffc2fd987091bad5cf18b0e Mon Sep 17 00:00:00 2001 From: anoopna Date: Fri, 9 Jun 2023 15:17:58 +0530 Subject: [PATCH 12/20] Update common.yml --- playbooks/roles/nvidia_peermem/tasks/common.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/playbooks/roles/nvidia_peermem/tasks/common.yml b/playbooks/roles/nvidia_peermem/tasks/common.yml index 51971505..946ab7e7 100644 --- a/playbooks/roles/nvidia_peermem/tasks/common.yml +++ b/playbooks/roles/nvidia_peermem/tasks/common.yml @@ -6,20 +6,22 @@ register: shape failed_when: false - - name: Check if nvidia drivers are installed shell: cat /sys/module/nvidia/version | wc -l register: nvidia when: shape.stdout != 
"" - - name: Check if nvidia_peermem module is loaded shell: lsmod | grep nvidia_peermem | wc -l register: result when: shape.stdout != "" and nvidia.stdout == '1' +- name: check ofed version + shell: ofed_info |grep MLNX_OFED_LINUX|grep -v rpm|awk -F \"(\" '{print $2}'|cut -c 6-|awk -F \"-\" '{print $1}' + register: ofed_version + when: shape.stdout != "" and nvidia.stdout == '1' - name: Load nvidia_peermem module become: true shell: modprobe nvidia_peermem - when: shape.stdout != "" and nvidia.stdout == '1' and result.stdout != '3' \ No newline at end of file + when: shape.stdout != "" and nvidia.stdout == '1' and result.stdout != '3' and ofed_version.stdout|int >= '5.1' \ No newline at end of file From b9d79517886fd44a6f56bfebba458c697574266a Mon Sep 17 00:00:00 2001 From: anoopna Date: Fri, 9 Jun 2023 15:58:04 +0530 Subject: [PATCH 13/20] Update common.yml --- playbooks/roles/nvidia_peermem/tasks/common.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/nvidia_peermem/tasks/common.yml b/playbooks/roles/nvidia_peermem/tasks/common.yml index 946ab7e7..bf7fc324 100644 --- a/playbooks/roles/nvidia_peermem/tasks/common.yml +++ b/playbooks/roles/nvidia_peermem/tasks/common.yml @@ -16,8 +16,10 @@ register: result when: shape.stdout != "" and nvidia.stdout == '1' -- name: check ofed version - shell: ofed_info |grep MLNX_OFED_LINUX|grep -v rpm|awk -F \"(\" '{print $2}'|cut -c 6-|awk -F \"-\" '{print $1}' +- name: Check ofed version + shell: + cmd: | + /usr/bin/ofed_info |grep MLNX_OFED_LINUX|grep -v rpm|awk -F "(" '{print $2}'|cut -c 6-|awk -F "-" '{print $1}' register: ofed_version when: shape.stdout != "" and nvidia.stdout == '1' From 77dfdc65d4a0beb7b6bc74869eb45b4ffa5bdfaa Mon Sep 17 00:00:00 2001 From: anoopna Date: Fri, 9 Jun 2023 20:55:25 +0530 Subject: [PATCH 14/20] Update common.yml --- playbooks/roles/nvidia_peermem/tasks/common.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/playbooks/roles/nvidia_peermem/tasks/common.yml b/playbooks/roles/nvidia_peermem/tasks/common.yml index bf7fc324..09a595c4 100644 --- a/playbooks/roles/nvidia_peermem/tasks/common.yml +++ b/playbooks/roles/nvidia_peermem/tasks/common.yml @@ -20,10 +20,10 @@ shell: cmd: | /usr/bin/ofed_info |grep MLNX_OFED_LINUX|grep -v rpm|awk -F "(" '{print $2}'|cut -c 6-|awk -F "-" '{print $1}' - register: ofed_version + register: ofed_version_local when: shape.stdout != "" and nvidia.stdout == '1' - name: Load nvidia_peermem module become: true shell: modprobe nvidia_peermem - when: shape.stdout != "" and nvidia.stdout == '1' and result.stdout != '3' and ofed_version.stdout|int >= '5.1' \ No newline at end of file + when: shape.stdout != "" and nvidia.stdout == '1' and result.stdout != '3' and ofed_version_local.stdout|int >= '5.1' \ No newline at end of file From 9be56efddcebe2773f5657a47fafacdfbeb11faf Mon Sep 17 00:00:00 2001 From: anoopna Date: Fri, 9 Jun 2023 21:25:01 +0530 Subject: [PATCH 15/20] Update common.yml --- playbooks/roles/nvidia_peermem/tasks/common.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/playbooks/roles/nvidia_peermem/tasks/common.yml b/playbooks/roles/nvidia_peermem/tasks/common.yml index 09a595c4..d5764cd8 100644 --- a/playbooks/roles/nvidia_peermem/tasks/common.yml +++ b/playbooks/roles/nvidia_peermem/tasks/common.yml @@ -3,27 +3,27 @@ shell: cmd: "curl -sH \"Authorization: Bearer Oracle\" -L http://169.254.169.254/opc/v2/instance/ | jq .shape | grep GPU" warn: false - register: shape + register: shape_gpu failed_when: false - name: Check if nvidia drivers are installed shell: cat /sys/module/nvidia/version | wc -l register: nvidia - when: shape.stdout != "" + when: shape_gpu.stdout != "" - name: Check if nvidia_peermem module is loaded shell: lsmod | grep nvidia_peermem | wc -l register: result - when: shape.stdout != "" and nvidia.stdout == '1' + when: shape_gpu.stdout != "" and nvidia.stdout == '1' - name: Check 
ofed version shell: cmd: | /usr/bin/ofed_info |grep MLNX_OFED_LINUX|grep -v rpm|awk -F "(" '{print $2}'|cut -c 6-|awk -F "-" '{print $1}' register: ofed_version_local - when: shape.stdout != "" and nvidia.stdout == '1' + when: shape_gpu.stdout != "" and nvidia.stdout == '1' - name: Load nvidia_peermem module become: true shell: modprobe nvidia_peermem - when: shape.stdout != "" and nvidia.stdout == '1' and result.stdout != '3' and ofed_version_local.stdout|int >= '5.1' \ No newline at end of file + when: shape_gpu.stdout != "" and nvidia.stdout == '1' and result.stdout != '3' and ofed_version_local.stdout|int >= '5.1' \ No newline at end of file From 65e3e13737cbd8a163086ff88ee85d679b78cee6 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 9 Jun 2023 11:34:41 -0700 Subject: [PATCH 16/20] fix for adding autoscaling nodes to cluster dashboard as well. removing nodes via resizing or autoscaling still pending. --- autoscaling/tf_init/bastion_update.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/autoscaling/tf_init/bastion_update.tf b/autoscaling/tf_init/bastion_update.tf index 58904dbd..df579eb0 100755 --- a/autoscaling/tf_init/bastion_update.tf +++ b/autoscaling/tf_init/bastion_update.tf @@ -62,6 +62,7 @@ resource "local_file" "inventory" { instance_pool_ocpus=local.instance_pool_ocpus, queue=var.queue, instance_type=var.instance_type, + monitoring=var.monitoring, autoscaling_monitoring = var.autoscaling_monitoring, unsupported = var.unsupported, hyperthreading = var.hyperthreading, From c16bd62c231e93a9ee46099542d70160ae0bb83e Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Fri, 9 Jun 2023 15:56:02 -0600 Subject: [PATCH 17/20] Delete Lock file before update --- playbooks/roles/fix_broken/tasks/ubuntu.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/playbooks/roles/fix_broken/tasks/ubuntu.yml b/playbooks/roles/fix_broken/tasks/ubuntu.yml index a522df45..cc8aae1b 100644 --- a/playbooks/roles/fix_broken/tasks/ubuntu.yml +++ 
b/playbooks/roles/fix_broken/tasks/ubuntu.yml @@ -72,6 +72,14 @@ delay: 10 until: result.stdout | int == 0 +- name: Ensure lock file is removed + become: true + file: + path: "/var/lib/apt/lists/lock" + state: absent + retries: 30 + delay: 10 + until: result.stdout | int == 0 - name: Run apt-get update become: true @@ -80,7 +88,6 @@ PID1=$! wait $PID1 - - name: Run fix-broken become: true shell: | From 08e96aa871142fb7709d55712407fbec92cf0a9c Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 12 Jun 2023 23:18:30 -0600 Subject: [PATCH 18/20] Fix Flex shape when not selected --- schema.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/schema.yaml b/schema.yaml index f2443aec..c128bfcc 100755 --- a/schema.yaml +++ b/schema.yaml @@ -540,6 +540,8 @@ variables: default: 2 visible: and: + - not: + - ${cluster_network} - or: - eq: - ${instance_pool_shape} @@ -581,6 +583,8 @@ variables: default: false visible: and: + - not: + - ${cluster_network} - or: - eq: - ${instance_pool_shape} @@ -625,6 +629,7 @@ variables: - "VM.Standard3.Flex" - and: - ${instance_pool_custom_memory} + - ${cluster_network} required: true node_count: From 1ba9b90f29427d8a100b8d8c6de3fdd3de01d370 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 12 Jun 2023 23:18:50 -0600 Subject: [PATCH 19/20] Restart oci-cn-auth if installing --- playbooks/roles/oci-cn-auth/tasks/el.yml | 8 ++++++++ playbooks/roles/oci-cn-auth/tasks/ubuntu.yml | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/playbooks/roles/oci-cn-auth/tasks/el.yml b/playbooks/roles/oci-cn-auth/tasks/el.yml index 3b84d17a..1bcc7241 100644 --- a/playbooks/roles/oci-cn-auth/tasks/el.yml +++ b/playbooks/roles/oci-cn-auth/tasks/el.yml @@ -16,4 +16,12 @@ package_state: present include_role: name: safe_yum + when: current_version.stdout < version + +- name: Restart the OCI CN AUTH service + become: true + service: + name: oci-cn-auth + state: restarted + enabled: yes when: current_version.stdout < version \ No newline 
at end of file diff --git a/playbooks/roles/oci-cn-auth/tasks/ubuntu.yml b/playbooks/roles/oci-cn-auth/tasks/ubuntu.yml index 797aa382..da50d1ff 100644 --- a/playbooks/roles/oci-cn-auth/tasks/ubuntu.yml +++ b/playbooks/roles/oci-cn-auth/tasks/ubuntu.yml @@ -16,4 +16,12 @@ package_state: present include_role: name: safe_yum + when: current_version.stdout < version + +- name: Restart the OCI CN AUTH service + become: true + service: + name: oci-cn-auth + state: restarted + enabled: yes when: current_version.stdout < version \ No newline at end of file From b7635750b0ae69c8f4b03630c5cd48e319ccef9f Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 14 Jun 2023 16:07:56 -0600 Subject: [PATCH 20/20] Update to latest terraform provider (5.1.0) --- autoscaling/tf_init/versions.tf | 2 +- versions.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/autoscaling/tf_init/versions.tf b/autoscaling/tf_init/versions.tf index 577f3255..6dd2b529 100755 --- a/autoscaling/tf_init/versions.tf +++ b/autoscaling/tf_init/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "4.115.0" + version = "5.1.0" } } } \ No newline at end of file diff --git a/versions.tf b/versions.tf index 577f3255..6dd2b529 100755 --- a/versions.tf +++ b/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "4.115.0" + version = "5.1.0" } } } \ No newline at end of file