Skip to content

Commit

Permalink
Merge pull request #26 from oracle-quickstart/2.10.2.1
Browse files Browse the repository at this point in the history
2.10.2.1
  • Loading branch information
arnaudfroidmont authored Jun 14, 2023
2 parents 3595386 + b763575 commit 763d350
Show file tree
Hide file tree
Showing 27 changed files with 205 additions and 108 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ validate -g y -cn <cluster name file> --> This will run the GPU throttle check.

validate -g <gpu check host file> --> This will run the GPU throttle check on the hosts provided in the file given. The gpu check host file should have a host name on each line.

validate -e y -cn <cluster name file> --> This will run the GPU throttle check. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file.
validate -e y -cn <cluster name file> --> This will run the /etc/hosts md5 sum check. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file.

validate -e <md5 sum check host file> --> This will run the /etc/hosts md5 sum check on the hosts provided in the file given. The md5 sum check host file should have a host name on each line.

Expand Down
1 change: 1 addition & 0 deletions autoscaling/tf_init/bastion_update.tf
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ resource "local_file" "inventory" {
instance_pool_ocpus=local.instance_pool_ocpus,
queue=var.queue,
instance_type=var.instance_type,
monitoring=var.monitoring,
autoscaling_monitoring = var.autoscaling_monitoring,
unsupported = var.unsupported,
hyperthreading = var.hyperthreading,
Expand Down
1 change: 1 addition & 0 deletions autoscaling/tf_init/inventory.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ log_vol=${log_vol}
ldap=${ldap}
queue=${queue}
instance_type=${instance_type}
monitoring=${monitoring}
hyperthreading=${hyperthreading}
privilege_sudo=${privilege_sudo}
privilege_group_name=${privilege_group_name}
Expand Down
2 changes: 1 addition & 1 deletion autoscaling/tf_init/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ terraform {
required_providers {
oci = {
source = "oracle/oci"
version = "4.115.0"
version = "5.1.0"
}
}
}
1 change: 1 addition & 0 deletions conf/variables.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ variable "hyperthreading" { default = ##HT## }
variable "unsupported" { default = ${unsupported} }
variable "image_ocid" { default = "##IMAGE##" }
variable "ldap" { default = ${ldap} }
variable "monitoring" { default = ${monitoring} }
variable "autoscaling_monitoring" { default = ${autoscaling_monitoring} }


Expand Down
6 changes: 6 additions & 0 deletions playbooks/new_nodes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@
become: true
gather_facts: true
tasks:
- include_role:
name: oci-cn-auth
when: cluster_network|bool
- include_role:
name: rdma-interface
when: cluster_network|bool
Expand Down Expand Up @@ -200,6 +203,9 @@
- include_role:
name: slurm
when: slurm|default(false)|bool
- include_role:
name: influxdb
when: monitoring|default(false)|bool
- include_role:
name: telegraf
when: monitoring|default(false)|bool
6 changes: 6 additions & 0 deletions playbooks/resize_add.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@
become: true
gather_facts: true
tasks:
- include_role:
name: oci-cn-auth
when: cluster_network|bool
- include_role:
name: rdma-interface
when: cluster_network|bool
Expand Down Expand Up @@ -202,6 +205,9 @@
- include_role:
name: slurm
when: slurm|default(false)|bool
- include_role:
name: influxdb
when: monitoring|default(false)|bool
- include_role:
name: telegraf
when: monitoring|default(false)|bool
30 changes: 2 additions & 28 deletions playbooks/roles/autoscaling_mon/tasks/el.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,35 +51,9 @@
sslcacert: /etc/pki/tls/certs/ca-bundle.crt

- name: install grafana
vars:
package_name:
- grafana-8.5.21-1
package_state: present
include_role:
name: safe_yum

- name: start grafana
become: true
service:
name: grafana-server
state: restarted
enabled: true

- name: Ensure grafana key directory exists
file:
path: "/etc/opt/oci-hpc/passwords/grafana"
state: directory
delegate_to: localhost

- name: Check api key list
uri:
url: "{{ grafana_api_url }}/api/auth/keys"
user: "{{ grafana_security.admin_user }}"
password: "{{ grafana_security.admin_password }}"
force_basic_auth: true
return_content: true
no_log: false
register: existing_api_keys
name: grafana
when: not monitoring|default(false)|bool

- name: install mysql-shell and connector
vars:
Expand Down
46 changes: 2 additions & 44 deletions playbooks/roles/autoscaling_mon/tasks/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,52 +50,10 @@
# sslverify: 1
# sslcacert: /etc/pki/tls/certs/ca-bundle.crt

- name: Add grafana key
become: true
apt_key:
state: present
url: https://packages.grafana.com/gpg.key

- name: Manage grafana APT repositories
become: true
apt_repository:
repo: deb https://packages.grafana.com/oss/deb stable main
state: present

- name: install grafana
vars:
package_name:
- grafana-8.5.21-1
package_state: present
include_role:
name: safe_yum

- name: start grafana
become: true
service:
name: grafana-server
state: restarted
enabled: true

- name: Ensure grafana key directory exists
file:
path: "/etc/opt/oci-hpc/passwords/grafana"
state: directory
delegate_to: localhost

- name: Check api key list
uri:
url: "{{ grafana_api_url }}/api/auth/keys"
method: GET
user: "{{ grafana_security.admin_user }}"
password: "{{ grafana_security.admin_password }}"
force_basic_auth: true
return_content: true
no_log: false
register: existing_api_keys
retries: 5
delay: 5
until: existing_api_keys is not failed
name: grafana
when: not monitoring|default(false)|bool

# - name: Import mysql-2022 key
# become: true
Expand Down
9 changes: 8 additions & 1 deletion playbooks/roles/fix_broken/tasks/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,14 @@
delay: 10
until: result.stdout | int == 0

# Remove the apt lists lock so the following apt-get update is not blocked.
# The retry loop is tied to this task's own result: the previous `until`
# tested `result`, which this task never registers.
- name: Ensure lock file is removed
  become: true
  file:
    path: "/var/lib/apt/lists/lock"
    state: absent
  register: apt_lists_lock
  retries: 30
  delay: 10
  until: apt_lists_lock is succeeded

- name: Run apt-get update
become: true
Expand All @@ -80,7 +88,6 @@
PID1=$!
wait $PID1
- name: Run fix-broken
become: true
shell: |
Expand Down
2 changes: 1 addition & 1 deletion playbooks/roles/grafana/tasks/el.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
- name: install grafana
vars:
package_name:
- grafana-8.5.21-1
- https://dl.grafana.com/oss/release/grafana-8.5.21-1.x86_64.rpm
package_state: present
include_role:
name: safe_yum
Expand Down
19 changes: 8 additions & 11 deletions playbooks/roles/grafana/tasks/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,17 @@
state: present
url: https://packages.grafana.com/gpg.key

- name: Manage grafana APT repositories
- name: Download grafana 8.5.21 package
get_url:
url: https://dl.grafana.com/oss/release/grafana_8.5.21_amd64.deb
dest: /tmp/grafana_8.5.21_amd64.deb

- name: Install grafana 8.5.21 package
become: true
apt_repository:
repo: deb https://packages.grafana.com/oss/deb stable main
ansible.builtin.apt:
deb: /tmp/grafana_8.5.21_amd64.deb
state: present

- name: install grafana
vars:
package_name:
- grafana-8.5.21-1
package_state: present
include_role:
name: safe_yum

- name: start grafana
become: true
service:
Expand Down
16 changes: 10 additions & 6 deletions playbooks/roles/nvidia_peermem/tasks/common.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,27 @@
shell:
cmd: "curl -sH \"Authorization: Bearer Oracle\" -L http://169.254.169.254/opc/v2/instance/ | jq .shape | grep GPU"
warn: false
register: shape
register: shape_gpu
failed_when: false


- name: Check if nvidia drivers are installed
shell: cat /sys/module/nvidia/version | wc -l
register: nvidia
when: shape.stdout != ""

when: shape_gpu.stdout != ""

- name: Check if nvidia_peermem module is loaded
shell: lsmod | grep nvidia_peermem | wc -l
register: result
when: shape.stdout != "" and nvidia.stdout == '1'
when: shape_gpu.stdout != "" and nvidia.stdout == '1'

- name: Check ofed version
shell:
cmd: |
/usr/bin/ofed_info |grep MLNX_OFED_LINUX|grep -v rpm|awk -F "(" '{print $2}'|cut -c 6-|awk -F "-" '{print $1}'
register: ofed_version_local
when: shape_gpu.stdout != "" and nvidia.stdout == '1'

# Load the nvidia_peermem kernel module on GPU shapes that have the NVIDIA
# driver installed, do not already report the module as loaded, and run
# MLNX OFED 5.1 or newer.
- name: Load nvidia_peermem module
  become: true
  shell: modprobe nvidia_peermem
  # The conditions are checked in order, so the registered results of the
  # earlier (possibly skipped) checks are only read on GPU nodes.
  # The OFED release is compared with the `version` test: `|int >= '5.1'`
  # compared an integer with a string and discarded the minor version.
  when:
    - shape_gpu.stdout != ""
    - nvidia.stdout == '1'
    - result.stdout != '3'
    - ofed_version_local.stdout | default('0', true) is version('5.1', '>=')
2 changes: 2 additions & 0 deletions playbooks/roles/oci-cn-auth/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
---
# Release of oci-cn-auth that the role's tasks compare against the installed
# version and use to build the package file name. Quoted so it always stays
# a string.
version: "2.1.4"

# URL prefix the package file name is appended to when downloading.
download_link: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/F7gihhVuJbrnsV8KjAMA7XblkZYRBYJ2xAH2FPmaIJrgtYcuy5wJRWAQXMfw9hLD/n/hpc/b/source/o/"
27 changes: 27 additions & 0 deletions playbooks/roles/oci-cn-auth/tasks/el.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
# Upgrade oci-cn-auth from the .rpm published under `download_link` when the
# installed release is older than the `version` role variable.

- name: Check the oci-cn-auth version
  # Prints the part of the version file that precedes the first "-".
  shell: cat /opt/oci-hpc/oci-cn-auth/.version-oci_cn_auth | awk -F- '{print $1}'
  register: current_version
  # Read-only probe: never report it as a change.
  changed_when: false

- name: Upgrade oci-cn-auth if the current version is lower
  # Compare as versions, not as strings: "2.1.10" < "2.1.4" is true when
  # compared lexicographically. An empty or missing result (nothing
  # installed yet) is treated as version 0 so the package gets installed.
  when: current_version.stdout | default('0', true) is version(version, '<')
  block:
    - name: Download oci-cn-auth .rpm if the current version is lower
      get_url:
        url: "{{ download_link }}oci-cn-auth-{{ version }}-compute.el{{ ansible_distribution_major_version }}.noarch.rpm"
        dest: "/tmp/"

    - name: Install oci-cn-auth .rpm if the current version is lower
      vars:
        package_name:
          - "/tmp/oci-cn-auth-{{ version }}-compute.el{{ ansible_distribution_major_version }}.noarch.rpm"
        package_state: present
      include_role:
        name: safe_yum

    - name: Restart the OCI CN AUTH service
      become: true
      service:
        name: oci-cn-auth
        state: restarted
        enabled: true
6 changes: 6 additions & 0 deletions playbooks/roles/oci-cn-auth/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
# Entry point of the oci-cn-auth role: run the task file that matches the
# target's distribution. `include_tasks` replaces the deprecated bare
# `include`.

- name: Run the Oracle Linux tasks
  include_tasks: el.yml
  when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux'

- name: Run the Ubuntu tasks
  include_tasks: ubuntu.yml
  when: ansible_os_family == 'Debian' and ansible_distribution == 'Ubuntu'
27 changes: 27 additions & 0 deletions playbooks/roles/oci-cn-auth/tasks/ubuntu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
# Upgrade oci-cn-auth from the .deb published under `download_link` when the
# installed release is older than the `version` role variable.

- name: Check the oci-cn-auth version
  # Prints the part of the version file that precedes the first "-".
  shell: cat /opt/oci-hpc/oci-cn-auth/.version-oci_cn_auth | awk -F- '{print $1}'
  register: current_version
  # Read-only probe: never report it as a change.
  changed_when: false

- name: Upgrade oci-cn-auth if the current version is lower
  # Compare as versions, not as strings: "2.1.10" < "2.1.4" is true when
  # compared lexicographically. An empty or missing result (nothing
  # installed yet) is treated as version 0 so the package gets installed.
  when: current_version.stdout | default('0', true) is version(version, '<')
  block:
    - name: Download oci-cn-auth .deb if the current version is lower
      get_url:
        url: "{{ download_link }}oci-cn-auth_{{ version }}-compute_all.deb"
        dest: "/tmp/"

    - name: Install oci-cn-auth .deb if the current version is lower
      vars:
        # safe_yum installs from `deb_name` when it is defined.
        deb_name:
          - "/tmp/oci-cn-auth_{{ version }}-compute_all.deb"
        package_state: present
      include_role:
        name: safe_yum

    - name: Restart the OCI CN AUTH service
      become: true
      service:
        name: oci-cn-auth
        state: restarted
        enabled: true
2 changes: 1 addition & 1 deletion playbooks/roles/safe_yum/tasks/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
delay: 5
when: not deb_name is defined

- name: "Installing/Removing {{package_name}}"
- name: "Installing/Removing {{deb_name}}"
become: true
apt:
deb: "{{item}}"
Expand Down
Loading

0 comments on commit 763d350

Please sign in to comment.