Skip to content

Commit

Permalink
Merge pull request #31 from oracle-quickstart/2.10.4
Browse files Browse the repository at this point in the history
2.10.4
  • Loading branch information
arnaudfroidmont authored Jan 5, 2024
2 parents f0499b7 + 5b6dd34 commit 7f6f274
Show file tree
Hide file tree
Showing 83 changed files with 1,575 additions and 383 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ sleep 1000

- Instance Type: You can specify the OCI instance type that you’d like to run on as a constraint. This will make sure that you run on the right shape and also generate the right cluster. Instance types are defined in the `/opt/oci-hpc/conf/queues.conf` file in YAML format. Leave all of the fields in there even if they are not used. You can define multiple queues and multiple instance types in each queue. If you do not select an instance type when creating your job, it will use the default one.

- cpu-bind: On Ubuntu 22.04, we are switching to Cgroup v2, and we noticed that when hyperthreading is turned off, the default cpu-bind may cause issues. If you get an error like `error: task_g_set_affinity: Invalid argument`, you can try running your job with `--cpu-bind=none` or `--cpu-bind=sockets`.
## Clusters folders:
```
/opt/oci-hpc/autoscaling/clusters/clustername
Expand Down
4 changes: 2 additions & 2 deletions autoscaling/crontab/autoscale_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def getTopology(clusterName):
# Get the list of Jobs in all states
def getJobs():
# changing the position of Dependency as it is giving blank instead of null. to handle that, putting it at the end.
out = subprocess.Popen(['squeue','-O','STATE,JOBID,FEATURE:100,NUMNODES,Partition,UserName,Dependency'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
out = subprocess.Popen(['squeue','-r','-O','STATE,JOBID,FEATURE:100,NUMNODES,Partition,UserName,Dependency'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
stdout,stderr = out.communicate()
return stdout.split("\n")[1:]

Expand Down Expand Up @@ -433,4 +433,4 @@ try:

except Exception:
traceback.print_exc()
os.remove(lockfile)
os.remove(lockfile)
5 changes: 4 additions & 1 deletion autoscaling/tf_init/bastion_update.tf
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,10 @@ resource "local_file" "inventory" {
spack = var.spack,
ldap = var.ldap,
bastion_block = var.bastion_block,
login_block = var.login_block,
scratch_nfs_type = local.scratch_nfs_type,
bastion_mount_ip = var.bastion_mount_ip,
login_mount_ip = var.login_mount_ip,
cluster_mount_ip = local.mount_ip,
cluster_name = local.cluster_name,
shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape,
Expand All @@ -72,7 +74,8 @@ resource "local_file" "inventory" {
bastion_username = var.bastion_username,
compute_username = var.compute_username,
pam = var.pam,
sacct_limits = var.sacct_limits
sacct_limits = var.sacct_limits,
use_compute_agent=var.use_compute_agent
})
filename = "${local.bastion_path}/inventory"
}
Expand Down
26 changes: 25 additions & 1 deletion autoscaling/tf_init/cluster-network-configuration.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,32 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configurati
user_data = base64encode(data.template_file.config.rendered)
}
agent_config {
is_management_disabled = true

are_all_plugins_disabled = false
is_management_disabled = true
is_monitoring_disabled = false

plugins_config {
desired_state = "DISABLED"
name = "OS Management Service Agent"
}
dynamic plugins_config {

for_each = var.use_compute_agent ? ["ENABLED"] : ["DISABLED"]
content {
name = "Compute HPC RDMA Authentication"
desired_state = plugins_config.value
}
}
dynamic plugins_config {
for_each = var.use_compute_agent ? ["ENABLED"] : ["DISABLED"]
content {
name = "Compute HPC RDMA Auto-Configuration"
desired_state = plugins_config.value
}

}
}
shape = var.cluster_network_shape
source_details {
source_type = "image"
Expand Down
5 changes: 4 additions & 1 deletion autoscaling/tf_init/inventory.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,10 @@ pyxis = ${pyxis}
enroot = ${enroot}
spack = ${spack}
bastion_block = ${bastion_block}
login_block = ${login_block}
scratch_nfs_type = ${scratch_nfs_type}
bastion_mount_ip = ${bastion_mount_ip}
login_mount_ip = ${login_mount_ip}
cluster_mount_ip = ${cluster_mount_ip}
autoscaling = true
force = no
Expand All @@ -68,4 +70,5 @@ latency_check=${latency_check}
compute_username=${compute_username}
bastion_username=${bastion_username}
pam = ${pam}
sacct_limits=${sacct_limits}
sacct_limits=${sacct_limits}
use_compute_agent=${use_compute_agent}
4 changes: 2 additions & 2 deletions autoscaling/tf_init/marketplace.tf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
locals {
mp_listing_id = var.use_marketplace_image ? var.use_old_marketplace_image ? var.old_marketplace_listing_id : substr(var.marketplace_listing,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : ""
mp_version_id = var.use_old_marketplace_image ? var.marketplace_version_id[split(".", var.marketplace_listing)[0]] : var.marketplace_version_id[var.marketplace_listing]
mp_listing_id = var.use_marketplace_image ? substr(var.marketplace_listing,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : ""
mp_version_id = var.marketplace_version_id[var.marketplace_listing]
}

/*
Expand Down
46 changes: 30 additions & 16 deletions bastion.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ resource "oci_core_volume_attachment" "bastion_volume_attachment" {
instance_id = oci_core_instance.bastion.id
display_name = "${local.cluster_name}-bastion-volume-attachment"
device = "/dev/oracleoci/oraclevdb"
is_shareable = true
}

resource "oci_core_volume_backup_policy" "bastion_boot_volume_backup_policy" {
Expand Down Expand Up @@ -200,7 +201,7 @@ resource "null_resource" "bastion" {
}

provisioner "file" {
content = tls_private_key.ssh.private_key_pem
content = tls_private_key.ssh.private_key_openssh
destination = "/home/${var.bastion_username}/.ssh/cluster.key"
connection {
host = local.host
Expand All @@ -209,6 +210,17 @@ resource "null_resource" "bastion" {
private_key = tls_private_key.ssh.private_key_pem
}
}

provisioner "file" {
content = tls_private_key.ssh.public_key_openssh
destination = "/home/${var.bastion_username}/.ssh/id_rsa.pub"
connection {
host = local.host
type = "ssh"
user = var.bastion_username
private_key = tls_private_key.ssh.private_key_pem
}
}
}
resource "null_resource" "cluster" {
depends_on = [null_resource.bastion, null_resource.backup, oci_core_compute_cluster.compute_cluster, oci_core_cluster_network.cluster_network, oci_core_instance.bastion, oci_core_volume_attachment.bastion_volume_attachment ]
Expand Down Expand Up @@ -246,14 +258,17 @@ resource "null_resource" "cluster" {
log_vol = var.log_vol,
redundancy = var.redundancy,
cluster_network = var.cluster_network,
use_compute_agent = var.use_compute_agent,
slurm = var.slurm,
rack_aware = var.rack_aware,
slurm_nfs_path = var.slurm_nfs ? var.nfs_source_path : var.cluster_nfs_path
spack = var.spack,
ldap = var.ldap,
bastion_block = var.bastion_block,
login_block = var.login_block,
scratch_nfs_type = local.scratch_nfs_type,
bastion_mount_ip = local.bastion_mount_ip,
login_mount_ip = local.login_mount_ip,
cluster_mount_ip = local.mount_ip,
autoscaling = var.autoscaling,
cluster_name = local.cluster_name,
Expand Down Expand Up @@ -324,11 +339,11 @@ resource "null_resource" "cluster" {
provisioner "file" {
content = templatefile("${path.module}/queues.conf", {
cluster_network = var.cluster_network,
use_compute_agent = var.use_compute_agent,
compute_cluster = var.compute_cluster,
marketplace_listing = var.use_old_marketplace_image ? var.old_marketplace_listing : var.marketplace_listing,
marketplace_listing = var.marketplace_listing,
image = local.image_ocid,
use_marketplace_image = var.use_marketplace_image,
use_old_marketplace_image = var.use_old_marketplace_image,
boot_volume_size = var.boot_volume_size,
shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape,
region = var.region,
Expand Down Expand Up @@ -376,8 +391,10 @@ resource "null_resource" "cluster" {
spack = var.spack,
ldap = var.ldap,
bastion_block = var.bastion_block,
login_block = var.login_block,
scratch_nfs_type = local.scratch_nfs_type,
bastion_mount_ip = local.bastion_mount_ip,
login_mount_ip = local.login_mount_ip,
cluster_mount_ip = local.mount_ip,
scratch_nfs_type_cluster = var.scratch_nfs_type_cluster,
scratch_nfs_type_pool = var.scratch_nfs_type_pool,
Expand All @@ -390,8 +407,6 @@ resource "null_resource" "cluster" {
ssh_cidr = var.ssh_cidr,
use_cluster_nfs = var.use_cluster_nfs,
cluster_nfs_path = var.cluster_nfs_path,
bastion_block = var.bastion_block,
bastion_mount_ip = local.bastion_mount_ip,
home_nfs = var.home_nfs,
create_fss = var.create_fss,
home_fss = var.home_fss,
Expand All @@ -417,7 +432,8 @@ resource "null_resource" "cluster" {
bastion_username = var.bastion_username,
compute_username = var.compute_username,
pam = var.pam,
sacct_limits = var.sacct_limits
sacct_limits = var.sacct_limits,
use_compute_agent = var.use_compute_agent
})

destination = "/opt/oci-hpc/conf/variables.tf"
Expand Down Expand Up @@ -467,6 +483,7 @@ provisioner "file" {
"chmod a+x /opt/oci-hpc/bin/*.sh",
"timeout --foreground 60m /opt/oci-hpc/bin/bastion.sh",
"chmod 755 /opt/oci-hpc/autoscaling/crontab/*.sh",
"chmod 755 /opt/oci-hpc/samples/*.sh",
"chmod 600 /opt/oci-hpc/autoscaling/credentials/key.pem",
"echo ${var.configure} > /tmp/configure.conf",
"timeout 2h /opt/oci-hpc/bin/configure.sh | tee /opt/oci-hpc/logs/initial_configure.log",
Expand All @@ -487,30 +504,27 @@ data "oci_objectstorage_namespace" "compartment_namespace" {
}

locals {
rdma_nic_metric_bucket_name = "RDMA_NIC_metrics"
current_timestamp = timestamp()
current_timestamp_formatted = formatdate("YYYYMMDDhhmmss", local.current_timestamp)
rdma_nic_metric_bucket_name = format("%s_%s","RDMA_NIC_metrics",local.current_timestamp_formatted)
par_path = ".."
}
/*
saving the PAR into file: ../PAR_file_for_metrics.
this PAR is used by the scripts to upload NIC metrics to object storage (i.e. script: upload_rdma_nic_metrics.sh)
*/

data "oci_objectstorage_bucket" "RDMA_NIC_Metrics_bucket_check" {
name = local.rdma_nic_metric_bucket_name
namespace = data.oci_objectstorage_namespace.compartment_namespace.namespace
}


resource "oci_objectstorage_bucket" "RDMA_NIC_metrics_bucket" {
count = (var.bastion_object_storage_par && data.oci_objectstorage_bucket.RDMA_NIC_Metrics_bucket_check.bucket_id == null) ? 1 : 0
count = (var.bastion_object_storage_par) ? 1 : 0
compartment_id = var.targetCompartment
name = local.rdma_nic_metric_bucket_name
namespace = data.oci_objectstorage_namespace.compartment_namespace.namespace
versioning = "Enabled"
}

resource "oci_objectstorage_preauthrequest" "RDMA_NIC_metrics_par" {
count = (var.bastion_object_storage_par && data.oci_objectstorage_bucket.RDMA_NIC_Metrics_bucket_check.bucket_id == null) ? 1 : 0
count = (var.bastion_object_storage_par) ? 1 : 0
depends_on = [oci_objectstorage_bucket.RDMA_NIC_metrics_bucket]
access_type = "AnyObjectWrite"
bucket = local.rdma_nic_metric_bucket_name
Expand All @@ -522,12 +536,12 @@ resource "oci_objectstorage_preauthrequest" "RDMA_NIC_metrics_par" {

output "RDMA_NIC_metrics_url" {
depends_on = [oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par]
value = (var.bastion_object_storage_par && data.oci_objectstorage_bucket.RDMA_NIC_Metrics_bucket_check.bucket_id == null) ? "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}" : ""
value = (var.bastion_object_storage_par) ? "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}" : ""
}


resource "local_file" "PAR" {
count = (var.bastion_object_storage_par && data.oci_objectstorage_bucket.RDMA_NIC_Metrics_bucket_check.bucket_id == null) ? 1 : 0
count = (var.bastion_object_storage_par) ? 1 : 0
depends_on = [oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par]
content = "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}"
filename = "${local.par_path}/PAR_file_for_metrics"
Expand Down
64 changes: 46 additions & 18 deletions bin/bastion.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,15 @@ if [ $ID == "ol" ] || [ $ID == "centos" ] ; then
sudo mkdir /etc/ansible
sudo ln -s /usr/local/bin/ansible-playbook /bin/ansible-playbook
sudo ln -s /usr/local/bin/ansible /bin/ansible
sudo python3 -m pip install -U pip
sudo python3 -m pip install netaddr --upgrade
sudo python3 -m pip install setuptools_rust --upgrade
sudo python3 -m pip install requests --upgrade
sudo python3 -m pip install urllib3 --upgrade
fi
sudo yum-config-manager --add-repo https://rpm.releases.hashicorp.com/RHEL/hashicorp.repo
sudo yum install -y terraform
sudo python3 -m pip install oci-cli --upgrade
sudo python3 -m pip install -U pip
sudo python3 -m pip install netaddr --upgrade
sudo python3 -m pip install setuptools_rust --upgrade
sudo python3 -m pip install requests --upgrade
sudo python3 -m pip install urllib3 --upgrade
sudo python3 -m pip install oci-cli --upgrade


elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then
Expand Down Expand Up @@ -91,24 +91,41 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then
sudo apt -y --fix-broken install

fix_apt


sudo add-apt-repository --yes --update ppa:ansible/ansible
sudo apt-get -y install ansible
output=$?
if [ $output -ne 0 ]
then
fix_apt
sleep 60s
sudo apt-get -y install ansible
fi
fix_apt
sudo apt-get -y install python python-netaddr python3 python3-pip

if [ $ID == "ubuntu" ] && [ $VERSION_ID == "22.04" ] ; then
sudo sed -i 's/#$nrconf{restart} = '"'"'i'"'"';/$nrconf{restart} = '"'"'a'"'"';/g' /etc/needrestart/needrestart.conf
sudo apt-get -y install python3 python3-netaddr python3-pip
sudo ln -s /usr/bin/python3 /usr/bin/python
else
sudo apt-get -y install python python-netaddr python3 python3-pip
fi
output=$?
if [ $output -ne 0 ]
then
fix_apt
sudo apt-get -y install python python-netaddr python3 python3-pip
if [ $ID == "ubuntu" ] && [ $VERSION_ID == "22.04" ] ; then
sudo apt-get -y install python3 python3-netaddr python3-pip

else
sudo apt-get -y install python python-netaddr python3 python3-pip
fi
fi
fix_apt

sudo python3 -m pip install -U pip
sudo python3 -m pip install netaddr --upgrade
sudo python3 -m pip install requests --upgrade
sudo python3 -m pip install urllib3 --upgrade
pip install pip --upgrade
pip install pyopenssl --upgrade

Expand Down Expand Up @@ -153,11 +170,22 @@ ansible-galaxy collection install community.crypto --force > /dev/null
threads=$(nproc)
forks=$(($threads * 8))

sudo sed -i "s/^#forks.*/forks = ${forks}/" /etc/ansible/ansible.cfg
sudo sed -i "s/^#fact_caching=.*/fact_caching=jsonfile/" /etc/ansible/ansible.cfg
sudo sed -i "s/^#fact_caching_connection.*/fact_caching_connection=\/tmp\/ansible/" /etc/ansible/ansible.cfg
sudo sed -i "s/^#bin_ansible_callbacks.*/bin_ansible_callbacks=True/" /etc/ansible/ansible.cfg
sudo sed -i "s/^#stdout_callback.*/stdout_callback=yaml/" /etc/ansible/ansible.cfg
sudo sed -i "s/^#retries.*/retries=5/" /etc/ansible/ansible.cfg
sudo sed -i "s/^#connect_timeout.*/connect_timeout=300/" /etc/ansible/ansible.cfg
sudo sed -i "s/^#command_timeout.*/command_timeout=120/" /etc/ansible/ansible.cfg
if [ ! -d /etc/ansible ] ; then
sudo mkdir /etc/ansible
if [ $ID == "ubuntu" ] ; then
sudo chown ubuntu:ubuntu /etc/ansible
else
sudo chown opc:opc /etc/ansible
fi
fi

ansible-config init --disabled -t all | sudo tee /etc/ansible/ansible.cfg
sudo sed -i "s/^\(#\|;\)forks.*/forks = ${forks}/" /etc/ansible/ansible.cfg
sudo sed -i "s/^\(#\|;\)fact_caching=.*/fact_caching=jsonfile/" /etc/ansible/ansible.cfg
sudo sed -i "0,/^\(#\|;\)fact_caching_connection.*/s//fact_caching_connection=\/tmp\/ansible/" /etc/ansible/ansible.cfg
sudo sed -i "s/^\(#\|;\)bin_ansible_callbacks.*/bin_ansible_callbacks=True/" /etc/ansible/ansible.cfg
sudo sed -i "s/^\(#\|;\)stdout_callback.*/stdout_callback=yaml/" /etc/ansible/ansible.cfg
sudo sed -i "s/^\(#\|;\)retries.*/retries=5/" /etc/ansible/ansible.cfg
sudo sed -i "s/^\(#\|;\)connect_timeout.*/connect_timeout=300/" /etc/ansible/ansible.cfg
sudo sed -i "s/^\(#\|;\)command_timeout.*/command_timeout=120/" /etc/ansible/ansible.cfg

3 changes: 1 addition & 2 deletions bin/create_cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ targetCompartment=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_typ
ADNames=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.ad " $queues_conf`
boot_volume_size=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.boot_volume_size " $queues_conf`
use_marketplace_image=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.use_marketplace_image " $queues_conf`
use_old_marketplace_image=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.use_old_marketplace_image " $queues_conf`
image=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.image " $queues_conf`
instance_pool_ocpus=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.instance_pool_ocpus " $queues_conf`
instance_pool_memory=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.instance_pool_memory " $queues_conf`
Expand All @@ -68,7 +67,7 @@ do

echo $1 $3 $4 >> currently_building
echo $3 $4 > cluster_options
sed "s~##NODES##~$1~g;s~##NAME##~$2~g;s~##SHAPE##~$shape~g;s~##CN##~$cluster_network~g;s~##QUEUE##~${4}~g;s~##COMP##~${targetCompartment}~g;s~##AD##~${ADName}~g;s~##BOOT##~${boot_volume_size}~g;s~##USEMP##~${use_marketplace_image}~g;s~##USEOLDMP##~${use_old_marketplace_image}~g;s~##IMAGE##~${image}~g;s~##OCPU##~${instance_pool_ocpus}~g;s~##MEM##~${instance_pool_memory}~g;s~##CUSTOM_MEM##~${instance_pool_custom_memory}~g;s~##MP_LIST##~${marketplace_listing}~g;s~##HT##~${hyperthreading}~g;s~##INST_TYPE##~$3~g;s~##TAGS##~$tags~g;s~##REGION##~${region}~g;s~##PRIVATE_SUBNET_ID##~${private_subnet_id}~g;s~##PRIVATE_SUBNET##~${private_subnet}~g;s~##CC##~$compute_cluster~g" $conf_folder/variables.tf > variables.tf
sed "s~##NODES##~$1~g;s~##NAME##~$2~g;s~##SHAPE##~$shape~g;s~##CN##~$cluster_network~g;s~##QUEUE##~${4}~g;s~##COMP##~${targetCompartment}~g;s~##AD##~${ADName}~g;s~##BOOT##~${boot_volume_size}~g;s~##USEMP##~${use_marketplace_image}~g;s~##IMAGE##~${image}~g;s~##OCPU##~${instance_pool_ocpus}~g;s~##MEM##~${instance_pool_memory}~g;s~##CUSTOM_MEM##~${instance_pool_custom_memory}~g;s~##MP_LIST##~${marketplace_listing}~g;s~##HT##~${hyperthreading}~g;s~##INST_TYPE##~$3~g;s~##TAGS##~$tags~g;s~##REGION##~${region}~g;s~##PRIVATE_SUBNET_ID##~${private_subnet_id}~g;s~##PRIVATE_SUBNET##~${private_subnet}~g;s~##CC##~$compute_cluster~g" $conf_folder/variables.tf > variables.tf

echo "Started to build $2"
start=`date -u +%s`
Expand Down
Loading

0 comments on commit 7f6f274

Please sign in to comment.