From c9806de923a4fa6645c8069cfebfe4b4036ba9d6 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 26 Sep 2023 15:58:17 -0600 Subject: [PATCH 01/68] Change Bastion image selection --- autoscaling/tf_init/marketplace.tf | 4 +- bastion.tf | 3 +- bin/create_cluster.sh | 3 +- conf/queues.conf.example | 4 - conf/variables.tpl | 7 +- locals.tf | 4 +- marketplace.tf | 24 ++-- queues.conf | 2 - schema.yaml | 190 ++++++++--------------------- slurm_ha.tf | 1 - variables.tf | 30 ++--- 11 files changed, 80 insertions(+), 192 deletions(-) diff --git a/autoscaling/tf_init/marketplace.tf b/autoscaling/tf_init/marketplace.tf index 721a1c89..69d46f93 100755 --- a/autoscaling/tf_init/marketplace.tf +++ b/autoscaling/tf_init/marketplace.tf @@ -1,6 +1,6 @@ locals { - mp_listing_id = var.use_marketplace_image ? var.use_old_marketplace_image ? var.old_marketplace_listing_id : substr(var.marketplace_listing,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" - mp_version_id = var.use_old_marketplace_image ? var.marketplace_version_id[split(".", var.marketplace_listing)[0]] : var.marketplace_version_id[var.marketplace_listing] + mp_listing_id = var.use_marketplace_image ? substr(var.marketplace_listing,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" + mp_version_id = var.marketplace_version_id[var.marketplace_listing] } /* diff --git a/bastion.tf b/bastion.tf index fbfed806..e4313779 100644 --- a/bastion.tf +++ b/bastion.tf @@ -325,10 +325,9 @@ resource "null_resource" "cluster" { content = templatefile("${path.module}/queues.conf", { cluster_network = var.cluster_network, compute_cluster = var.compute_cluster, - marketplace_listing = var.use_old_marketplace_image ? 
var.old_marketplace_listing : var.marketplace_listing, + marketplace_listing = var.marketplace_listing, image = local.image_ocid, use_marketplace_image = var.use_marketplace_image, - use_old_marketplace_image = var.use_old_marketplace_image, boot_volume_size = var.boot_volume_size, shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape, region = var.region, diff --git a/bin/create_cluster.sh b/bin/create_cluster.sh index 2529a124..df2bb3d5 100755 --- a/bin/create_cluster.sh +++ b/bin/create_cluster.sh @@ -42,7 +42,6 @@ targetCompartment=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_typ ADNames=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.ad " $queues_conf` boot_volume_size=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.boot_volume_size " $queues_conf` use_marketplace_image=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.use_marketplace_image " $queues_conf` -use_old_marketplace_image=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.use_old_marketplace_image " $queues_conf` image=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.image " $queues_conf` instance_pool_ocpus=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.instance_pool_ocpus " $queues_conf` instance_pool_memory=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.instance_pool_memory " $queues_conf` @@ -68,7 +67,7 @@ do echo $1 $3 $4 >> currently_building echo $3 $4 > cluster_options - sed 
"s~##NODES##~$1~g;s~##NAME##~$2~g;s~##SHAPE##~$shape~g;s~##CN##~$cluster_network~g;s~##QUEUE##~${4}~g;s~##COMP##~${targetCompartment}~g;s~##AD##~${ADName}~g;s~##BOOT##~${boot_volume_size}~g;s~##USEMP##~${use_marketplace_image}~g;s~##USEOLDMP##~${use_old_marketplace_image}~g;s~##IMAGE##~${image}~g;s~##OCPU##~${instance_pool_ocpus}~g;s~##MEM##~${instance_pool_memory}~g;s~##CUSTOM_MEM##~${instance_pool_custom_memory}~g;s~##MP_LIST##~${marketplace_listing}~g;s~##HT##~${hyperthreading}~g;s~##INST_TYPE##~$3~g;s~##TAGS##~$tags~g;s~##REGION##~${region}~g;s~##PRIVATE_SUBNET_ID##~${private_subnet_id}~g;s~##PRIVATE_SUBNET##~${private_subnet}~g;s~##CC##~$compute_cluster~g" $conf_folder/variables.tf > variables.tf + sed "s~##NODES##~$1~g;s~##NAME##~$2~g;s~##SHAPE##~$shape~g;s~##CN##~$cluster_network~g;s~##QUEUE##~${4}~g;s~##COMP##~${targetCompartment}~g;s~##AD##~${ADName}~g;s~##BOOT##~${boot_volume_size}~g;s~##USEMP##~${use_marketplace_image}~g;s~##IMAGE##~${image}~g;s~##OCPU##~${instance_pool_ocpus}~g;s~##MEM##~${instance_pool_memory}~g;s~##CUSTOM_MEM##~${instance_pool_custom_memory}~g;s~##MP_LIST##~${marketplace_listing}~g;s~##HT##~${hyperthreading}~g;s~##INST_TYPE##~$3~g;s~##TAGS##~$tags~g;s~##REGION##~${region}~g;s~##PRIVATE_SUBNET_ID##~${private_subnet_id}~g;s~##PRIVATE_SUBNET##~${private_subnet}~g;s~##CC##~$compute_cluster~g" $conf_folder/variables.tf > variables.tf echo "Started to build $2" start=`date -u +%s` diff --git a/conf/queues.conf.example b/conf/queues.conf.example index d75dc088..fe7d2aa0 100644 --- a/conf/queues.conf.example +++ b/conf/queues.conf.example @@ -21,7 +21,6 @@ targetCompartment: ocid1.compartment.oc1.. #TO EDIT boot_volume_size: 50 use_marketplace_image: true - use_old_marketplace_image: false instance_pool_ocpus: 2 instance_pool_memory: 16 instance_pool_custom_memory: false @@ -45,7 +44,6 @@ targetCompartment: ocid1.compartment.oc1.. 
#TO EDIT boot_volume_size: 50 use_marketplace_image: true - use_old_marketplace_image: false instance_pool_ocpus: 2 instance_pool_memory: 16 instance_pool_custom_memory: false @@ -71,7 +69,6 @@ targetCompartment: ocid1.compartment.oc1.. #TO EDIT boot_volume_size: 50 use_marketplace_image: false - use_old_marketplace_image: false instance_pool_ocpus: 2 instance_pool_memory: 16 instance_pool_custom_memory: false @@ -94,7 +91,6 @@ targetCompartment: ocid1.compartment.oc1.. #TO EDIT boot_volume_size: 50 use_marketplace_image: false - use_old_marketplace_image: false instance_pool_ocpus: 2 instance_pool_memory: 2 instance_pool_custom_memory: true diff --git a/conf/variables.tpl b/conf/variables.tpl index 71ffd5cb..29d47f15 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -15,7 +15,6 @@ variable "instance_type" {default = "##INST_TYPE##" } variable "node_count" { default="##NODES##" } variable "boot_volume_size" {default = "##BOOT##"} variable "use_marketplace_image" { default = "##USEMP##" } -variable "use_old_marketplace_image" { default = "##USEOLDMP##" } variable "scratch_nfs_path" { default = "${scratch_nfs_path}" } variable "use_scratch_nfs" { default = ${use_scratch_nfs} } variable "cluster_nfs_path" {default = "${cluster_nfs_path}"} @@ -55,10 +54,8 @@ variable "marketplace_version_id" { "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" "HPC_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-2023.05.18-0" "HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.6.8.1-2023.05.18-0" - "HPC_OL7_old" = "OL7.9-RHCK-3.10.0-OFED-5.4-3.4.0-1" - "HPC_OL8_old" = "OracleLinux-8-RHCK-OFED-5.4-3.5.8.0-2022.11.15-0" - "GPU_old" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.4.0.0-GPU-510-2022.09.23-1" - "GPU" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.05.18-0" + "GPU_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.05.18-0" + "GPU_OL8" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.05.18-0" } } diff --git a/locals.tf b/locals.tf index 
38bac32c..95db99fd 100755 --- a/locals.tf +++ b/locals.tf @@ -28,9 +28,9 @@ locals { cluster_name = var.use_custom_name ? var.cluster_name : random_pet.name.id - bastion_image = var.use_standard_image ? oci_core_app_catalog_subscription.bastion_mp_image_subscription[0].listing_resource_id : local.custom_bastion_image_ocid + bastion_image = var.use_marketplace_image_bastion ? oci_core_app_catalog_subscription.bastion_mp_image_subscription[0].listing_resource_id : local.custom_bastion_image_ocid - login_image = var.login_node && ( var.use_standard_image_login || var.use_marketplace_image_login ) ? oci_core_app_catalog_subscription.login_mp_image_subscription[0].listing_resource_id : local.custom_login_image_ocid + login_image = var.login_node && var.use_marketplace_image_login ? oci_core_app_catalog_subscription.login_mp_image_subscription[0].listing_resource_id : local.custom_login_image_ocid cluster_network_image = var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid diff --git a/marketplace.tf b/marketplace.tf index 5917390b..c434af50 100755 --- a/marketplace.tf +++ b/marketplace.tf @@ -1,11 +1,11 @@ locals { // listing_number = split(".", var.marketplace_listing)[0] - mp_listing_id = var.use_marketplace_image ? var.use_old_marketplace_image ? var.old_marketplace_listing_id : substr(var.marketplace_listing,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" - mp_bastion_listing_id = var.use_standard_image ? var.use_old_marketplace_image ? var.old_marketplace_listing_id :var.marketplace_listing_id_HPC : "" - mp_login_listing_id = var.use_marketplace_image_login ? var.use_old_marketplace_image_login ? var.old_marketplace_listing_id : substr(var.marketplace_listing_login,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" - mp_version_id = var.use_old_marketplace_image ? 
var.marketplace_version_id[split(".", var.old_marketplace_listing)[0]] : var.marketplace_version_id[var.marketplace_listing] - mp_bastion_version_id = var.use_old_marketplace_image ? var.marketplace_version_id[split(".", var.old_marketplace_listing)[0]] : var.marketplace_version_id["HPC_OL7"] - mp_login_version_id = var.use_old_marketplace_image_login ? var.marketplace_version_id[split(".", var.old_marketplace_listing_login)[0]] : var.marketplace_version_id[var.marketplace_listing_login] + mp_listing_id = var.use_marketplace_image ? substr(var.marketplace_listing,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" + mp_bastion_listing_id = var.use_marketplace_image_bastion ? substr(var.marketplace_listing_bastion,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" + mp_login_listing_id = var.use_marketplace_image_login ? substr(var.marketplace_listing_login,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" + mp_version_id = var.marketplace_version_id[var.marketplace_listing] + mp_bastion_version_id = var.marketplace_version_id[var.marketplace_listing_bastion] + mp_login_version_id = var.marketplace_version_id[var.marketplace_listing_login] } /* @@ -49,12 +49,12 @@ resource "oci_core_app_catalog_subscription" "mp_image_subscription" { } data "oci_core_app_catalog_listing_resource_versions" "bastion_app_catalog_listing_resource_versions" { - count = var.use_standard_image ? 1 : 0 + count = var.use_marketplace_image_bastion ? 1 : 0 listing_id = local.mp_bastion_listing_id } resource "oci_core_app_catalog_listing_resource_version_agreement" "bastion_mp_image_agreement" { - count = var.use_standard_image ? 1 : 0 + count = ( var.use_marketplace_image_bastion ) ? 
1 : 0 listing_id = local.mp_bastion_listing_id listing_resource_version = local.mp_bastion_version_id @@ -62,7 +62,7 @@ resource "oci_core_app_catalog_listing_resource_version_agreement" "bastion_mp_i } resource "oci_core_app_catalog_subscription" "bastion_mp_image_subscription" { - count = var.use_standard_image ? 1 : 0 + count = ( var.use_marketplace_image_bastion ) ? 1 : 0 compartment_id = var.targetCompartment eula_link = oci_core_app_catalog_listing_resource_version_agreement.bastion_mp_image_agreement[0].eula_link listing_id = oci_core_app_catalog_listing_resource_version_agreement.bastion_mp_image_agreement[0].listing_id @@ -77,12 +77,12 @@ resource "oci_core_app_catalog_subscription" "bastion_mp_image_subscription" { } data "oci_core_app_catalog_listing_resource_versions" "login_app_catalog_listing_resource_versions" { - count = var.login_node && ( var.use_marketplace_image_login || var.use_standard_image_login ) ? 1 : 0 + count = var.login_node && var.use_marketplace_image_login ? 1 : 0 listing_id = local.mp_login_listing_id } resource "oci_core_app_catalog_listing_resource_version_agreement" "login_mp_image_agreement" { - count = var.login_node && ( var.use_marketplace_image_login || var.use_standard_image_login ) ? 1 : 0 + count = var.login_node && var.use_marketplace_image_login ? 1 : 0 listing_id = local.mp_login_listing_id listing_resource_version = local.mp_login_version_id @@ -90,7 +90,7 @@ resource "oci_core_app_catalog_listing_resource_version_agreement" "login_mp_ima } resource "oci_core_app_catalog_subscription" "login_mp_image_subscription" { - count = var.login_node && ( var.use_marketplace_image_login || var.use_standard_image_login ) ? 1 : 0 + count = var.login_node && var.use_marketplace_image_login ? 
1 : 0 compartment_id = var.targetCompartment eula_link = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].eula_link listing_id = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].listing_id diff --git a/queues.conf b/queues.conf index 2c43b0a2..1f985922 100644 --- a/queues.conf +++ b/queues.conf @@ -21,7 +21,6 @@ targetCompartment: ${targetCompartment} boot_volume_size: ${boot_volume_size} use_marketplace_image: ${use_marketplace_image} - use_old_marketplace_image: ${use_old_marketplace_image} instance_pool_ocpus: ${instance_pool_ocpus} instance_pool_memory: ${instance_pool_memory} instance_pool_custom_memory: ${instance_pool_custom_memory} @@ -45,7 +44,6 @@ targetCompartment: ${targetCompartment} boot_volume_size: ${boot_volume_size} use_marketplace_image: ${use_marketplace_image} - use_old_marketplace_image: ${use_old_marketplace_image} instance_pool_ocpus: ${instance_pool_ocpus} instance_pool_memory: ${instance_pool_memory} instance_pool_custom_memory: ${instance_pool_custom_memory} diff --git a/schema.yaml b/schema.yaml index 42a9890d..e69ae82c 100755 --- a/schema.yaml +++ b/schema.yaml @@ -57,10 +57,8 @@ variableGroups: - ${hyperthreading} - ${boot_volume_size} - ${use_marketplace_image} - - ${use_old_marketplace_image} - ${compute_username} - ${marketplace_listing} - - ${old_marketplace_listing} - ${unsupported} - ${compute_image_compartment} - ${image} @@ -75,11 +73,8 @@ variableGroups: - ${login_custom_memory} - ${login_memory} - ${login_boot_volume_size} - - ${use_standard_image_login} - ${use_marketplace_image_login} - - ${use_old_marketplace_image_login} - ${marketplace_listing_login} - - ${old_marketplace_listing_login} - ${unsupported_login} - ${login_image_compartment} - ${custom_login_image} @@ -121,7 +116,8 @@ variableGroups: - ${fss_ad} - title: "Advanced bastion options" variables: - - ${use_standard_image} + - ${use_marketplace_image_bastion} + - ${marketplace_listing_bastion} - 
${unsupported_bastion} - ${bastion_image_compartment} - ${custom_bastion_image} @@ -184,7 +180,6 @@ variableGroups: - ${scratch_nfs_export} - ${scratch_nfs_mount} - ${marketplace_listing_id} - - ${old_marketplace_listing_id} - ${marketplace_listing_id_GPU} - ${marketplace_listing_id_HPC} - ${ssh_cidr} @@ -373,27 +368,36 @@ variables: to Object Storage and share the URL with OCI service teams." type: boolean default: true - use_standard_image: - type: boolean - title: "use standard bastion image" - description: > - "Use standard bastion image (Oracle Linux)" - default: true - visible: true unsupported_bastion: title: "Use unsupported image" description: "Custom image ID for Bastion" type: boolean default: false - visible: - or: - - not: - - ${use_standard_image} - - not: - - eq: - - ${compute_username} - - "opc" + visible: + not: + - ${use_marketplace_image_bastion} + + use_marketplace_image_bastion: + type: boolean + title: "use marketplace image" + description: "Use marketplace image, otherwise provide custom image OCID" + default: true + visible: true + + + marketplace_listing_bastion: + type: enum + title: "Image version" + description: "Marketplace listing to use" + required: true + enum: + - "HPC_OL7" + - "HPC_OL8" + - "GPU_OL7" + - "GPU_OL8" + default: "HPC_OL7" + visible: ${use_marketplace_image_bastion} bastion_username: title: "Default username for bastion" @@ -401,29 +405,14 @@ variables: type: string default: "opc" required: true - visible: - or: - - not: - - ${use_standard_image} - - not: - - eq: - - ${compute_username} - - "opc" + visible: true unsupported_bastion_image: title: "Image OCID" description: "Custom image ID for compute nodes. Please note that only Oracle Linux 7 and Ubuntu 20.04 are supported as bastion image at this moment." 
type: string required: true - visible: - and: - - or: - - not: - - ${use_standard_image} - - not: - - eq: - - ${compute_username} - - "opc" + visible: ${unsupported_bastion} default: "image.ocid" bastion_image_compartment: @@ -432,15 +421,10 @@ variables: default: ${targetCompartment} visible: and: - - or: - - not: - - ${use_standard_image} - - not: - - eq: - - ${compute_username} - - "opc" - not: - ${unsupported_bastion} + - not: + - ${use_marketplace_image_bastion} required: true custom_bastion_image: @@ -449,17 +433,12 @@ variables: type: oci:core:image:id dependsOn: compartmentId: ${bastion_image_compartment} - visible: + visible: and: - - or: - - not: - - ${use_standard_image} - - not: - - eq: - - ${compute_username} - - "opc" - not: - - ${unsupported_bastion} + - ${unsupported_bastion} + - not: + - ${use_marketplace_image_bastion} required: true bastion_boot_volume_size: @@ -735,15 +714,6 @@ variables: description: "Use marketplace image, otherwise provide custom image OCID" default: true - use_old_marketplace_image: - type: boolean - title: "use older marketplace images" - description: "Images prior to September 2021" - default: false - visible: - and: - - ${use_marketplace_image} - marketplace_listing: type: enum title: "Image version" @@ -752,29 +722,10 @@ variables: enum: - "HPC_OL7" - "HPC_OL8" - - "GPU" + - "GPU_OL7" + - "GPU_OL8" default: "HPC_OL7" - visible: - and: - - ${use_marketplace_image} - - not: - - ${use_old_marketplace_image} - - old_marketplace_listing: - type: enum - title: "Image version" - description: "Marketplace listing to use" - required: true - enum: - - "1. Oracle Linux 7.9 OFED 5.3-1.0.0.1 RHCK 20210607" - - "2. Oracle Linux 7.8 OFED 5.0-1.0.0.0 UEK 20200826" - - "3. Oracle Linux 7.7 OFED 4.4-2.0.7.0 UEK 20200229" - - "4. Oracle Linux 7.9 OFED 5.0-2.1.8.0 RHCK 20210709" - default: "4. 
Oracle Linux 7.9 OFED 5.0-2.1.8.0 RHCK 20210709" - visible: - and: - - ${use_marketplace_image} - - ${use_old_marketplace_image} + visible: ${use_marketplace_image} compute_image_compartment: title: "compute image compartment" @@ -1512,22 +1463,17 @@ variables: - and: - ${login_block} - ${login_node} - use_standard_image_login: - type: boolean - title: "use standard login image" - description: > - "Use standard login image (Oracle Linux)" - default: true - visible: ${login_node} unsupported_login: title: "Use unsupported image" description: "Custom image ID for Login Node" type: boolean default: false - visible: - not: - - ${use_standard_image_login} + visible: + and: + - ${login_node} + - not: + - ${use_marketplace_image_login} login_image_compartment: title: "login image compartment" @@ -1535,8 +1481,7 @@ variables: default: ${targetCompartment} visible: and: - - not: - - ${use_standard_image_login} + - ${login_node} - not: - ${unsupported_login} - not: @@ -1551,8 +1496,7 @@ variables: compartmentId: ${login_image_compartment} visible: and: - - not: - - ${use_standard_image_login} + - ${login_node} - not: - ${unsupported_login} - not: @@ -1566,8 +1510,6 @@ variables: visible: and: - ${unsupported_login} - - not: - - ${use_standard_image_login} - not: - ${use_marketplace_image_login} default: "image.ocid" @@ -1578,28 +1520,14 @@ variables: type: string default: "opc" required: true - visible: - not: - - ${use_standard_image_login} + visible: ${login_node} use_marketplace_image_login: type: boolean title: "use marketplace image" description: "Use marketplace image, otherwise provide custom image OCID" default: true - visible: - not: - - ${use_standard_image_login} - use_old_marketplace_image_login: - type: boolean - title: "use older marketplace images" - description: "Images prior to September 2021" - default: false - visible: - and: - - ${use_marketplace_image_login} - - not: - - ${use_standard_image_login} + visible: ${login_node} marketplace_listing_login: 
type: enum @@ -1609,30 +1537,10 @@ variables: enum: - "HPC_OL7" - "HPC_OL8" - - "GPU" + - "GPU_OL7" + - "GPU_OL8" default: "HPC_OL7" visible: and: - ${use_marketplace_image_login} - - not: - - ${use_old_marketplace_image_login} - - not: - - ${use_standard_image_login} - - old_marketplace_listing_login: - type: enum - title: "Image version" - description: "Marketplace listing to use" - required: true - enum: - - "1. Oracle Linux 7.9 OFED 5.3-1.0.0.1 RHCK 20210607" - - "2. Oracle Linux 7.8 OFED 5.0-1.0.0.0 UEK 20200826" - - "3. Oracle Linux 7.7 OFED 4.4-2.0.7.0 UEK 20200229" - - "4. Oracle Linux 7.9 OFED 5.0-2.1.8.0 RHCK 20210709" - default: "4. Oracle Linux 7.9 OFED 5.0-2.1.8.0 RHCK 20210709" - visible: - and: - - ${use_marketplace_image_login} - - ${use_old_marketplace_image_login} - - not: - - ${use_standard_image_login} + - ${login_node} diff --git a/slurm_ha.tf b/slurm_ha.tf index 896b5d28..0d90f75c 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -303,7 +303,6 @@ resource "null_resource" "cluster_backup" { marketplace_listing = var.marketplace_listing, image = local.image_ocid, use_marketplace_image = var.use_marketplace_image, - use_old_marketplace_image = var.use_old_marketplace_image, boot_volume_size = var.boot_volume_size, shape = var.cluster_network ? 
var.cluster_network_shape : var.instance_pool_shape, region = var.region, diff --git a/variables.tf b/variables.tf index b49f100d..aea2eaac 100755 --- a/variables.tf +++ b/variables.tf @@ -16,8 +16,6 @@ variable "cluster_name" { default = "" } variable "bastion_ad" {} variable "bastion_shape" { default = "VM.Standard2.4" } variable "bastion_object_storage_par" { default = true } -variable "use_standard_image" { default= true } -variable "use_standard_image_login" { default= true } variable "custom_bastion_image" { type = string default = "image.ocid" @@ -37,7 +35,6 @@ variable "instance_pool_shape" { default = "VM.Standard2.4" } variable "node_count" { default = 2 } variable "boot_volume_size" { default = 50 } variable "use_marketplace_image" { default = true} -variable "use_old_marketplace_image" { default = false} variable "image" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" } variable "image_ocid" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" } variable "unsupported_bastion_image" { default = "" } @@ -85,10 +82,7 @@ variable "privilege_group_name" { default = "privilege" } variable "marketplace_listing" { default = "HPC_OL7" -} -variable "old_marketplace_listing" { - default = "4. 
Oracle Linux 7.9 OFED 5.0-2.1.8.0 RHCK 20210709" -} +} variable "marketplace_version_id" { type = map(string) default = { @@ -98,18 +92,14 @@ variable "marketplace_version_id" { "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" "HPC_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-2023.05.18-0" "HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.6.8.1-2023.05.18-0" - "HPC_OL7_old" = "OL7.9-RHCK-3.10.0-OFED-5.4-3.4.0-1" - "HPC_OL8_old" = "OracleLinux-8-RHCK-OFED-5.4-3.5.8.0-2022.11.15-0" - "GPU_old" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.4.0.0-GPU-510-2022.09.23-1" - "GPU" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.05.18-0" + "GPU_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.05.18-0" + "GPU_OL8" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.05.18-0" } } # To find the Appcatalog OCID, run # oci compute pic listing list --display-name "Oracle Linux 7 - HPC Cluster Networking Image" -variable "old_marketplace_listing_id" { - default = "ocid1.appcataloglisting.oc1..aaaaaaaahzcnanlki5vonyaeoiajjisejikzczygqqwheifymjqx3ft4iowa" -} + variable "marketplace_listing_id_HPC" { default = "ocid1.appcataloglisting.oc1..aaaaaaaahz2xiwfcsbebmqg7sp6lhdt6r2vsjro5jfukkl5cntlqvfhkbzaq" } @@ -196,6 +186,10 @@ variable "unsupported_bastion" { type=bool default = false } +variable "use_marketplace_image_bastion" { + type=bool + default = true +} variable "unsupported_login" { type=bool default = false @@ -251,13 +245,11 @@ variable "log_vol" { default = false } variable "redundancy" { default = true } variable "use_marketplace_image_login" { default = true} -variable "use_old_marketplace_image_login" { default = false} variable "marketplace_listing_login" { default = "HPC_OL7" } - -variable "old_marketplace_listing_login" { - default = "4. 
Oracle Linux 7.9 OFED 5.0-2.1.8.0 RHCK 20210709" -} +variable "marketplace_listing_bastion" { + default = "HPC_OL7" +} \ No newline at end of file From 506e7b564734f35d1a91bc35527743737cb48525 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 26 Sep 2023 15:58:34 -0600 Subject: [PATCH 02/68] Change SSH Key algo for Ubuntu 22.04 support --- data.tf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/data.tf b/data.tf index 39195858..3325be77 100755 --- a/data.tf +++ b/data.tf @@ -3,8 +3,7 @@ resource "random_pet" "name" { } resource "tls_private_key" "ssh" { - algorithm = "RSA" - rsa_bits = "4096" + algorithm = "ED25519" } data "oci_core_services" "services" { From 78a449e0a2ab64018b479ab2bf30812324d5f87c Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 27 Sep 2023 12:47:30 -0600 Subject: [PATCH 03/68] Copy the generated public key rather than regen --- bastion.tf | 13 ++++++++++++- playbooks/roles/ssh/tasks/common.yml | 15 +++++++-------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/bastion.tf b/bastion.tf index e4313779..49b33ecf 100644 --- a/bastion.tf +++ b/bastion.tf @@ -200,7 +200,7 @@ resource "null_resource" "bastion" { } provisioner "file" { - content = tls_private_key.ssh.private_key_pem + content = tls_private_key.ssh.private_key_openssh destination = "/home/${var.bastion_username}/.ssh/cluster.key" connection { host = local.host @@ -209,6 +209,17 @@ resource "null_resource" "bastion" { private_key = tls_private_key.ssh.private_key_pem } } + + provisioner "file" { + content = tls_private_key.ssh.public_key_openssh + destination = "/home/${var.bastion_username}/.ssh/id_rsa.pub" + connection { + host = local.host + type = "ssh" + user = var.bastion_username + private_key = tls_private_key.ssh.private_key_pem + } + } } resource "null_resource" "cluster" { depends_on = [null_resource.bastion, null_resource.backup, oci_core_compute_cluster.compute_cluster, oci_core_cluster_network.cluster_network, 
oci_core_instance.bastion, oci_core_volume_attachment.bastion_volume_attachment ] diff --git a/playbooks/roles/ssh/tasks/common.yml b/playbooks/roles/ssh/tasks/common.yml index 496a8dc4..41872c6d 100644 --- a/playbooks/roles/ssh/tasks/common.yml +++ b/playbooks/roles/ssh/tasks/common.yml @@ -7,7 +7,7 @@ group: root mode: '0644' -- name: Install ssh keys on all nodes +- name: Install private ssh key on all nodes copy: dest: "/home/{{ ansible_user }}/.ssh/id_rsa" src: "/home/{{ bastion_username }}/.ssh/{{ item }}" @@ -17,13 +17,12 @@ with_items: - cluster.key -- name: Generate an OpenSSL public key in OpenSSH v2 format - community.crypto.openssl_publickey: - path: "/home/{{ ansible_user }}/.ssh/id_rsa.pub" - privatekey_path: "/home/{{ ansible_user }}/.ssh/id_rsa" - format: OpenSSH - state: present +- name: Install public ssh key on all nodes + copy: + dest: "/home/{{ ansible_user }}/.ssh/id_rsa.pub" + src: "/home/{{ bastion_username }}/.ssh/{{ item }}" owner: "{{ ansible_user }}" group: "{{ ansible_user }}" mode: '0644' - register: public_key + with_items: + - id_rsa.pub \ No newline at end of file From 46ef6a23f6bd3a3b85ff7c28c883d5af7586b6e4 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 27 Sep 2023 12:47:56 -0600 Subject: [PATCH 04/68] Ubuntu 22.04 changes --- bin/bastion.sh | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/bin/bastion.sh b/bin/bastion.sh index 0cfc7d31..73e38948 100644 --- a/bin/bastion.sh +++ b/bin/bastion.sh @@ -100,12 +100,24 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then sudo apt-get -y install ansible fi fix_apt - sudo apt-get -y install python python-netaddr python3 python3-pip + + if [ $ID == "ubuntu" ] && [ $VERSION_ID == "22.04" ] ; then + sudo sed -i 's/#$nrconf{restart} = '"'"'i'"'"';/$nrconf{restart} = '"'"'a'"'"';/g' /etc/needrestart/needrestart.conf + sudo apt-get -y install python3 python3-netaddr python3-pip + sudo ln -s /usr/bin/python3 
/usr/bin/python + else + sudo apt-get -y install python python-netaddr python3 python3-pip + fi output=$? if [ $output -ne 0 ] then fix_apt - sudo apt-get -y install python python-netaddr python3 python3-pip + if [ $ID == "ubuntu" ] && [ $VERSION_ID == "22.04" ] ; then + sudo apt-get -y install python3 python3-netaddr python3-pip + + else + sudo apt-get -y install python python-netaddr python3 python3-pip + fi fi fix_apt @@ -153,11 +165,21 @@ ansible-galaxy collection install community.crypto --force > /dev/null threads=$(nproc) forks=$(($threads * 8)) -sudo sed -i "s/^#forks.*/forks = ${forks}/" /etc/ansible/ansible.cfg -sudo sed -i "s/^#fact_caching=.*/fact_caching=jsonfile/" /etc/ansible/ansible.cfg -sudo sed -i "s/^#fact_caching_connection.*/fact_caching_connection=\/tmp\/ansible/" /etc/ansible/ansible.cfg -sudo sed -i "s/^#bin_ansible_callbacks.*/bin_ansible_callbacks=True/" /etc/ansible/ansible.cfg -sudo sed -i "s/^#stdout_callback.*/stdout_callback=yaml/" /etc/ansible/ansible.cfg -sudo sed -i "s/^#retries.*/retries=5/" /etc/ansible/ansible.cfg -sudo sed -i "s/^#connect_timeout.*/connect_timeout=300/" /etc/ansible/ansible.cfg -sudo sed -i "s/^#command_timeout.*/command_timeout=120/" /etc/ansible/ansible.cfg \ No newline at end of file +if [ ! 
-d /etc/ansible ] ; then + sudo mkdir /etc/ansible + if [ $ID == "ubuntu" ] ; then + sudo chown ubuntu:ubuntu /etc/ansible + else + sudo chown opc:opc /etc/ansible + fi + ansible-config init --disabled -t all > /etc/ansible/ansible.cfg +fi + +sudo sed -i "s/^\(#\|;\)forks.*/forks = ${forks}/" /etc/ansible/ansible.cfg +sudo sed -i "s/^\(#\|;\)fact_caching=.*/fact_caching=jsonfile/" /etc/ansible/ansible.cfg +sudo sed -i "s/^\(#\|;\)fact_caching_connection.*/fact_caching_connection=\/tmp\/ansible/" /etc/ansible/ansible.cfg +sudo sed -i "s/^\(#\|;\)bin_ansible_callbacks.*/bin_ansible_callbacks=True/" /etc/ansible/ansible.cfg +sudo sed -i "s/^\(#\|;\)stdout_callback.*/stdout_callback=yaml/" /etc/ansible/ansible.cfg +sudo sed -i "s/^\(#\|;\)retries.*/retries=5/" /etc/ansible/ansible.cfg +sudo sed -i "s/^\(#\|;\)connect_timeout.*/connect_timeout=300/" /etc/ansible/ansible.cfg +sudo sed -i "s/^\(#\|;\)command_timeout.*/command_timeout=120/" /etc/ansible/ansible.cfg \ No newline at end of file From ff9c3b4a19ab8c5d2e3e30fa37572646164839e1 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 27 Sep 2023 12:48:17 -0600 Subject: [PATCH 05/68] Update package names for Ubuntu 2204 --- playbooks/roles/packages/tasks/main.yml | 5 +++- .../roles/packages/tasks/ubuntu-2204.yml | 23 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 playbooks/roles/packages/tasks/ubuntu-2204.yml diff --git a/playbooks/roles/packages/tasks/main.yml b/playbooks/roles/packages/tasks/main.yml index cdc1f36a..24cc3ed3 100755 --- a/playbooks/roles/packages/tasks/main.yml +++ b/playbooks/roles/packages/tasks/main.yml @@ -8,7 +8,10 @@ when: ansible_os_family == 'RedHat' and ansible_distribution == 'CentOS' and ansible_distribution_major_version == '7' - include: ubuntu.yml - when: ansible_distribution == 'Ubuntu' + when: ansible_distribution == 'Ubuntu' and ansible_distribution_major_version < '22' + +- include: ubuntu-2204.yml + when: ansible_distribution == 
'Ubuntu' and ansible_distribution_major_version == '22' - include: debian.yml when: ansible_distribution == 'Debian' \ No newline at end of file diff --git a/playbooks/roles/packages/tasks/ubuntu-2204.yml b/playbooks/roles/packages/tasks/ubuntu-2204.yml new file mode 100644 index 00000000..a3b9541a --- /dev/null +++ b/playbooks/roles/packages/tasks/ubuntu-2204.yml @@ -0,0 +1,23 @@ +--- +- block: + - name: Automatically restart the services + become: true + replace: + path: /etc/needrestart/needrestart.conf + regexp: "#$nrconf{restart} = 'i';" + replace: "$nrconf{restart} = 'a';" + - name: Make sure python OpenSSL and parallel ssh is installed + vars: + package_name: + - python3-openssl + - python3-cryptography + - parted + - pssh + - pdsh + - python3-netaddr + - jq + - python3-pip + package_state: latest + include_role: + name: safe_yum + ignore_errors: true \ No newline at end of file From 101b95841a4fd26763eac3bc7ad3855273e377b8 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 27 Sep 2023 14:12:43 -0600 Subject: [PATCH 06/68] Update Mysql for Ubuntu 22.04 --- playbooks/roles/mysql/tasks/centos.yml | 2 ++ playbooks/roles/mysql/tasks/debian.yml | 31 +++++++++++++++---- playbooks/roles/mysql/tasks/el.yml | 2 ++ .../{defaults/main.yml => vars/ol_vars.yml} | 0 .../roles/mysql/vars/ubuntu-2204_vars.yml | 25 +++++++++++++++ playbooks/roles/mysql/vars/ubuntu_vars.yml | 25 +++++++++++++++ 6 files changed, 79 insertions(+), 6 deletions(-) rename playbooks/roles/mysql/{defaults/main.yml => vars/ol_vars.yml} (100%) create mode 100644 playbooks/roles/mysql/vars/ubuntu-2204_vars.yml create mode 100644 playbooks/roles/mysql/vars/ubuntu_vars.yml diff --git a/playbooks/roles/mysql/tasks/centos.yml b/playbooks/roles/mysql/tasks/centos.yml index 710c4a81..15f8dd30 100644 --- a/playbooks/roles/mysql/tasks/centos.yml +++ b/playbooks/roles/mysql/tasks/centos.yml @@ -1,6 +1,8 @@ --- # tasks for mariadb role +- include_vars: ol_vars.yml + - name: Create 
/etc/opt/oci-hpc/passwords/mysql become: true file: diff --git a/playbooks/roles/mysql/tasks/debian.yml b/playbooks/roles/mysql/tasks/debian.yml index 9d4b47be..a689d7c2 100644 --- a/playbooks/roles/mysql/tasks/debian.yml +++ b/playbooks/roles/mysql/tasks/debian.yml @@ -1,6 +1,12 @@ --- # tasks for mariadb role +- include_vars: ubuntu_vars.yml + when: ansible_distribution_major_version < "22" + +- include_vars: ubuntu-2204_vars.yml + when: ansible_distribution_major_version == "22" + - name: Create /etc/opt/oci-hpc/passwords/mysql become: true file: @@ -28,6 +34,7 @@ apt_repository: repo: deb http://archive.ubuntu.com/ubuntu bionic main state: present + when: ansible_distribution == 'Ubuntu' and ansible_distribution_major_version < "22" - name: Install MariaDB packages vars: @@ -83,12 +90,23 @@ - meta: flush_handlers # added to resolve error unable to connect to database,check login_user and login_password are correct or/root/.my.cnf has the credentials.Exception message: (1698, "Access denied for user ''root''@''localhost''")' - - name: Change the authentication plugin of MySQL root user to mysql_native_password - shell: mysql -u root -e 'UPDATE mysql.user SET plugin="mysql_native_password" WHERE user="root" AND host="localhost"' - - - name: Flush Privileges - shell: mysql -u root -e 'FLUSH PRIVILEGES' - + # - name: Change the authentication plugin of MySQL root user to mysql_native_password + # shell: mysql -u root -e "ALTER USER 'root'@'localhost' IDENTIFIED WITH mysql_native_password" + + #- name: Flush Privileges + #shell: mysql -u root -e 'FLUSH PRIVILEGES' + + - name: Set MariaDB's root password + become: true + mysql_user: + name: 'root' + password: '{{ mysql_root_pwd }}' + host_all: yes + check_implicit_admin: yes + login_unix_socket: /run/mysqld/mysqld.sock + state: present + when: ansible_distribution == 'Ubuntu' and ansible_distribution_major_version == "22" + - name: Set MariaDB's root password become: true mysql_user: @@ -97,6 +115,7 @@ host_all: yes 
check_implicit_admin: yes state: present + when: ansible_distribution == 'Ubuntu' and ansible_distribution_major_version < "22" - name: Render /root/.my.cnf become: true diff --git a/playbooks/roles/mysql/tasks/el.yml b/playbooks/roles/mysql/tasks/el.yml index e76fbd6e..32defe9c 100644 --- a/playbooks/roles/mysql/tasks/el.yml +++ b/playbooks/roles/mysql/tasks/el.yml @@ -1,6 +1,8 @@ --- # tasks for mysqld role +- include_vars: ol_vars.yml + - name: Create /etc/opt/oci-hpc/passwords/mysql become: true file: diff --git a/playbooks/roles/mysql/defaults/main.yml b/playbooks/roles/mysql/vars/ol_vars.yml similarity index 100% rename from playbooks/roles/mysql/defaults/main.yml rename to playbooks/roles/mysql/vars/ol_vars.yml diff --git a/playbooks/roles/mysql/vars/ubuntu-2204_vars.yml b/playbooks/roles/mysql/vars/ubuntu-2204_vars.yml new file mode 100644 index 00000000..bd163b8b --- /dev/null +++ b/playbooks/roles/mysql/vars/ubuntu-2204_vars.yml @@ -0,0 +1,25 @@ +--- +mariadb_packages: + - mariadb + - mariadb-server + - MySQL-python + +mysql_packages: + - mysql-community-server + - mysql-community-client + - MySQL-python + +mysql_packages_ol8: + - mysql-server + - mysql + - mysql-connector-python + +deb_mariadb_packages: + - mariadb-server + - mariadb-common + - python3-mysqldb + - python3-pymysql +# added above for mariadb + +mariadb_db_path: '/var/lib/mysql' +mysql_db_path: '/var/lib/mysql' diff --git a/playbooks/roles/mysql/vars/ubuntu_vars.yml b/playbooks/roles/mysql/vars/ubuntu_vars.yml new file mode 100644 index 00000000..af3189aa --- /dev/null +++ b/playbooks/roles/mysql/vars/ubuntu_vars.yml @@ -0,0 +1,25 @@ +--- +mariadb_packages: + - mariadb + - mariadb-server + - MySQL-python + +mysql_packages: + - mysql-community-server + - mysql-community-client + - MySQL-python + +mysql_packages_ol8: + - mysql-server + - mysql + - mysql-connector-python + +deb_mariadb_packages: + - mariadb-server + - mariadb-common + - python-mysqldb + - python3-pymysql +# added above for 
mariadb + +mariadb_db_path: '/var/lib/mysql' +mysql_db_path: '/var/lib/mysql' From cb96668bf8c75f14f092f9316cbb6aed90feff02 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 27 Sep 2023 15:25:41 -0600 Subject: [PATCH 07/68] Change Slurm version (Include cgroup v2 support) --- playbooks/roles/slurm/defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/slurm/defaults/main.yml b/playbooks/roles/slurm/defaults/main.yml index 8b0f3f40..a4983e9c 100755 --- a/playbooks/roles/slurm/defaults/main.yml +++ b/playbooks/roles/slurm/defaults/main.yml @@ -9,4 +9,4 @@ slurm_uid: 1501 munge_gid: 1500 munge_uid: 1500 rack_aware_playbook_suffix: "{% if rack_aware|bool %}-rack-aware{% endif%}" -slurm_version: "23.02.1-1" \ No newline at end of file +slurm_version: "23.02.5-1" \ No newline at end of file From d4905037a044be0f50fa8c44612a6927b2780e3e Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 28 Sep 2023 21:50:42 -0600 Subject: [PATCH 08/68] Fix limits in case of OpenBLAS workload --- playbooks/roles/limits/templates/limits.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/limits/templates/limits.j2 b/playbooks/roles/limits/templates/limits.j2 index 6c872c2a..862e70b2 100755 --- a/playbooks/roles/limits/templates/limits.j2 +++ b/playbooks/roles/limits/templates/limits.j2 @@ -18,8 +18,8 @@ #### {% if shape == "BM.GPU.B4.8" or shape == "BM.GPU4.8" or shape == "BM.GPU.A100-v2.8"%} -* soft nproc 10240 -* hard nproc 10240 +* soft nproc 40960 +* hard nproc 40960 * soft nofile 20480 * hard nofile 20480 {% else %} From 9c993d0a729c4769941e27da16a66ebaae54043e Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 28 Sep 2023 22:10:50 -0600 Subject: [PATCH 09/68] Anoop: Update ib_write_bw.sh --- scripts/ib_write_bw.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/ib_write_bw.sh b/scripts/ib_write_bw.sh index 4af85b6e..58138096 100644 --- 
a/scripts/ib_write_bw.sh +++ b/scripts/ib_write_bw.sh @@ -20,7 +20,7 @@ dis_help() echo echo "Logs are stored at /tmp/logs" echo - echo "e.g., sh ./ib_write_bw.sh -s compute-permanent-node-1 -n compute-permanent-node-2 -c y -g 2 + echo "e.g., sh ./ib_write_bw.sh -s compute-permanent-node-1 -n compute-permanent-node-2 -c y -g 2" echo echo "Supported shapes: BM.GPU.B4.8,BM.GPU.A100-v2.8,BM.GPU4.8" echo @@ -51,13 +51,6 @@ do esac done -#Set variables -cuda_path=`ssh $server /usr/sbin/alternatives --list|grep cuda | awk -F" " '{print $3}'|tail -1`/targets/x86_64-linux/include/cuda.h -server_ip=`grep $server /etc/hosts |grep -v rdma|awk '{print $1}'` -logdir=/tmp/logs/ib_bw/`date +%F-%H` -outdir=/tmp/ib_bw/ -gpu_count=`ssh $server nvidia-smi -L |wc -l` - #Check node shape shape=`ssh $server 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape'` @@ -76,6 +69,13 @@ else exit fi +#Set variables +cuda_path=`ssh $server /usr/sbin/alternatives --list|grep cuda | awk -F" " '{print $3}'|tail -1`/targets/x86_64-linux/include/cuda.h +server_ip=`grep $server /etc/hosts |grep -v rdma|awk '{print $1}'` +logdir=/tmp/logs/ib_bw/`date +%F-%H` +outdir=/tmp/ib_bw/ +gpu_count=`ssh $server nvidia-smi -L |wc -l` + #check cuda installation ssh -q $server [[ -f $cuda_path ]] && echo " " || echo "Please check cuda installation; exit 1"; From c751575abe8b0d18b409441d7bf4b65f9d042ce5 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 28 Sep 2023 22:11:16 -0600 Subject: [PATCH 10/68] Add a sleep to give time to ansible --- bin/bastion.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/bastion.sh b/bin/bastion.sh index 73e38948..e6f17cba 100644 --- a/bin/bastion.sh +++ b/bin/bastion.sh @@ -97,6 +97,7 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then if [ $output -ne 0 ] then fix_apt + sleep 60s sudo apt-get -y install ansible fi fix_apt From a9ce42550d14d30cbaf6c2b3b40093cddd6b3a46 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: 
Thu, 28 Sep 2023 22:11:32 -0600 Subject: [PATCH 11/68] Anoop: Fix Openldap for 22.04 --- playbooks/roles/openldap/defaults/main.yml | 8 +++++++- playbooks/roles/openldap/vars/debian_vars.yml | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/openldap/defaults/main.yml b/playbooks/roles/openldap/defaults/main.yml index 559e6fd3..e81c6cb4 100644 --- a/playbooks/roles/openldap/defaults/main.yml +++ b/playbooks/roles/openldap/defaults/main.yml @@ -13,9 +13,15 @@ openldap_tls_cacrt: '{{ ssl_ca_cert }}' openldap_tls_crt: '{{ ssl_cert_path }}/{{ ansible_fqdn }}.crt' openldap_tls_key: '{{ ssl_cert_path }}/{{ ansible_fqdn }}.key' -openldap_schemas: +openldap_schemas_20: - cosine - inetorgperson - rfc2307bis - autoinc - ppolicy + +openldap_schemas_22: + - cosine + - inetorgperson + - rfc2307bis + - autoinc diff --git a/playbooks/roles/openldap/vars/debian_vars.yml b/playbooks/roles/openldap/vars/debian_vars.yml index bb2fc0a6..d604763b 100644 --- a/playbooks/roles/openldap/vars/debian_vars.yml +++ b/playbooks/roles/openldap/vars/debian_vars.yml @@ -8,7 +8,7 @@ openldap_packages: - libsasl2-dev - libldap2-dev - libssl-dev - - python-pexpect + - python3-pexpect - rpcbind - nscd - libpam-ldap From ed245345e58cfcb4c2e62023d9796adcc1c7350f Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 28 Sep 2023 22:37:51 -0600 Subject: [PATCH 12/68] Fix OpenLDAP for 22.04 --- playbooks/roles/openldap/tasks/debian.yml | 29 ++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/playbooks/roles/openldap/tasks/debian.yml b/playbooks/roles/openldap/tasks/debian.yml index 49215f6c..2b4dc2fc 100644 --- a/playbooks/roles/openldap/tasks/debian.yml +++ b/playbooks/roles/openldap/tasks/debian.yml @@ -143,11 +143,20 @@ command: dpkg-reconfigure -f noninteractive slapd when: not dpkgcheck.found - - name: Load OpenLDAP schemas + + - name: Load OpenLDAP schemas 20.04 + command: ldapadd -c -Y EXTERNAL -H ldapi:/// -Q -f 
/etc/ldap/schema/{{ item }}.ldif + args: + creates: '/etc/ldap/slapd.d/cn=config/cn=schema/cn={?}{{ item }}.ldif' + with_items: '{{ openldap_schemas_20 }}' + when: ansible_distribution_version == '20.04' + + - name: Load OpenLDAP schemas 22.04 command: ldapadd -c -Y EXTERNAL -H ldapi:/// -Q -f /etc/ldap/schema/{{ item }}.ldif args: creates: '/etc/ldap/slapd.d/cn=config/cn=schema/cn={?}{{ item }}.ldif' - with_items: '{{ openldap_schemas }}' + with_items: '{{ openldap_schemas_22 }}' + when: ansible_distribution_version == '22.04' - name: Render OpenLDAP configuration - config template: @@ -179,7 +188,7 @@ creates: '/etc/ldap/slapd.d/cn=config.ldif' notify: restart openldap - - name: Load OpenLDAP local configuration + - name: Load OpenLDAP local configuration 20.04 command: ldapmodify -c -Y EXTERNAL -H ldapi:/// -Q -f /tmp/{{ item }} args: creates: '/etc/ldap/slapd.d/cn=config/olcDatabase={?}mdb/olcOverlay={0}memberof.ldif' @@ -190,6 +199,20 @@ - local.ldif - ppolicy.ldif - memberof.ldif + when: ansible_distribution_version == '20.04' + notify: restart openldap + + - name: Load OpenLDAP local configuration 22.04 + command: ldapmodify -c -Y EXTERNAL -H ldapi:/// -Q -f /tmp/{{ item }} + args: + creates: '/etc/ldap/slapd.d/cn=config/olcDatabase={?}mdb/olcOverlay={0}memberof.ldif' + register: result + failed_when: ( result.rc not in [ 0, 20 ] ) + with_items: + - config.ldif + - local.ldif + - memberof.ldif + when: ansible_distribution_version == '22.04' notify: restart openldap - name: Check local schemas From 68fd0f8c1fa35965f8cec3b70ca1d47ca050af82 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 28 Sep 2023 22:50:54 -0600 Subject: [PATCH 13/68] Fix compute_pam on Ubuntu 22.04 --- playbooks/roles/slurm/tasks/compute_pam.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/slurm/tasks/compute_pam.yml b/playbooks/roles/slurm/tasks/compute_pam.yml index 876e1b5a..c66d9ead 100644 --- 
a/playbooks/roles/slurm/tasks/compute_pam.yml +++ b/playbooks/roles/slurm/tasks/compute_pam.yml @@ -58,7 +58,7 @@ backup: yes when: ansible_distribution == 'Ubuntu' -- name: Comment pam_systemd.so in /etc/pam.d/systemd-user +- name: Comment pam_systemd.so in /etc/pam.d/systemd-user in 20.04 become: true lineinfile: path: /etc/pam.d/systemd-user @@ -66,7 +66,7 @@ line: "#session optional pam_systemd.so" state: present backup: yes - when: ansible_distribution == 'Ubuntu' + when: ansible_distribution == 'Ubuntu' and ansible_distribution_version == '20.04' - name: Comment pam_access.so in /etc/pam.d/common-auth become: true From 2a1e8de8d42c3d9a586b18651332010684e4fd08 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Fri, 29 Sep 2023 10:40:04 -0600 Subject: [PATCH 14/68] Add separate Slurm .deb for Ubuntu 20.04 and 22.04 --- playbooks/roles/slurm/tasks/common.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/playbooks/roles/slurm/tasks/common.yml b/playbooks/roles/slurm/tasks/common.yml index 4b86b7f3..804a2e63 100755 --- a/playbooks/roles/slurm/tasks/common.yml +++ b/playbooks/roles/slurm/tasks/common.yml @@ -63,8 +63,8 @@ - name: Download slurm .deb get_url: - url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/slurm-{{slurm_version}}_amd64.deb" - dest: "{{ download_path }}/slurm_rpms" + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/slurm-{{slurm_version}}_{{ansible_distribution_version}}_amd64.deb" + dest: "{{ download_path }}/slurm_rpms/" when: ansible_os_family == 'Debian' and download_path == '/tmp' - name: Download slurm .rpm @@ -76,7 +76,7 @@ - name: Download slurm .deb get_url: - url: 
"https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/slurm-{{slurm_version}}_amd64.deb" + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/slurm-{{slurm_version}}_{{ansible_distribution_version}}_amd64.deb" dest: "{{ download_path }}/slurm_rpms" when: ansible_os_family == 'Debian' and download_path != '/tmp' delegate_to: 127.0.0.1 @@ -94,7 +94,7 @@ - name: Install .deb vars: deb_name: - - "{{ download_path }}/slurm_rpms/slurm-{{slurm_version}}_amd64.deb" + - "{{ download_path }}/slurm_rpms/slurm-{{slurm_version}}_{{ansible_distribution_version}}_amd64.deb" package_state: present include_role: name: safe_yum From 841cfeb568479b1b9fffa39f900bda852f0c34db Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Fri, 29 Sep 2023 13:31:18 -0600 Subject: [PATCH 15/68] Change the prep sample files to go to current home --- samples/prep_sample_files.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/samples/prep_sample_files.sh b/samples/prep_sample_files.sh index 0b9d78fb..6c7177d2 100644 --- a/samples/prep_sample_files.sh +++ b/samples/prep_sample_files.sh @@ -7,7 +7,7 @@ do sudo chmod +x $directory/*.sh done; -cp nccl_compile/compile.sh /home/opc/ -cp gpu/*.sbatch /home/opc/ -cp /opt/oci-hpc/bin/node_ordering_by_rack.py /home/opc/ +cp nccl_compile/compile.sh ~ +cp gpu/*.sbatch ~ +cp /opt/oci-hpc/bin/node_ordering_by_rack.py ~ From 6de19b424edc397b7645c08a5e1f4c9fdce8035b Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Fri, 29 Sep 2023 13:32:40 -0600 Subject: [PATCH 16/68] Give the right permissions to prep_sample_files.sh --- bastion.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/bastion.tf b/bastion.tf index 49b33ecf..3982a859 100644 --- a/bastion.tf +++ b/bastion.tf @@ -477,6 +477,7 @@ provisioner "file" { "chmod a+x /opt/oci-hpc/bin/*.sh", "timeout 
--foreground 60m /opt/oci-hpc/bin/bastion.sh", "chmod 755 /opt/oci-hpc/autoscaling/crontab/*.sh", + "chmod 755 /opt/oci-hpc/samples/*.sh", "chmod 600 /opt/oci-hpc/autoscaling/credentials/key.pem", "echo ${var.configure} > /tmp/configure.conf", "timeout 2h /opt/oci-hpc/bin/configure.sh | tee /opt/oci-hpc/logs/initial_configure.log", From 616a44ffa2e38611643477cce232dbffa742377f Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Fri, 29 Sep 2023 15:54:37 -0600 Subject: [PATCH 17/68] Remove warn as it is deprecated in Ansible --- playbooks/roles/mpivars/tasks/ubuntu.yml | 1 - playbooks/roles/nvidia_peermem/tasks/common.yml | 1 - playbooks/roles/slurm/tasks/backup_server.yml | 2 -- playbooks/roles/slurm/tasks/compute-rack-aware.yml | 2 -- playbooks/roles/slurm/tasks/compute.yml | 2 -- playbooks/roles/slurm/tasks/login.yml | 2 -- playbooks/roles/slurm/tasks/server.yml | 2 -- 7 files changed, 12 deletions(-) diff --git a/playbooks/roles/mpivars/tasks/ubuntu.yml b/playbooks/roles/mpivars/tasks/ubuntu.yml index e87ab94c..28a313f2 100644 --- a/playbooks/roles/mpivars/tasks/ubuntu.yml +++ b/playbooks/roles/mpivars/tasks/ubuntu.yml @@ -2,7 +2,6 @@ - name: Get the openmpi version shell: cmd: ls /usr/mpi/gcc/ - warn: false register: openmpi failed_when: false diff --git a/playbooks/roles/nvidia_peermem/tasks/common.yml b/playbooks/roles/nvidia_peermem/tasks/common.yml index d5764cd8..17178f29 100644 --- a/playbooks/roles/nvidia_peermem/tasks/common.yml +++ b/playbooks/roles/nvidia_peermem/tasks/common.yml @@ -2,7 +2,6 @@ - name: Check if its a GPU shape shell: cmd: "curl -sH \"Authorization: Bearer Oracle\" -L http://169.254.169.254/opc/v2/instance/ | jq .shape | grep GPU" - warn: false register: shape_gpu failed_when: false diff --git a/playbooks/roles/slurm/tasks/backup_server.yml b/playbooks/roles/slurm/tasks/backup_server.yml index 096f746f..5e931304 100755 --- a/playbooks/roles/slurm/tasks/backup_server.yml +++ b/playbooks/roles/slurm/tasks/backup_server.yml @@ -55,7 
+55,6 @@ become: true shell: cmd: cp /etc/munge/munge.key /tmp/munge.key - warn: false delegate_to: 127.0.0.1 run_once: true @@ -63,7 +62,6 @@ become: true shell: cmd: chown {{ ansible_user }}:{{ ansible_user }} /tmp/munge.key - warn: false delegate_to: 127.0.0.1 run_once: true diff --git a/playbooks/roles/slurm/tasks/compute-rack-aware.yml b/playbooks/roles/slurm/tasks/compute-rack-aware.yml index 2d43c724..bd270e32 100755 --- a/playbooks/roles/slurm/tasks/compute-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/compute-rack-aware.yml @@ -50,7 +50,6 @@ become: true shell: cmd: cp /etc/munge/munge.key /tmp/munge.key - warn: false delegate_to: 127.0.0.1 run_once: true @@ -58,7 +57,6 @@ become: true shell: cmd: chown {{ bastion_username }}:{{ bastion_username }} /tmp/munge.key - warn: false delegate_to: 127.0.0.1 run_once: true diff --git a/playbooks/roles/slurm/tasks/compute.yml b/playbooks/roles/slurm/tasks/compute.yml index 66ca5ed4..56666f8f 100755 --- a/playbooks/roles/slurm/tasks/compute.yml +++ b/playbooks/roles/slurm/tasks/compute.yml @@ -53,7 +53,6 @@ become: true shell: cmd: cp /etc/munge/munge.key /tmp/munge.key - warn: false delegate_to: 127.0.0.1 run_once: true @@ -61,7 +60,6 @@ become: true shell: cmd: chown {{ bastion_username }}:{{ bastion_username }} /tmp/munge.key - warn: false delegate_to: 127.0.0.1 run_once: true diff --git a/playbooks/roles/slurm/tasks/login.yml b/playbooks/roles/slurm/tasks/login.yml index 48998e34..d68da67f 100755 --- a/playbooks/roles/slurm/tasks/login.yml +++ b/playbooks/roles/slurm/tasks/login.yml @@ -49,7 +49,6 @@ become: true shell: cmd: cp /etc/munge/munge.key /tmp/munge.key - warn: false delegate_to: 127.0.0.1 run_once: true @@ -57,7 +56,6 @@ become: true shell: cmd: chown {{ ansible_user }}:{{ ansible_user }} /tmp/munge.key - warn: false delegate_to: 127.0.0.1 run_once: true diff --git a/playbooks/roles/slurm/tasks/server.yml b/playbooks/roles/slurm/tasks/server.yml index 433152fe..9610b527 100755 --- 
a/playbooks/roles/slurm/tasks/server.yml +++ b/playbooks/roles/slurm/tasks/server.yml @@ -100,12 +100,10 @@ become: true shell: cmd: cp /etc/munge/munge.key /tmp/munge.key - warn: false - name: set permissions become: true shell: cmd: chown {{ ansible_user }}:{{ ansible_user }} /tmp/munge.key - warn: false - name: Create DB for accounting become: true From 5452e0e773ecfe35690c0a2602ab24f3504d1640 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Fri, 29 Sep 2023 15:55:05 -0600 Subject: [PATCH 18/68] Update the ansible repo to the latest. --- bin/bastion.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/bastion.sh b/bin/bastion.sh index e6f17cba..6f0f5343 100644 --- a/bin/bastion.sh +++ b/bin/bastion.sh @@ -91,7 +91,8 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then sudo apt -y --fix-broken install fix_apt - + + sudo add-apt-repository --yes --update ppa:ansible/ansible sudo apt-get -y install ansible output=$? if [ $output -ne 0 ] From 7b64344bed002917a49680cb61d8f8723984ea4d Mon Sep 17 00:00:00 2001 From: Marcin Zablocki Date: Mon, 2 Oct 2023 16:42:51 -0700 Subject: [PATCH 19/68] Updated for H100 --- playbooks/roles/limits/templates/limits.j2 | 2 +- playbooks/roles/rdma-interface/tasks/el.yml | 6 +- playbooks/roles/slurm/templates/gres.conf.j2 | 3 + playbooks/roles/slurm/templates/slurm.conf.j2 | 4 + playbooks/roles/tuned/tasks/main.yml | 2 +- playbooks/roles/weka_client/tasks/main.yml | 327 ++++++++++++++++++ schema.yaml | 4 +- 7 files changed, 341 insertions(+), 7 deletions(-) create mode 100755 playbooks/roles/weka_client/tasks/main.yml diff --git a/playbooks/roles/limits/templates/limits.j2 b/playbooks/roles/limits/templates/limits.j2 index 862e70b2..1a5b5947 100755 --- a/playbooks/roles/limits/templates/limits.j2 +++ b/playbooks/roles/limits/templates/limits.j2 @@ -17,7 +17,7 @@ ####* soft stack 1048576 #### -{% if shape == "BM.GPU.B4.8" or shape == "BM.GPU4.8" or shape == "BM.GPU.A100-v2.8"%} +{% if shape == "BM.GPU.B4.8" 
or shape == "BM.GPU4.8" or shape == "BM.GPU.A100-v2.8" or shape == "BM.GPU.H100.8" %} * soft nproc 40960 * hard nproc 40960 * soft nofile 20480 diff --git a/playbooks/roles/rdma-interface/tasks/el.yml b/playbooks/roles/rdma-interface/tasks/el.yml index e37e2ce4..21d02c50 100755 --- a/playbooks/roles/rdma-interface/tasks/el.yml +++ b/playbooks/roles/rdma-interface/tasks/el.yml @@ -77,7 +77,7 @@ value: '2' sysctl_file: /etc/sysctl.d/80-network.conf reload: yes - when: ansible_mlx is defined and (shape == 'BM.GPU.B4.8' or shape == 'BM.GPU4.8' or shape == 'BM.GPU.A100-v2.8') and ( not new_image.stat.exists ) + when: ansible_mlx is defined and (shape == 'BM.GPU.B4.8' or shape == 'BM.GPU4.8' or shape == 'BM.GPU.A100-v2.8' or shape == 'BM.GPU.H100.8') and ( not new_image.stat.exists ) loop: "{{ ansible_mlx }}" loop_control: index_var: index @@ -88,7 +88,7 @@ value: '2' sysctl_file: /etc/sysctl.d/80-network.conf reload: yes - when: ansible_mlx is defined and (shape == 'BM.GPU.B4.8' or shape == 'BM.GPU4.8' or shape == 'BM.GPU.A100-v2.8') and ( not new_image.stat.exists ) + when: ansible_mlx is defined and (shape == 'BM.GPU.B4.8' or shape == 'BM.GPU4.8' or shape == 'BM.GPU.A100-v2.8' or shape == 'BM.GPU.H100.8') and ( not new_image.stat.exists ) loop: "{{ ansible_mlx }}" loop_control: index_var: index @@ -100,7 +100,7 @@ value: '1' sysctl_file: /etc/sysctl.d/80-network.conf reload: yes - when: ansible_mlx is defined and (shape == 'BM.GPU.B4.8' or shape == 'BM.GPU4.8' or shape == 'BM.GPU.A100-v2.8') and ( not new_image.stat.exists ) + when: ansible_mlx is defined and (shape == 'BM.GPU.B4.8' or shape == 'BM.GPU4.8' or shape == 'BM.GPU.A100-v2.8' or shape == 'BM.GPU.H100.8') and ( not new_image.stat.exists ) loop: "{{ ansible_mlx }}" loop_control: index_var: index \ No newline at end of file diff --git a/playbooks/roles/slurm/templates/gres.conf.j2 b/playbooks/roles/slurm/templates/gres.conf.j2 index 8854db24..5010eb10 100644 --- a/playbooks/roles/slurm/templates/gres.conf.j2 
+++ b/playbooks/roles/slurm/templates/gres.conf.j2 @@ -30,6 +30,9 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] +{% elif instance.shape == "BM.GPU.H100.8"%} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-3] Type=H100 Cores=[48-55] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-7] Type=H100 Cores=[56-111] {% elif instance.shape == "BM.GPU.T1.2" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] {% elif instance.shape == "BM.GPU.A10.4" %} diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index de6734ab..0fbc07c1 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -89,6 +89,10 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boar NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=8 CoresPerSocket=16 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A100:8 {% elif instance.shape == "BM.GPU.A100-v2.8" and threadspercore == 2 %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=255 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ 
instance.name }} Gres=gpu:A100:8 +{% elif instance.shape == "BM.GPU.H100.8" and threadspercore == 1 %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=56 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:H100:8 +{% elif instance.shape == "BM.GPU.H100.8" and threadspercore == 2 %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=56 ThreadsPerCore=2 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:H100:8 {% elif instance.shape == "BM.GPU.T1.2" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=32 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A10:2 {% elif instance.shape == "BM.GPU.A10.4" %} diff --git a/playbooks/roles/tuned/tasks/main.yml b/playbooks/roles/tuned/tasks/main.yml index 637e8bae..15793bee 100644 --- a/playbooks/roles/tuned/tasks/main.yml +++ b/playbooks/roles/tuned/tasks/main.yml @@ -1,2 +1,2 @@ - include: el-7.yml - when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '7' and (shape == 'BM.GPU.B4.8' or shape == 'BM.GPU4.8' or shape == 'BM.GPU.A100-v2.8') + when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '7' and (shape == 'BM.GPU.B4.8' or shape == 'BM.GPU4.8' or shape == 'BM.GPU.A100-v2.8' or shape == 'BM.GPU.H100.8') diff --git a/playbooks/roles/weka_client/tasks/main.yml b/playbooks/roles/weka_client/tasks/main.yml new file mode 100755 index 00000000..6d4b0887 --- /dev/null +++ b/playbooks/roles/weka_client/tasks/main.yml @@ -0,0 +1,327 @@ +--- +# tasks file for weka_client + + +# +# GET INSTANCE/VNIC METADATA +# +- name: 
Get instance metadata + uri: + url: http://169.254.169.254/opc/v2/instance/ + method: GET + return_content: true + status_code: 200 + body_format: json + headers: + Authorization: Bearer Oracle + register: instance_metadata + +- name: Get vnic metadata + uri: + url: http://169.254.169.254/opc/v1/vnics/ + method: GET + return_content: true + status_code: 200 + body_format: json + headers: + Authorization: Bearer Oracle + register: vnic_metadata + +- name: Set vnic facts from metadata + set_fact: + private_ip: "{{ vnic_metadata.json[0].privateIp }}" + subnet_cidr_block: "{{ vnic_metadata.json[0].subnetCidrBlock[-2:] }}" + gateway: "{{ vnic_metadata.json[0].virtualRouterIp }}" + +- name: Set vnic facts from metadata + set_fact: + private_ip_2: "{{ vnic_metadata.json[1].privateIp }}" + subnet_cidr_block_2: "{{ vnic_metadata.json[1].subnetCidrBlock[-2:] }}" + gateway_2: "{{ vnic_metadata.json[1].virtualRouterIp }}" + when: shape == "VM.Standard3.Flex" or shape == "VM.Standard.E4.Flex" or shape == "VM.Optimized3.Flex" + + +# +# SYSTEM CONFIG +# +- name: Change the /etc/os-release to install Weka + become: true + replace: + path: /etc/os-release + regexp: '^NAME="Oracle Linux Server"' + replace: 'NAME="Red Hat Enterprise Linux"' + +- name: Rt already set check + shell: "/usr/sbin/ip rule | wc -l " + register: rt_check + +- name: Mkdir for routing tables and scripts + file: + path: /tmp/weka + state: directory + mode: 0755 + when: rt_check.stdout | int != 19 and ( + (shape == "BM.GPU4.8") or + (shape == "BM.GPU.B4.8") or + (shape == "BM.GPU.A100-v2.8") or + (shape == "BM.GPU.H100.8") ) +- name: Copy files + copy: + src: files/ + dest: /tmp/weka/ + when: rt_check.stdout | int != 19 and ( + (shape == "BM.GPU4.8") or + (shape == "BM.GPU.B4.8") or + (shape == "BM.GPU.A100-v2.8") or + (shape == "BM.GPU.H100.8") ) + +- name: Copy IP route rules + copy: + src: /tmp/weka/rt_tables + dest: /etc/iproute2/rt_tables + when: rt_check.stdout | int != 19 and ( + (shape == "BM.GPU4.8") or 
+ (shape == "BM.GPU.B4.8") or + (shape == "BM.GPU.A100-v2.8") or + (shape == "BM.GPU.H100.8") ) + + +- name: Get interface names + shell: ip a | grep BROADCAST | awk '{ print $2 }' | grep -v 'ens\|docker\|lo' | sed 's/://' + register: interface_names + when: rt_check.stdout | int != 19 and ( + (shape == "BM.GPU4.8") or + (shape == "BM.GPU.B4.8") or + (shape == "BM.GPU.A100-v2.8") or + (shape == "BM.GPU.H100.8") ) + +- name: Create list of interface names + copy: + dest: "/tmp/interface_names" + content: | + {{ interface_names.stdout }} + when: rt_check.stdout | int != 19 and ( + (shape == "BM.GPU4.8") or + (shape == "BM.GPU.B4.8") or + (shape == "BM.GPU.A100-v2.8") or + (shape == "BM.GPU.H100.8") ) + +- name: Loop over NIC names and set up routing tables and rules + shell: | + TABLE=0 + + while read -r NIC; do + IP='' + TABLE=$((TABLE+1)) + + while read -r line; do + if [[ $line =~ IPADDR=([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+) ]]; then + IP=${BASH_REMATCH[1]} + fi + done < "/etc/sysconfig/network-scripts/ifcfg-$NIC" + + echo "192.168.0.0/16 dev $NIC src $IP table net$TABLE" > /etc/sysconfig/network-scripts/route-$NIC + echo "table net$TABLE from $IP" > /etc/sysconfig/network-scripts/rule-$NIC + done < /tmp/interface_names + when: rt_check.stdout | int != 19 and ( + (shape == "BM.GPU4.8") or + (shape == "BM.GPU.B4.8") or + (shape == "BM.GPU.A100-v2.8") or + (shape == "BM.GPU.H100.8") ) + +- name: Reset Network Interfaces + shell: "for nic in $(cat /tmp/interface_names); do sudo ifdown $nic; sudo ifup $nic; done" + when: rt_check.stdout | int != 19 and ( + (shape == "BM.GPU4.8") or + (shape == "BM.GPU.B4.8") or + (shape == "BM.GPU.A100-v2.8") or + (shape == "BM.GPU.H100.8") ) + +# +# SECONDARY VNIC SETUP +# + +- name: Get VNIC Number + shell: "export LC_ALL=C.UTF-8;export LANG=C.UTF-8; /usr/bin/oci compute instance list-vnics --auth instance_principal --instance-id {{instance_metadata.json.id}} | jq '.[] | length'" + register: vnic_number + when: shape == 
"VM.Standard3.Flex" or shape == "VM.Standard.E4.Flex" or shape == "VM.Optimized3.Flex" + +- name: attach secondary vnic + shell: "export LC_ALL=C.UTF-8;export LANG=C.UTF-8; oci compute instance attach-vnic --instance-id {{instance_metadata.json.id}} --vnic-display-name 'weka' --subnet-id {{secondary_vnic_subnet}} --auth instance_principal" + when: (shape == "VM.Standard3.Flex" or shape == "VM.Standard.E4.Flex" or shape == "VM.Optimized3.Flex") and vnic_number.stdout | int < 2 + register: add_vnic + delegate_to: 127.0.0.1 + +- name: Sleep + pause: + seconds: 30 + when: (shape == "VM.Standard3.Flex" or shape == "VM.Standard.E4.Flex" or shape == "VM.Optimized3.Flex") and vnic_number.stdout | int < 2 + + +# +# WEKA INSTALL +# + +- name: Check is Weka installed + shell: command -v weka >/dev/null 2>&1 + register: installed + ignore_errors: true + +- name: Download Weka installer + get_url: + url: "http://{{ dist_server }}:14000/dist/v1/install" + dest: /tmp/weka_client_install + mode: 0755 + when: installed is failed + register: download + +- name: Execute the Weka installer + shell: /tmp/weka_client_install + when: download.changed + +- name: Remove the weka_client_install + file: + path: /tmp/weka_client_install + state: absent + +- name: Change the /etc/wekaio/service.conf to install Weka + become: true + replace: + path: /etc/wekaio/service.conf + regexp: '^isolate_cpusets=true' + replace: 'isolate_cpusets=false' + + +# +# MOUNT SETUP +# + +- name: Create Weka DPDK mount points + file: + path: "{{ item.mountpoint }}" + owner: opc + group: "${privilege_group_name}" + mode: 0775 + state: directory + loop: "{{ dpdk_fs | default([]) }}" + when: dpdk_fs is defined + + +- name: Create Weka UDP mount points + file: + path: "{{ item.mountpoint }}" + owner: opc + group: "${privilege_group_name}" + mode: 0775 + state: directory + loop: "{{ udp_fs }}" + when: udp_fs is defined + + +- name: Create Weka NFS mount points + file: + path: "{{ item.mountpoint }}" + owner: opc + 
group: "${privilege_group_name}" + mode: 0775 + state: directory + loop: "{{ weka_nfs }}" + when: weka_nfs is defined + + +- name: Mount Weka DPDK Filesystems GPU Nodes + mount: + path: "{{ item.mountpoint }}" + src: "{{ backends | join(',') }}/{{item.name}}" + fstype: wekafs + opts: "{{ dpdk_opts }},net={{ ansible_default_ipv4.alias }}{{ item.opts }}" + state: mounted + when: (shape == "BM.GPU4.8" or shape == "BM.GPU.B4.8" or shape == "BM.GPU.A100-v2.8" or shape == "BM.GPU.H100.8") + and dpdk_fs is defined + loop: "{{ dpdk_fs }}" + +# TODO: Add additonal debug info for failing mounts + +- name: Mount Weka DPDK Filesystems to VM nodes + mount: + path: "{{ item.mountpoint }}" + src: "{{ data_backends | join(',') }}/{{item.name}}" + fstype: wekafs + opts: "{{ dpdk_opts }},net=ens5/{{ private_ip_2 }}/{{ subnet_cidr_block_2 }}/{{ gateway_2}}{{ item.opts }}" + state: mounted + when: (shape == "VM.Standard3.Flex" or shape == "VM.Standard.E4.Flex" or shape == "VM.Optimized3.Flex") + and dpdk_fs is defined + loop: "{{ dpdk_fs }}" + +- name: Mount Weka UDP to all nodes + mount: + path: "{{ item.mountpoint }}" + src: "{{ backends | join(',') }}/{{item.name}}" + fstype: wekafs + opts: "{{ upd_opts }},net=udp,{{ item.opts }}" + state: mounted + when: udp_fs is defined + loop: "{{ udp_fs }}" + +- name: Set NFS Mount + set_fact: + nfsmount: "{{ nfs_backends | random }}" + when: weka_nfs is defined and nfs_backends is defined + +- name: Mount Weka-NFS Filesystems to nodes + mount: + path: "{{ item.mountpoint }}" + src: "{{ nfsmount }}:/{{ item.name }}" + fstype: nfs + state: mounted + when: weka_nfs is defined + loop: "{{ weka_nfs }}" + + +# +# WEKA CLEAN-UP SCRIPTS +# + +- name: Create Pre-Shutdowm Weka Script + copy: + dest: /opt/pre-shutdown-weka.sh + mode: 0755 + content: | + #!/bin/bash + + {% for fs in dpdk_fs %} + if mountpoint -q "{{ fs.mountpoint }}" ; then + echo "{{ fs.mountpoint }} is mounted, unmounting" + umount "{{ fs.mountpoint }}" + else + echo "{{ fs.mountpoint 
}} is already umounted" + fi + {% endfor %} + + {% for fs in udp_fs %} + if mountpoint -q "{{ fs.mountpoint }}" ; then + echo "{{ fs.mountpoint }} is mounted, unmounting" + umount "{{ fs.mountpoint }}" + else + echo "{{ fs.mountpoint }} is already umounted" + fi + {% endfor %} + +- name: Create Pre-shutdown-weka Systemd Service File + template: + src: templates/pre-shutdown-weka.service.j2 + dest: /lib/systemd/system/pre-shutdown-weka.service + +- name: Enable Pre-shutdown-weka Service + systemd: + name: pre-shutdown-weka + enabled: true + daemon_reload: yes + +- name: Change the /etc/os-release to install Weka + become: true + replace: + path: /etc/os-release + regexp: '^NAME="Red Hat Enterprise Linux"' + replace: 'NAME="Oracle Linux Server"' + diff --git a/schema.yaml b/schema.yaml index e69ae82c..1209b857 100755 --- a/schema.yaml +++ b/schema.yaml @@ -364,8 +364,7 @@ variables: required: true bastion_object_storage_par: title: Create Object Storage PAR - description: "Create a PAR (i.e. Pre-Authenticated Request), so that user could use that PAR to upload monitoring metrics to - to Object Storage and share the URL with OCI service teams." + description: "Create a PAR (i.e. Pre-Authenticated Request), so that user could use that PAR to upload monitoring metrics to Object Storage and share the URL with OCI service teams." type: boolean default: true @@ -563,6 +562,7 @@ variables: - "BM.GPU4.8" - "BM.GPU.B4.8" - "BM.GPU.A100-v2.8" + - "BM.GPU.H100.8" - "BM.Optimized3.36" - "BM.HPC.E5.144" default: "BM.HPC2.36" From a08aab34e9444ecb8833e7a5da500af0c94d3ee2 Mon Sep 17 00:00:00 2001 From: Suman Saha Date: Wed, 4 Oct 2023 10:56:42 -0700 Subject: [PATCH 20/68] Made following changes in metrics collection flow. 1) terraform code changes: Added timestamp as suffix of bucket name. 2) shell script changes: Added cluster name, as a prefix of filename. Cluster name would be provided as optional param when the script would be running. 
--- bastion.tf | 17 +++++++---------- bin/upload_rdma_nic_metrics.sh | 15 +++++++++------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/bastion.tf b/bastion.tf index 3982a859..1f91337b 100644 --- a/bastion.tf +++ b/bastion.tf @@ -498,7 +498,9 @@ data "oci_objectstorage_namespace" "compartment_namespace" { } locals { - rdma_nic_metric_bucket_name = "RDMA_NIC_metrics" + current_timestamp = timestamp() + current_timestamp_formatted = formatdate("YYYYMMDDhhmmss", local.current_timestamp) + rdma_nic_metric_bucket_name = format("%s_%s","RDMA_NIC_metrics",local.current_timestamp_formatted) par_path = ".." } /* @@ -506,14 +508,9 @@ saving the PAR into file: ../PAR_file_for_metrics. this PAR is used by the scripts to upload NIC metrics to object storage (i.e. script: upload_rdma_nic_metrics.sh) */ -data "oci_objectstorage_bucket" "RDMA_NIC_Metrics_bucket_check" { - name = local.rdma_nic_metric_bucket_name - namespace = data.oci_objectstorage_namespace.compartment_namespace.namespace -} - resource "oci_objectstorage_bucket" "RDMA_NIC_metrics_bucket" { - count = (var.bastion_object_storage_par && data.oci_objectstorage_bucket.RDMA_NIC_Metrics_bucket_check.bucket_id == null) ? 1 : 0 + count = (var.bastion_object_storage_par) ? 1 : 0 compartment_id = var.targetCompartment name = local.rdma_nic_metric_bucket_name namespace = data.oci_objectstorage_namespace.compartment_namespace.namespace @@ -521,7 +518,7 @@ resource "oci_objectstorage_bucket" "RDMA_NIC_metrics_bucket" { } resource "oci_objectstorage_preauthrequest" "RDMA_NIC_metrics_par" { - count = (var.bastion_object_storage_par && data.oci_objectstorage_bucket.RDMA_NIC_Metrics_bucket_check.bucket_id == null) ? 1 : 0 + count = (var.bastion_object_storage_par) ? 
1 : 0 depends_on = [oci_objectstorage_bucket.RDMA_NIC_metrics_bucket] access_type = "AnyObjectWrite" bucket = local.rdma_nic_metric_bucket_name @@ -533,12 +530,12 @@ resource "oci_objectstorage_preauthrequest" "RDMA_NIC_metrics_par" { output "RDMA_NIC_metrics_url" { depends_on = [oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par] - value = (var.bastion_object_storage_par && data.oci_objectstorage_bucket.RDMA_NIC_Metrics_bucket_check.bucket_id == null) ? "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}" : "" + value = (var.bastion_object_storage_par) ? "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}" : "" } resource "local_file" "PAR" { - count = (var.bastion_object_storage_par && data.oci_objectstorage_bucket.RDMA_NIC_Metrics_bucket_check.bucket_id == null) ? 1 : 0 + count = (var.bastion_object_storage_par) ? 1 : 0 depends_on = [oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par] content = "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}" filename = "${local.par_path}/PAR_file_for_metrics" diff --git a/bin/upload_rdma_nic_metrics.sh b/bin/upload_rdma_nic_metrics.sh index 6e44671c..59486baf 100644 --- a/bin/upload_rdma_nic_metrics.sh +++ b/bin/upload_rdma_nic_metrics.sh @@ -7,6 +7,7 @@ source "${folder}/rdma_metrics_collection_config.conf" hours="$hoursAgoFromNow" interval="$metricsCollectionIntervalInMinute" par_filename="$parFileName" +cluster_name="" if [ -z "$par_filename" ] then @@ -25,16 +26,17 @@ dis_help() echo echo "Usage:" echo - echo "./upload_rdma_nic_metrics.sh -l -i " + echo "./upload_rdma_nic_metrics.sh -l -i -c " echo echo "Options:" echo "l Hours Ago From Now (optional)" - echo "n Metrics Collection Interval In Minute (optional)" + echo "i Metrics Collection Interval In Minute (optional)" + echo "c Cluster Name (optional)" 
echo "h Print this help." echo echo "RDMA metrics are uploaded to Object Storage using PAR" echo - echo "e.g., sh ./upload_rdma_nic_metrics.sh -l 24 -i 5 " + echo "e.g., sh ./upload_rdma_nic_metrics.sh -l 24 -i 5 -c clusterName1" echo echo "Supported releases: 2.10.3+" echo @@ -43,11 +45,12 @@ dis_help() #Do this if number of arguments passed is greater than 0 if [ "$#" -gt "0" ] then - while getopts "l:i:h" option + while getopts "l:i:c:h" option do case $option in l) hours=${OPTARG};; i) interval=${OPTARG};; + c) cluster_name=${OPTARG};; h) dis_help exit;; \?) # Invalid option @@ -78,7 +81,7 @@ then filename="infiniband_mlx5_${i}_${timestamp}" filename_csv="${filename}.csv" - filename_zip="${filename}.zip" + filename_zip="${cluster_name}_${filename}.zip" echo "Collecting RDMA HW metrics of device mlx5_${i}...." query="SELECT * FROM ${measurementnameBackup}" @@ -123,7 +126,7 @@ then filename="infiniband_${timestamp}" filename_csv="${filename}.csv" - filename_zip="${filename}.zip" + filename_zip="${cluster_name}_${filename}.zip" echo "Collecting Infiniband counter metrics...." 
query="SELECT * FROM ${measurementnameBackup}" From e45fa4c494b5539bc0b81673d4d9589d53644f95 Mon Sep 17 00:00:00 2001 From: Marcin Zablocki Date: Thu, 5 Oct 2023 10:14:29 -0700 Subject: [PATCH 21/68] Adding support for OCA HPC --- bastion.tf | 5 +++- cluster-network-configuration.tf | 28 ++++++++++++++++++- conf/variables.tpl | 1 + inventory.tpl | 3 +- locals.tf | 2 +- playbooks/destroy.yml | 3 +- playbooks/new_nodes.yml | 4 +-- playbooks/resize_add.yml | 4 +-- .../roles/nvidia-enroot/tasks/ubuntu.yml | 5 ++-- playbooks/site.yml | 4 +-- queues.conf | 2 ++ schema.yaml | 7 +++++ slurm_ha.tf | 5 +++- variables.tf | 1 + 14 files changed, 60 insertions(+), 14 deletions(-) diff --git a/bastion.tf b/bastion.tf index 3982a859..99606871 100644 --- a/bastion.tf +++ b/bastion.tf @@ -257,6 +257,7 @@ resource "null_resource" "cluster" { log_vol = var.log_vol, redundancy = var.redundancy, cluster_network = var.cluster_network, + use_compute_agent = var.use_compute_agent, slurm = var.slurm, rack_aware = var.rack_aware, slurm_nfs_path = var.slurm_nfs ? 
var.nfs_source_path : var.cluster_nfs_path @@ -335,6 +336,7 @@ resource "null_resource" "cluster" { provisioner "file" { content = templatefile("${path.module}/queues.conf", { cluster_network = var.cluster_network, + use_compute_agent = var.use_compute_agent, compute_cluster = var.compute_cluster, marketplace_listing = var.marketplace_listing, image = local.image_ocid, @@ -427,7 +429,8 @@ resource "null_resource" "cluster" { bastion_username = var.bastion_username, compute_username = var.compute_username, pam = var.pam, - sacct_limits = var.sacct_limits + sacct_limits = var.sacct_limits, + use_compute_agent = var.use_compute_agent }) destination = "/opt/oci-hpc/conf/variables.tf" diff --git a/cluster-network-configuration.tf b/cluster-network-configuration.tf index b66b1967..82a3fb60 100755 --- a/cluster-network-configuration.tf +++ b/cluster-network-configuration.tf @@ -22,8 +22,34 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configurati user_data = base64encode(data.template_file.config.rendered) } agent_config { - is_management_disabled = true + + are_all_plugins_disabled = false + is_management_disabled = true + is_monitoring_disabled = false + + plugins_config { + desired_state = "DISABLED" + name = "OS Management Service Agent" + } + dynamic plugins_config { + + for_each = var.use_compute_agent ? ["ENABLED"] : ["DISABLED"] + content { + name = "Compute HPC RDMA Authentication" + desired_state = plugins_config.value + } + } + dynamic plugins_config { + for_each = var.use_compute_agent ? 
["ENABLED"] : ["DISABLED"] + content { + name = "Compute HPC RDMA Auto-Configuration" + desired_state = plugins_config.value + } + } + } + + shape = var.cluster_network_shape source_details { source_type = "image" diff --git a/conf/variables.tpl b/conf/variables.tpl index 29d47f15..9e55fb2f 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -34,6 +34,7 @@ variable "pyxis" { default = ${pyxis} } variable "pam" { default = ${pam} } variable "sacct_limits" { default = ${sacct_limits} } variable "enroot" { default = ${enroot} } +variable "use_compute_agent" { default = ${use_compute_agent} } variable "slurm_nfs_path" { default = "${slurm_nfs_path}" } variable "spack" { default = ${spack} } variable "instance_pool_ocpus" { default = "##OCPU##"} diff --git a/inventory.tpl b/inventory.tpl index 735e41b3..2bfd85d0 100755 --- a/inventory.tpl +++ b/inventory.tpl @@ -73,4 +73,5 @@ tenancy_ocid = ${tenancy_ocid} inst_prin = ${inst_prin} api_fingerprint = ${api_fingerprint} api_user_ocid = ${api_user_ocid} -sacct_limits=${sacct_limits} \ No newline at end of file +sacct_limits=${sacct_limits} +use_compute_agent=${use_compute_agent} \ No newline at end of file diff --git a/locals.tf b/locals.tf index 95db99fd..5866193e 100755 --- a/locals.tf +++ b/locals.tf @@ -42,7 +42,7 @@ locals { is_login_flex_shape = length(regexall(".*VM.*.*Flex$", var.login_shape)) > 0 ? [local.login_ocpus]:[] is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [local.instance_pool_ocpus]:[] - + bastion_mount_ip = var.bastion_block ? element(concat(oci_core_volume_attachment.bastion_volume_attachment.*.ipv4, [""]), 0) : "none" scratch_nfs_type = var.cluster_network ? 
var.scratch_nfs_type_cluster : var.scratch_nfs_type_pool diff --git a/playbooks/destroy.yml b/playbooks/destroy.yml index 520b756d..46efb661 100755 --- a/playbooks/destroy.yml +++ b/playbooks/destroy.yml @@ -15,4 +15,5 @@ destroy: true initial: false roles: - - etc-hosts \ No newline at end of file + - etc-hosts + \ No newline at end of file diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index 83e57b9c..5732a758 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -48,10 +48,10 @@ tasks: - include_role: name: oci-cn-auth - when: cluster_network|bool + when: cluster_network|bool and use_compute_agent|default(false)|bool - include_role: name: rdma-interface - when: cluster_network|bool + when: cluster_network|bool and use_compute_agent|default(false)|bool - include_role: name: nvidia_peermem diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 5703a1cf..2efa06af 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -46,10 +46,10 @@ tasks: - include_role: name: oci-cn-auth - when: cluster_network|bool + when: cluster_network|bool and use_compute_agent|default(false)|bool - include_role: name: rdma-interface - when: cluster_network|bool + when: cluster_network|bool and use_compute_agent|default(false)|bool - include_role: name: nvidia_peermem diff --git a/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml b/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml index 65a700ad..2374ac25 100644 --- a/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml +++ b/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml @@ -32,12 +32,12 @@ - name: execute enroot-check_*.run command: bash -c "/tmp/enroot-check_*.run --verify" - - name: + - name: set enroot_top_path set_fact: enroot_top_path_checked: "/etc/enroot/" when: " not 'nvme0n1' in hostvars[inventory_hostname].ansible_devices" - - name: + - name: set enroot_top_path set_fact: enroot_top_path_checked: "{{enroot_top_path}}" when: "'nvme0n1' in 
hostvars[inventory_hostname].ansible_devices" @@ -116,3 +116,4 @@ - enroot_cache - enroot_runtime + \ No newline at end of file diff --git a/playbooks/site.yml b/playbooks/site.yml index 0ee6eb8f..1b23e747 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -52,10 +52,10 @@ tasks: - include_role: name: oci-cn-auth - when: cluster_network|bool + when: cluster_network|bool and use_compute_agent|default(false)|bool - include_role: name: rdma-interface - when: cluster_network|bool + when: cluster_network|bool and use_compute_agent|default(false)|bool - include_role: name: nvidia_peermem diff --git a/queues.conf b/queues.conf index 1f985922..351846b2 100644 --- a/queues.conf +++ b/queues.conf @@ -21,6 +21,7 @@ targetCompartment: ${targetCompartment} boot_volume_size: ${boot_volume_size} use_marketplace_image: ${use_marketplace_image} + use_compute_agent: ${use_compute_agent} instance_pool_ocpus: ${instance_pool_ocpus} instance_pool_memory: ${instance_pool_memory} instance_pool_custom_memory: ${instance_pool_custom_memory} @@ -44,6 +45,7 @@ targetCompartment: ${targetCompartment} boot_volume_size: ${boot_volume_size} use_marketplace_image: ${use_marketplace_image} + use_compute_agent: ${use_compute_agent} instance_pool_ocpus: ${instance_pool_ocpus} instance_pool_memory: ${instance_pool_memory} instance_pool_custom_memory: ${instance_pool_custom_memory} diff --git a/schema.yaml b/schema.yaml index 1209b857..e7fe5b1c 100755 --- a/schema.yaml +++ b/schema.yaml @@ -60,6 +60,7 @@ variableGroups: - ${compute_username} - ${marketplace_listing} - ${unsupported} + - ${use_compute_agent} - ${compute_image_compartment} - ${image} - ${image_ocid} @@ -727,6 +728,12 @@ variables: default: "HPC_OL7" visible: ${use_marketplace_image} + use_compute_agent: + type: boolean + title: "use compute agent" + description: "Use compute agent" + default: false + compute_image_compartment: title: "compute image compartment" type: oci:identity:compartment:id diff --git a/slurm_ha.tf 
b/slurm_ha.tf index 0d90f75c..6db5a614 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -221,6 +221,7 @@ resource "null_resource" "cluster_backup" { log_vol = var.log_vol, redundancy = var.redundancy, cluster_network = var.cluster_network, + use_compute_agent = var.use_compute_agent, slurm = var.slurm, slurm_nfs_path = var.slurm_nfs ? var.nfs_source_path : var.cluster_nfs_path, rack_aware = var.rack_aware, @@ -299,6 +300,7 @@ resource "null_resource" "cluster_backup" { provisioner "file" { content = templatefile("${path.module}/queues.conf", { cluster_network = var.cluster_network, + use_compute_agent = var.use_compute_agent, compute_cluster = var.compute_cluster, marketplace_listing = var.marketplace_listing, image = local.image_ocid, @@ -391,7 +393,8 @@ resource "null_resource" "cluster_backup" { private_deployment = var.private_deployment, bastion_username = var.bastion_username, compute_username = var.compute_username, - use_multiple_ads = var.use_multiple_ads + use_multiple_ads = var.use_multiple_ads, + use_compute_agent = var.use_compute_agent }) destination = "/opt/oci-hpc/conf/variables.tf" diff --git a/variables.tf b/variables.tf index aea2eaac..9c64389a 100755 --- a/variables.tf +++ b/variables.tf @@ -37,6 +37,7 @@ variable "boot_volume_size" { default = 50 } variable "use_marketplace_image" { default = true} variable "image" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" } variable "image_ocid" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" } +variable "use_compute_agent" { default = false } variable "unsupported_bastion_image" { default = "" } variable "unsupported_login_image" { default = "" } variable "use_cluster_nfs" { default = true} From 45232efb192a8b7d04ed0fd997b29c8f752b7e22 Mon Sep 17 00:00:00 2001 From: anoopna Date: Fri, 6 Oct 2023 22:46:33 +0530 Subject: [PATCH 22/68] Update bastion.sh --- bin/bastion.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 
deletion(-) diff --git a/bin/bastion.sh b/bin/bastion.sh index 6f0f5343..bb8b1933 100644 --- a/bin/bastion.sh +++ b/bin/bastion.sh @@ -122,7 +122,10 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then fi fi fix_apt - + sudo python3 -m pip install -U pip + sudo python3 -m pip install netaddr --upgrade + sudo python3 -m pip install requests --upgrade + sudo python3 -m pip install urllib3 --upgrade pip install pip --upgrade pip install pyopenssl --upgrade From 49272fc1ec40dead099302374595a1d439fbc089 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Mon, 9 Oct 2023 11:44:14 -0700 Subject: [PATCH 23/68] fixed indentation so that resize works for instance pools as well --- bin/resize.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/bin/resize.py b/bin/resize.py index 48008763..4972a6e2 100644 --- a/bin/resize.py +++ b/bin/resize.py @@ -468,22 +468,22 @@ def get_summary(comp_ocid,cluster_name): if cn_summary_tmp.lifecycle_state == "ACTIVE" and cn_summary_tmp.display_name == cluster_name : cn_summary = cn_summary_tmp running_clusters = running_clusters + 1 + if running_clusters == 0: + cn_summaries = computeManagementClient.list_instance_pools(comp_ocid,display_name=cluster_name).data + if len(cn_summaries) > 0: + CN = "IP" + for cn_summary_tmp in cn_summaries: + if cn_summary_tmp.lifecycle_state == "RUNNING": + cn_summary = cn_summary_tmp + running_clusters = running_clusters + 1 + elif cn_summary_tmp.lifecycle_state == "SCALING": + scaling_clusters = scaling_clusters + 1 if running_clusters == 0: - cn_summaries = computeManagementClient.list_instance_pools(comp_ocid,display_name=cluster_name).data - if len(cn_summaries) > 0: - CN = "IP" - for cn_summary_tmp in cn_summaries: - if cn_summary_tmp.lifecycle_state == "RUNNING": - cn_summary = cn_summary_tmp - running_clusters = running_clusters + 1 - elif cn_summary_tmp.lifecycle_state == "SCALING": - scaling_clusters = scaling_clusters + 1 - if running_clusters == 
0: - if scaling_clusters: - print("No running cluster was found but there is a cluster in SCALING mode, try rerunning in a moment") - else: - print("The cluster was not found") - return None,None,True + if scaling_clusters: + print("No running cluster was found but there is a cluster in SCALING mode, try rerunning in a moment") + else: + print("The cluster was not found") + return None,None,True if running_clusters > 1: print("There were multiple running clusters with this name, we selected the one with OCID:"+cn_summary.id) if CN == "CN": From ba363253229e162154a208ff451b603a24c82d84 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 10 Oct 2023 09:26:44 -0600 Subject: [PATCH 24/68] Fix Openldap Schema --- playbooks/roles/openldap/defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/openldap/defaults/main.yml b/playbooks/roles/openldap/defaults/main.yml index e81c6cb4..815983e7 100644 --- a/playbooks/roles/openldap/defaults/main.yml +++ b/playbooks/roles/openldap/defaults/main.yml @@ -13,7 +13,7 @@ openldap_tls_cacrt: '{{ ssl_ca_cert }}' openldap_tls_crt: '{{ ssl_cert_path }}/{{ ansible_fqdn }}.crt' openldap_tls_key: '{{ ssl_cert_path }}/{{ ansible_fqdn }}.key' -openldap_schemas_20: +openldap_schemas: - cosine - inetorgperson - rfc2307bis From 7b1e458b1a6a0b2181f2d9b03696efd4fcc98bf4 Mon Sep 17 00:00:00 2001 From: Marcin Zablocki Date: Tue, 10 Oct 2023 12:36:08 -0700 Subject: [PATCH 25/68] Fix playbook execution conditional when using OCA --- playbooks/new_nodes.yml | 4 ++-- playbooks/resize_add.yml | 4 ++-- playbooks/site.yml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index 5732a758..5d8d18d3 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -48,10 +48,10 @@ tasks: - include_role: name: oci-cn-auth - when: cluster_network|bool and use_compute_agent|default(false)|bool + when: cluster_network|bool and not 
use_compute_agent|default(false)|bool - include_role: name: rdma-interface - when: cluster_network|bool and use_compute_agent|default(false)|bool + when: cluster_network|bool and not use_compute_agent|default(false)|bool - include_role: name: nvidia_peermem diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 2efa06af..a32fcba8 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -46,10 +46,10 @@ tasks: - include_role: name: oci-cn-auth - when: cluster_network|bool and use_compute_agent|default(false)|bool + when: cluster_network|bool and not use_compute_agent|default(false)|bool - include_role: name: rdma-interface - when: cluster_network|bool and use_compute_agent|default(false)|bool + when: cluster_network|bool and not use_compute_agent|default(false)|bool - include_role: name: nvidia_peermem diff --git a/playbooks/site.yml b/playbooks/site.yml index 1b23e747..caf0fcba 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -52,10 +52,10 @@ tasks: - include_role: name: oci-cn-auth - when: cluster_network|bool and use_compute_agent|default(false)|bool + when: cluster_network|bool and not use_compute_agent|default(false)|bool - include_role: name: rdma-interface - when: cluster_network|bool and use_compute_agent|default(false)|bool + when: cluster_network|bool and not use_compute_agent|default(false)|bool - include_role: name: nvidia_peermem From bc1d9a91d685d5778a6a78cd72a47040c5e170b7 Mon Sep 17 00:00:00 2001 From: Marcin Zablocki Date: Tue, 10 Oct 2023 12:39:43 -0700 Subject: [PATCH 26/68] Fix typo --- schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema.yaml b/schema.yaml index e7fe5b1c..20a0a1e1 100755 --- a/schema.yaml +++ b/schema.yaml @@ -64,7 +64,7 @@ variableGroups: - ${compute_image_compartment} - ${image} - ${image_ocid} - - title: "Additionnal Login Node" + - title: "Additional Login Node" variables: - ${login_node} - ${login_ad} From 816029d28a5e59589f84e9e1d0ba49ffad2a44a1 Mon 
Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 12 Oct 2023 15:25:06 -0600 Subject: [PATCH 27/68] Add Note about Slurm on Ubuntu 22.04 --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index f0fe81c8..15cf819a 100644 --- a/README.md +++ b/README.md @@ -240,6 +240,7 @@ sleep 1000 - Instance Type: You can specify the OCI instance type that you’d like to run on as a constraint. This will make sure that you run on the right shape and also generate the right cluster. Instance types are defined in the `/opt/oci-hpc/conf/queues.conf` file in yml format. Leave all of the field in there even if they are not used. You can define multiple queues and multiple instance type in each queue. If you do not select an instance type when creating your job, it will use the default one. +- cpu-bind: On Ubuntu 22.04, we are switching to cgroup v2, and we noticed that the default cpu-bind may cause issues when hyperthreading is turned off. If you get an error like `error: task_g_set_affinity: Invalid argument`, you can try running your job with `--cpu-bind=none` or `--cpu-bind=sockets`. ## Clusters folders: ``` /opt/oci-hpc/autoscaling/clusters/clustername From 1a5d4c753f769a1f44d8f4b500d3a77e5e07a927 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 12 Oct 2023 15:26:03 -0600 Subject: [PATCH 28/68] OpenLDAP Schema fix --- playbooks/roles/openldap/tasks/debian.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/openldap/tasks/debian.yml b/playbooks/roles/openldap/tasks/debian.yml index 2b4dc2fc..6504c6eb 100644 --- a/playbooks/roles/openldap/tasks/debian.yml +++ b/playbooks/roles/openldap/tasks/debian.yml @@ -148,7 +148,7 @@ command: ldapadd -c -Y EXTERNAL -H ldapi:/// -Q -f /etc/ldap/schema/{{ item }}.ldif args: creates: '/etc/ldap/slapd.d/cn=config/cn=schema/cn={?}{{ item }}.ldif' - with_items: '{{ openldap_schemas_20 }}' + with_items: '{{ openldap_schemas }}' when: ansible_distribution_version == 
'20.04' - name: Load OpenLDAP schemas 22.04 From 2b257369dce381e8fb7c8373872ee470063d6e2b Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 25 Oct 2023 11:03:46 -0600 Subject: [PATCH 29/68] Fix node ordering for changes in metadata --- playbooks/roles/rack-aware/files/node_ordering_by_rack.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/playbooks/roles/rack-aware/files/node_ordering_by_rack.py b/playbooks/roles/rack-aware/files/node_ordering_by_rack.py index d86b2d07..027709e5 100644 --- a/playbooks/roles/rack-aware/files/node_ordering_by_rack.py +++ b/playbooks/roles/rack-aware/files/node_ordering_by_rack.py @@ -67,10 +67,9 @@ def write_ordered_rankfile(ordered_hosts=[],hostfile=None): for h in hosts: out = subprocess.run(["ssh "+h+" \"curl -s http://169.254.169.254/opc/v1/host/\""],stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, universal_newlines=True, check=True) x = out.stdout.splitlines() - del x[-1] - del x[0] - rackId_str = x[1].split(":")[1].replace('"','') - rackId = rackId_str.replace(' ','') + json_str = ''.join(x) + json_data = json.loads(json_str) + rackId = json_data.get("rackId", None) if rackId in r: r[rackId].append( h ) else: From cab60deedc8abedcb62130c936e31ffe2d5a60dd Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 25 Oct 2023 11:03:58 -0600 Subject: [PATCH 30/68] Fix latency check --- playbooks/new_nodes.yml | 2 +- playbooks/resize_add.yml | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index 5d8d18d3..3ff9969e 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -176,7 +176,7 @@ tasks: - include_role: name: latency_check - when: cluster_network|bool and not 'GPU' in shape + when: cluster_network|bool and latency_check|bool - hosts: all become: true diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index a32fcba8..092391b7 100755 --- a/playbooks/resize_add.yml +++ 
b/playbooks/resize_add.yml @@ -174,12 +174,6 @@ - include_role: name: tuned -- hosts: compute_to_add - tasks: - - include_role: - name: latency_check - when: cluster_network|bool and not 'GPU' in shape - - hosts: all become: true tasks: From fbb90e9e41c4383ac980af053f14cd62e1150b9b Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Sun, 29 Oct 2023 23:11:38 -0600 Subject: [PATCH 31/68] Add Ping debug script --- samples/gpu/ping.sh | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 samples/gpu/ping.sh diff --git a/samples/gpu/ping.sh b/samples/gpu/ping.sh new file mode 100644 index 00000000..25cf4797 --- /dev/null +++ b/samples/gpu/ping.sh @@ -0,0 +1,9 @@ +#!/usr/bin/bash +f=$(mktemp) +HOST=$1 +ssh $HOST /usr/sbin/ip -j addr | jq -r '.[] | select(.ifname | test("rdma")) | .ifname + " " + .addr_info[0].local' > $f +while read -r l ; do + i=$(echo $l | awk '{print $1}') + ip=$(echo $l | awk '{print $2}') + ping -qI $i $ip -c1 ; done < $f +rm -rf $f \ No newline at end of file From b8bbaa203c07184bbbcfa6a480df739c9d5359d2 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Sun, 29 Oct 2023 23:13:52 -0600 Subject: [PATCH 32/68] Add roles for OCA update and RTTCC for H100 --- playbooks/new_nodes.yml | 6 +++ playbooks/resize_add.yml | 6 +++ .../roles/cloud-agent_update/tasks/el.yml | 35 ++++++++++++++++ .../roles/cloud-agent_update/tasks/main.yml | 4 ++ .../roles/cloud-agent_update/tasks/ubuntu.yml | 42 +++++++++++++++++++ playbooks/roles/rttcc/tasks/main.yml | 41 ++++++++++++++++++ playbooks/site.yml | 6 +++ 7 files changed, 140 insertions(+) create mode 100644 playbooks/roles/cloud-agent_update/tasks/el.yml create mode 100644 playbooks/roles/cloud-agent_update/tasks/main.yml create mode 100644 playbooks/roles/cloud-agent_update/tasks/ubuntu.yml create mode 100644 playbooks/roles/rttcc/tasks/main.yml diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index 3ff9969e..c8bbdfac 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml 
@@ -46,6 +46,9 @@ become: true gather_facts: true tasks: + - include_role: + name: cloud-agent_update + when: cluster_network|bool and use_compute_agent|default(false)|bool - include_role: name: oci-cn-auth when: cluster_network|bool and not use_compute_agent|default(false)|bool @@ -171,6 +174,9 @@ when: enroot|default(true)|bool - include_role: name: tuned + - include_role: + name: rttcc + when: shape == "BM.GPU.H100.8" and cluster_network|bool - hosts: compute tasks: diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 092391b7..5cba6928 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -44,6 +44,9 @@ become: true gather_facts: true tasks: + - include_role: + name: cloud-agent_update + when: cluster_network|bool and use_compute_agent|default(false)|bool - include_role: name: oci-cn-auth when: cluster_network|bool and not use_compute_agent|default(false)|bool @@ -173,6 +176,9 @@ when: enroot|default(true)|bool - include_role: name: tuned + - include_role: + name: rttcc + when: shape == "BM.GPU.H100.8" and cluster_network|bool - hosts: all become: true diff --git a/playbooks/roles/cloud-agent_update/tasks/el.yml b/playbooks/roles/cloud-agent_update/tasks/el.yml new file mode 100644 index 00000000..15869ed2 --- /dev/null +++ b/playbooks/roles/cloud-agent_update/tasks/el.yml @@ -0,0 +1,35 @@ +--- +- name: Check oracle cloud agent version + shell: "yum info oracle-cloud-agent | grep Version | awk '{print $3}'" + register: version + when: use_compute_agent | bool + +- name: Install OCA v1.37 for OL8 + vars: + - major_version: "{{version.stdout.split('.')[1] }}" + - minor_version: "{{version.stdout.split('.')[0] }}" + yum: + name: "https://objectstorage.us-phoenix-1.oraclecloud.com/p/Yew5fbqcx5j-H_W_0BKHut2yEMb4DJ0u5LUEttGCcCB_F_5a1GnmaGK1wyksue1z/n/imagegen/b/agent_test/o/1.37.0/1/oracle-cloud-agent-1.37.0-10347.el8.x86_64.rpm" + state: present + disable_gpg_check: yes + when: + - ansible_os_family == 'RedHat' + - 
ansible_distribution_major_version == '8' + - (minor_version | int <= 1) | bool + - (major_version | int < 37) | bool + - use_compute_agent | bool + +- name: Install OCA v1.37 for OL7 + vars: + - major_version: "{{version.stdout.split('.')[1] }}" + - minor_version: "{{version.stdout.split('.')[0] }}" + yum: + name: "https://objectstorage.us-phoenix-1.oraclecloud.com/p/97KluWDRcyG1IBdhGvCbKGkrq6TbGefawT8nJM5ko9SAOVfeSL9AOLk_UQCX0hs4/n/imagegen/b/agent_test/o/1.37.0/1/oracle-cloud-agent-1.37.0-10347.el7.x86_64.rpm" + state: present + disable_gpg_check: yes + when: + - ansible_os_family == 'RedHat' + - ansible_distribution_major_version == '7' + - (minor_version | int <= 1) | bool + - (major_version | int < 37) | bool + - use_compute_agent | bool \ No newline at end of file diff --git a/playbooks/roles/cloud-agent_update/tasks/main.yml b/playbooks/roles/cloud-agent_update/tasks/main.yml new file mode 100644 index 00000000..ea4d5d2a --- /dev/null +++ b/playbooks/roles/cloud-agent_update/tasks/main.yml @@ -0,0 +1,4 @@ +- include: el.yml + when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' +- include: ubuntu.yml + when: ansible_os_family == 'Debian' diff --git a/playbooks/roles/cloud-agent_update/tasks/ubuntu.yml b/playbooks/roles/cloud-agent_update/tasks/ubuntu.yml new file mode 100644 index 00000000..7bbf25ff --- /dev/null +++ b/playbooks/roles/cloud-agent_update/tasks/ubuntu.yml @@ -0,0 +1,42 @@ +--- +- name: Check oracle cloud agent version + shell: "snap info oracle-cloud-agent | grep installed | awk '{print $2}'" + register: version + when : use_compute_agent | bool + +- name: Download Snap package + vars: + - major_version: "{{version.stdout.split('.')[1] }}" + - minor_version: "{{version.stdout.split('.')[0] }}" + get_url: + url: "https://objectstorage.us-phoenix-1.oraclecloud.com/p/OySIUys5Zz0uFz-3XIWvpYKIM90gLWif7TRUBh2jUQd0R_bNyzlt1WzrkdJYfvYY/n/imagegen/b/agent_test/o/1.37.0/1/oracle-cloud-agent_1.37.0-2_amd64.snap" + dest: 
"/tmp/oracle-cloud-agent_1.37.0-2_amd64.snap" + when : + - (minor_version | int <= 1) | bool + - (major_version | int < 37) | bool + - use_compute_agent | bool + + ## The ansible snap module is not upgrading properly if the package already exists. + # - name: Install OCA snap v1.37 + # vars: + # - major_version: "{{version.stdout.split('.')[1] }}" + # - minor_version: "{{version.stdout.split('.')[0] }}" + # become: true + # community.general.snap: + # classic: true + # dangerous: true + # name: "/tmp/oracle-cloud-agent_1.37.0-2_amd64.snap" + # when : + # - (minor_version | int <= 1) | bool and (minor_version | int < 37) | bool + # - use_compute_agent | bool + +- name: Snap update + vars: + - major_version: "{{version.stdout.split('.')[1] }}" + - minor_version: "{{version.stdout.split('.')[0] }}" + become: true + shell: "snap install --classic --dangerous /tmp/oracle-cloud-agent_1.37.0-2_amd64.snap" + when : + - (minor_version | int <= 1) | bool + - (major_version | int < 37) | bool + - use_compute_agent | bool \ No newline at end of file diff --git a/playbooks/roles/rttcc/tasks/main.yml b/playbooks/roles/rttcc/tasks/main.yml new file mode 100644 index 00000000..cb71e767 --- /dev/null +++ b/playbooks/roles/rttcc/tasks/main.yml @@ -0,0 +1,41 @@ +- name: Enable dcqcn + become: true + shell: 'mlxreg -d {{item}} -y --set "cmd_type=2" --reg_name=PPCC --indexes="local_port=1,pnat=0,lp_msb=0,algo_slot=0,algo_param_index=0"' + with_items: + - mlx5_0 + - mlx5_1 + - mlx5_3 + - mlx5_4 + - mlx5_5 + - mlx5_6 + - mlx5_7 + - mlx5_8 + - mlx5_9 + - mlx5_10 + - mlx5_12 + - mlx5_13 + - mlx5_14 + - mlx5_15 + - mlx5_16 + - mlx5_17 + +- name: Disable rttcc + become: true + shell: 'mlxreg -d {{item}} -y --set "cmd_type=1" --reg_name=PPCC --indexes="local_port=1,pnat=0,lp_msb=0,algo_slot=15,algo_param_index=0"' + with_items: + - mlx5_0 + - mlx5_1 + - mlx5_3 + - mlx5_4 + - mlx5_5 + - mlx5_6 + - mlx5_7 + - mlx5_8 + - mlx5_9 + - mlx5_10 + - mlx5_12 + - mlx5_13 + - mlx5_14 + - mlx5_15 + - 
mlx5_16 + - mlx5_17 \ No newline at end of file diff --git a/playbooks/site.yml b/playbooks/site.yml index caf0fcba..18a191b9 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -50,6 +50,9 @@ become: true gather_facts: true tasks: + - include_role: + name: cloud-agent_update + when: cluster_network|bool and use_compute_agent|default(false)|bool - include_role: name: oci-cn-auth when: cluster_network|bool and not use_compute_agent|default(false)|bool @@ -290,6 +293,9 @@ when: enroot|default(true)|bool - include_role: name: tuned + - include_role: + name: rttcc + when: shape == "BM.GPU.H100.8" and cluster_network|bool - hosts: all become: true From 125e4359852a8ca765c48a5bde3a9a06fc8a6d0e Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Sun, 29 Oct 2023 23:14:15 -0600 Subject: [PATCH 33/68] Add H100 Topology --- samples/gpu/H100-topology.xml | 167 ++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 samples/gpu/H100-topology.xml diff --git a/samples/gpu/H100-topology.xml b/samples/gpu/H100-topology.xml new file mode 100644 index 00000000..3dcce372 --- /dev/null +++ b/samples/gpu/H100-topology.xml @@ -0,0 +1,167 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 49212c451973949f08508fea7961845d58c206de Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 30 Oct 2023 12:17:41 -0600 Subject: [PATCH 34/68] H100 NCCL test file --- samples/gpu/nccl_run_allreduce_H100.sbatch | 59 ++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 samples/gpu/nccl_run_allreduce_H100.sbatch diff --git a/samples/gpu/nccl_run_allreduce_H100.sbatch b/samples/gpu/nccl_run_allreduce_H100.sbatch new file mode 100644 index 
00000000..1addffa3 --- /dev/null +++ b/samples/gpu/nccl_run_allreduce_H100.sbatch @@ -0,0 +1,59 @@ +#!/bin/bash +#SBATCH --job-name=nccl-allreduce-slurm +#SBATCH --nodes=2 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --exclusive +export PMI_DEBUG=1 + + +cd /nfs/scratch +mkdir $SLURM_JOB_ID +cd $SLURM_JOB_ID + +MACHINEFILE="hostfile" + +scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE +echo MACHINEFILE +cat $MACHINEFILE + +source /etc/os-release + +mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + +if [[ "$mpivars_path" == "" ]]; then + mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh` +fi + +if [[ "$mpivars_path" == "" ]]; then + echo "Could not find MPIPATH"; exit; fi + +source $mpivars_path + +export NCCL_DEBUG=WARN + +shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` +if [ $shape == \"BM.GPU.H100.8\" ] +then + var_UCX_NET_DEVICES=eth0 + var_NCCL_IB_HCA="=mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17" +else + echo "Use the appropriate nccl test run script for non H100 nodes" +fi + + mpirun --mca pml ucx \ + --bind-to numa \ + --mca coll ^hcoll \ + -x NCCL_DEBUG=WARN \ + -x NCCL_IB_SL=0 \ + -x NCCL_IB_TC=41 \ + -x NCCL_IB_QPS_PER_CONNECTION=16 \ + -x UCX_TLS=tcp \ + -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ + -x HCOLL_ENABLE_MCAST_ALL=0 \ + -x coll_hcoll_enable=0 \ + -x NCCL_IB_GID_INDEX=3 \ + -x NCCL_ALGO=Ring \ + -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ + -x NCCL_TOPO_FILE=~/H100-topology.xml \ + --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 \ No newline at end of file From 9ef7d9d2b69e91acf144e241f50c7722c4f1a42d Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 30 Oct 2023 12:18:12 -0600 Subject: [PATCH 35/68] Move the H100-topology file to home --- samples/prep_sample_files.sh | 1 + 1 
file changed, 1 insertion(+) diff --git a/samples/prep_sample_files.sh b/samples/prep_sample_files.sh index 6c7177d2..2c1f43a8 100644 --- a/samples/prep_sample_files.sh +++ b/samples/prep_sample_files.sh @@ -9,5 +9,6 @@ done; cp nccl_compile/compile.sh ~ cp gpu/*.sbatch ~ +cp gpu/H100* ~ cp /opt/oci-hpc/bin/node_ordering_by_rack.py ~ From cb2546171a5da097fbf11392ae3665c4c22ff5d1 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 30 Oct 2023 12:21:38 -0600 Subject: [PATCH 36/68] Change Ring to Auto --- samples/gpu/nccl_run_allreduce_H100.sbatch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/gpu/nccl_run_allreduce_H100.sbatch b/samples/gpu/nccl_run_allreduce_H100.sbatch index 1addffa3..164751b8 100644 --- a/samples/gpu/nccl_run_allreduce_H100.sbatch +++ b/samples/gpu/nccl_run_allreduce_H100.sbatch @@ -53,7 +53,7 @@ fi -x HCOLL_ENABLE_MCAST_ALL=0 \ -x coll_hcoll_enable=0 \ -x NCCL_IB_GID_INDEX=3 \ - -x NCCL_ALGO=Ring \ + -x NCCL_ALGO=Auto \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ -x NCCL_TOPO_FILE=~/H100-topology.xml \ --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 \ No newline at end of file From 2a96aa7e8885fa36fd6b21474a267f80c7ae763c Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 1 Nov 2023 20:39:52 -0600 Subject: [PATCH 37/68] Adding Topology for containers --- samples/gpu/H100-topology-container.xml | 166 ++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 samples/gpu/H100-topology-container.xml diff --git a/samples/gpu/H100-topology-container.xml b/samples/gpu/H100-topology-container.xml new file mode 100644 index 00000000..85975887 --- /dev/null +++ b/samples/gpu/H100-topology-container.xml @@ -0,0 +1,166 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 42c0ce3f8bf3082087482861e98853b803a614f6 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 1 Nov 2023 20:40:06 -0600 Subject: [PATCH 38/68] Update task name --- playbooks/roles/rdma-interface/tasks/debian.yml | 2 +- playbooks/roles/rdma-interface/tasks/el.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/rdma-interface/tasks/debian.yml b/playbooks/roles/rdma-interface/tasks/debian.yml index 42d28db8..cfc85ba0 100644 --- a/playbooks/roles/rdma-interface/tasks/debian.yml +++ b/playbooks/roles/rdma-interface/tasks/debian.yml @@ -4,7 +4,7 @@ # Another option is to detect the mlx5_core driver, but it's harder to tell whch interface can be activated # -- name: Are we running on a new image? +- name: Are we running on an image with OCI-CN-AUTH packages? stat: path: /sbin/oci-rdma-configure register: new_image diff --git a/playbooks/roles/rdma-interface/tasks/el.yml b/playbooks/roles/rdma-interface/tasks/el.yml index 21d02c50..395a227a 100755 --- a/playbooks/roles/rdma-interface/tasks/el.yml +++ b/playbooks/roles/rdma-interface/tasks/el.yml @@ -17,7 +17,7 @@ instance_metadata: "{{ i_metadata['content'] }}" -- name: Are we running on a new image? +- name: Are we running on an image with OCI-CN-AUTH packages? 
stat: path: /sbin/oci-rdma-configure register: new_image From 7c166cfbe79afe00183cd4027df2568e9b7eee07 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 1 Nov 2023 20:40:29 -0600 Subject: [PATCH 39/68] Update agent to v1.37.2 if version is lower --- playbooks/roles/cloud-agent_update/tasks/el.yml | 12 ++++++++---- .../roles/cloud-agent_update/tasks/ubuntu.yml | 15 ++++++++++----- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/playbooks/roles/cloud-agent_update/tasks/el.yml b/playbooks/roles/cloud-agent_update/tasks/el.yml index 15869ed2..60211cd3 100644 --- a/playbooks/roles/cloud-agent_update/tasks/el.yml +++ b/playbooks/roles/cloud-agent_update/tasks/el.yml @@ -8,28 +8,32 @@ vars: - major_version: "{{version.stdout.split('.')[1] }}" - minor_version: "{{version.stdout.split('.')[0] }}" + - sub_version: "{{version.stdout.split('.')[2].split('-')[0] }}" yum: - name: "https://objectstorage.us-phoenix-1.oraclecloud.com/p/Yew5fbqcx5j-H_W_0BKHut2yEMb4DJ0u5LUEttGCcCB_F_5a1GnmaGK1wyksue1z/n/imagegen/b/agent_test/o/1.37.0/1/oracle-cloud-agent-1.37.0-10347.el8.x86_64.rpm" + name: "https://objectstorage.us-phoenix-1.oraclecloud.com/p/aV_mSl96KIiapAeZtsyo-SUcPCSurDfWaj06f4XVVoNKIsxvqlZ65guPTnMuNawR/n/imagegen/b/agent_test/o/1.37.0/3/oracle-cloud-agent-1.37.2-10459.el8.x86_64.rpm" state: present disable_gpg_check: yes when: - ansible_os_family == 'RedHat' - ansible_distribution_major_version == '8' - (minor_version | int <= 1) | bool - - (major_version | int < 37) | bool + - (major_version | int <= 37) | bool + - (sub_version | int < 2) | bool - use_compute_agent | bool - name: Install OCA v1.37 for OL7 vars: - major_version: "{{version.stdout.split('.')[1] }}" - minor_version: "{{version.stdout.split('.')[0] }}" + - sub_version: "{{version.stdout.split('.')[2].split('-')[0] }}" yum: - name: 
"https://objectstorage.us-phoenix-1.oraclecloud.com/p/97KluWDRcyG1IBdhGvCbKGkrq6TbGefawT8nJM5ko9SAOVfeSL9AOLk_UQCX0hs4/n/imagegen/b/agent_test/o/1.37.0/1/oracle-cloud-agent-1.37.0-10347.el7.x86_64.rpm" + name: "https://objectstorage.us-phoenix-1.oraclecloud.com/p/YmPlysZFl4CKrLTKN9Rj0CMPt8qiJgflvF4vXsOaaqOfcm5NMnyBJl_dlC0V0lTo/n/imagegen/b/agent_test/o/1.37.0/3/oracle-cloud-agent-1.37.2-10459.el7.x86_64.rpm" state: present disable_gpg_check: yes when: - ansible_os_family == 'RedHat' - ansible_distribution_major_version == '7' - (minor_version | int <= 1) | bool - - (major_version | int < 37) | bool + - (major_version | int <= 37) | bool + - (sub_version | int < 2) | bool - use_compute_agent | bool \ No newline at end of file diff --git a/playbooks/roles/cloud-agent_update/tasks/ubuntu.yml b/playbooks/roles/cloud-agent_update/tasks/ubuntu.yml index 7bbf25ff..813e0859 100644 --- a/playbooks/roles/cloud-agent_update/tasks/ubuntu.yml +++ b/playbooks/roles/cloud-agent_update/tasks/ubuntu.yml @@ -8,12 +8,14 @@ vars: - major_version: "{{version.stdout.split('.')[1] }}" - minor_version: "{{version.stdout.split('.')[0] }}" + - sub_version: "{{version.stdout.split('.')[2].split('-')[0] }}" get_url: - url: "https://objectstorage.us-phoenix-1.oraclecloud.com/p/OySIUys5Zz0uFz-3XIWvpYKIM90gLWif7TRUBh2jUQd0R_bNyzlt1WzrkdJYfvYY/n/imagegen/b/agent_test/o/1.37.0/1/oracle-cloud-agent_1.37.0-2_amd64.snap" - dest: "/tmp/oracle-cloud-agent_1.37.0-2_amd64.snap" + url: "https://objectstorage.us-phoenix-1.oraclecloud.com/p/KVgO3DVFyTiLdCJLZUNYZlYCF5mcnaTjemmMUmyOB1Ln7Vkii-llp7QoQDiRF66T/n/imagegen/b/agent_test/o/1.37.0/4/oracle-cloud-agent_1.37.2-9_amd64.snap" + dest: "/tmp/oracle-cloud-agent_1.37.2-9_amd64.snap" when : - (minor_version | int <= 1) | bool - - (major_version | int < 37) | bool + - (major_version | int <= 37) | bool + - (sub_version | int < 2) | bool - use_compute_agent | bool ## The ansible snap module is not upgrading properly if the package already exists. 
@@ -34,9 +36,12 @@ vars: - major_version: "{{version.stdout.split('.')[1] }}" - minor_version: "{{version.stdout.split('.')[0] }}" + - sub_version: "{{version.stdout.split('.')[2].split('-')[0] }}" + become: true - shell: "snap install --classic --dangerous /tmp/oracle-cloud-agent_1.37.0-2_amd64.snap" + shell: "snap install --classic --dangerous /tmp/oracle-cloud-agent_1.37.2-9_amd64.snap" when : - (minor_version | int <= 1) | bool - - (major_version | int < 37) | bool + - (major_version | int <= 37) | bool + - (sub_version | int < 2) | bool - use_compute_agent | bool \ No newline at end of file From 8cc04227c3a97137ef3d5c8039e36567c2ec88ca Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 1 Nov 2023 20:41:00 -0600 Subject: [PATCH 40/68] Remove RTTCC disabling role. Now part of Agent --- playbooks/new_nodes.yml | 3 --- playbooks/resize_add.yml | 3 --- playbooks/site.yml | 3 --- 3 files changed, 9 deletions(-) diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index c8bbdfac..3be0cb57 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -174,9 +174,6 @@ when: enroot|default(true)|bool - include_role: name: tuned - - include_role: - name: rttcc - when: shape == "BM.GPU.H100.8" and cluster_network|bool - hosts: compute tasks: diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 5cba6928..e3070b40 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -176,9 +176,6 @@ when: enroot|default(true)|bool - include_role: name: tuned - - include_role: - name: rttcc - when: shape == "BM.GPU.H100.8" and cluster_network|bool - hosts: all become: true diff --git a/playbooks/site.yml b/playbooks/site.yml index 18a191b9..975fdbbf 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -293,9 +293,6 @@ when: enroot|default(true)|bool - include_role: name: tuned - - include_role: - name: rttcc - when: shape == "BM.GPU.H100.8" and cluster_network|bool - hosts: all become: true From 
9c6d80852234d26a8fb89b29137f29d1f4a58f45 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 1 Nov 2023 20:41:45 -0600 Subject: [PATCH 41/68] Change agent box position and default disk size --- schema.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/schema.yaml b/schema.yaml index 20a0a1e1..a761f65c 100755 --- a/schema.yaml +++ b/schema.yaml @@ -56,11 +56,11 @@ variableGroups: - ${node_count} - ${hyperthreading} - ${boot_volume_size} + - ${use_compute_agent} - ${use_marketplace_image} - ${compute_username} - ${marketplace_listing} - ${unsupported} - - ${use_compute_agent} - ${compute_image_compartment} - ${image} - ${image_ocid} @@ -446,7 +446,7 @@ variables: required: true minimum: 50 title: "Size of the boot volume in GB" - default: 50 + default: 100 bastion_boot_volume_backup: type: boolean @@ -706,7 +706,7 @@ variables: required: true minimum: 50 title: "Size of the boot volume in GB" - default: 50 + default: 100 description: "Boot volume size in GB of each compute node" use_marketplace_image: @@ -731,8 +731,8 @@ variables: use_compute_agent: type: boolean title: "use compute agent" - description: "Use compute agent" - default: false + description: "Select if your image has the OCA agent rather than the oci-cn-auth package. The marketplace image does not use the compute agent for now." 
+ default: true compute_image_compartment: title: "compute image compartment" From 85086533483ab7bf34d1df3f17f48ec2bab79907 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 1 Nov 2023 20:42:44 -0600 Subject: [PATCH 42/68] Change default disk size --- conf/queues.conf.example | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/queues.conf.example b/conf/queues.conf.example index fe7d2aa0..fe187437 100644 --- a/conf/queues.conf.example +++ b/conf/queues.conf.example @@ -19,7 +19,7 @@ private_subnet_id: ocid1.subnet.oc1..aaaaaaaaf5greyx5b45hkh7rwm2osef7t5lofl5tag5uijjqlhw6duaaaaa #TO EDIT image: ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa #TO EDIT targetCompartment: ocid1.compartment.oc1.. #TO EDIT - boot_volume_size: 50 + boot_volume_size: 100 use_marketplace_image: true instance_pool_ocpus: 2 instance_pool_memory: 16 @@ -42,7 +42,7 @@ private_subnet_id: ocid1.subnet.oc1..aaaaaaaaf5greyx5b45hkh7rwm2osef7t5lofl5tag5uijjqlhw6duaaaaa #TO EDIT image: ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa #TO EDIT targetCompartment: ocid1.compartment.oc1.. #TO EDIT - boot_volume_size: 50 + boot_volume_size: 100 use_marketplace_image: true instance_pool_ocpus: 2 instance_pool_memory: 16 @@ -67,7 +67,7 @@ private_subnet_id: ocid1.subnet.oc1..aaaaaaaaf5greyx5b45hkh7rwm2osef7t5lofl5tag5uijjqlhw6duaaaaa #TO EDIT image: ocid1.image.oc1.eu-frankfurt-1.aaaaaaaasrw572lodukbaxlwrrq5zcqe3zldh3krkgtq3lm463lann4ksxeq #TO EDIT targetCompartment: ocid1.compartment.oc1.. #TO EDIT - boot_volume_size: 50 + boot_volume_size: 100 use_marketplace_image: false instance_pool_ocpus: 2 instance_pool_memory: 16 @@ -89,7 +89,7 @@ private_subnet_id: ocid1.subnet.oc1..aaaaaaaaf5greyx5b45hkh7rwm2osef7t5lofl5tag5uijjqlhw6duaaaaa #TO EDIT image: ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa #TO EDIT targetCompartment: ocid1.compartment.oc1.. 
#TO EDIT - boot_volume_size: 50 + boot_volume_size: 100 use_marketplace_image: false instance_pool_ocpus: 2 instance_pool_memory: 2 From b08ea5f79c7bbbd6c882eb340ceedd33b1d79f12 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 1 Nov 2023 20:43:07 -0600 Subject: [PATCH 43/68] Always create the ansible conf file --- bin/bastion.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/bastion.sh b/bin/bastion.sh index bb8b1933..b0f2d15f 100644 --- a/bin/bastion.sh +++ b/bin/bastion.sh @@ -177,14 +177,15 @@ if [ ! -d /etc/ansible ] ; then else sudo chown opc:opc /etc/ansible fi - ansible-config init --disabled -t all > /etc/ansible/ansible.cfg fi +ansible-config init --disabled -t all | sudo tee /etc/ansible/ansible.cfg sudo sed -i "s/^\(#\|;\)forks.*/forks = ${forks}/" /etc/ansible/ansible.cfg sudo sed -i "s/^\(#\|;\)fact_caching=.*/fact_caching=jsonfile/" /etc/ansible/ansible.cfg -sudo sed -i "s/^\(#\|;\)fact_caching_connection.*/fact_caching_connection=\/tmp\/ansible/" /etc/ansible/ansible.cfg +sudo sed -i "0,/^\(#\|;\)fact_caching_connection.*/s//fact_caching_connection=\/tmp\/ansible/" /etc/ansible/ansible.cfg sudo sed -i "s/^\(#\|;\)bin_ansible_callbacks.*/bin_ansible_callbacks=True/" /etc/ansible/ansible.cfg sudo sed -i "s/^\(#\|;\)stdout_callback.*/stdout_callback=yaml/" /etc/ansible/ansible.cfg sudo sed -i "s/^\(#\|;\)retries.*/retries=5/" /etc/ansible/ansible.cfg sudo sed -i "s/^\(#\|;\)connect_timeout.*/connect_timeout=300/" /etc/ansible/ansible.cfg -sudo sed -i "s/^\(#\|;\)command_timeout.*/command_timeout=120/" /etc/ansible/ansible.cfg \ No newline at end of file +sudo sed -i "s/^\(#\|;\)command_timeout.*/command_timeout=120/" /etc/ansible/ansible.cfg + From 44afdcee6af7eafe82728de881aac87037daf74a Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 9 Nov 2023 15:39:08 -0700 Subject: [PATCH 44/68] Fix small error in topology without rack aware --- playbooks/roles/slurm/tasks/compute.yml | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/playbooks/roles/slurm/tasks/compute.yml b/playbooks/roles/slurm/tasks/compute.yml index 56666f8f..6dbf46cc 100755 --- a/playbooks/roles/slurm/tasks/compute.yml +++ b/playbooks/roles/slurm/tasks/compute.yml @@ -116,7 +116,7 @@ - name: Get nodes from Inactive Switch vars: - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" - shell: "cat {{ slurm_conf_path }}/topology.conf | grep \"SwitchName=inactive-{{queue}}-{{keyword}}\"" + shell: "cat {{ slurm_conf_path }}/topology.conf | grep \"SwitchName=inactive-{{queue}}-{{keyword}} \"" register: inactive_switch run_once: true delegate_to: 127.0.0.1 From 23ff3c66b0eb241d57d10dac7c267f65d566d618 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 27 Nov 2023 13:26:55 -0700 Subject: [PATCH 45/68] Verify the computeclusters are included in API ver --- bin/resize.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bin/resize.py b/bin/resize.py index 4972a6e2..5faf6273 100644 --- a/bin/resize.py +++ b/bin/resize.py @@ -461,7 +461,11 @@ def get_summary(comp_ocid,cluster_name): elif cn_summary_tmp.lifecycle_state == "SCALING": scaling_clusters = scaling_clusters + 1 if running_clusters == 0: - cn_summaries = computeClient.list_compute_clusters(comp_ocid,display_name=cluster_name).data.items + try: + cn_summaries = computeClient.list_compute_clusters(comp_ocid,display_name=cluster_name).data.items + except: + print("The list_compute_clusters call returned an error, considering no Compute CLusters are present") + cn_summaries = [] if len(cn_summaries) > 0: CN = "CC" for cn_summary_tmp in cn_summaries: From 24a6c23bd9dceb23607913e398bba353e12564f9 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 27 Nov 2023 13:32:06 -0700 Subject: [PATCH 46/68] Add E5 Support --- playbooks/roles/slurm/templates/slurm.conf.j2 | 4 +++ 
schema.yaml | 27 +++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index 0fbc07c1..0d61eb25 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -105,6 +105,8 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boar NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "VM.Standard.E4.Flex" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} +{% elif instance.shape == "VM.Standard.E5.Flex" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "VM.Optimized3.Flex" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "VM.Standard3.Flex" %} @@ -129,6 +131,8 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boar NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 
SocketsPerBoard=2 CoresPerSocket=18 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.HPC.E5.144" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=72 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} +{% elif instance.shape == "BM.Standard.E5.192" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=96 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.Optimized3.36" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=18 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif "VM.Standard2." 
in instance.shape %} diff --git a/schema.yaml b/schema.yaml index a761f65c..be0526d1 100755 --- a/schema.yaml +++ b/schema.yaml @@ -284,6 +284,9 @@ variables: - eq: - ${bastion_shape} - "VM.Standard.E4.Flex" + - eq: + - ${bastion_shape} + - "VM.Standard.E5.Flex" - eq: - ${bastion_shape} - "VM.Optimized3.Flex" @@ -328,6 +331,9 @@ variables: - eq: - ${bastion_shape} - "VM.Standard.E4.Flex" + - eq: + - ${bastion_shape} + - "VM.Standard.E5.Flex" - eq: - ${bastion_shape} - "VM.Standard.A1.Flex" @@ -354,6 +360,9 @@ variables: - eq: - ${bastion_shape} - "VM.Standard.E4.Flex" + - eq: + - ${bastion_shape} + - "VM.Standard.E5.Flex" - eq: - ${bastion_shape} - "VM.Standard.A1.Flex" @@ -607,6 +616,9 @@ variables: - eq: - ${instance_pool_shape} - "VM.Standard.E4.Flex" + - eq: + - ${instance_pool_shape} + - "VM.Standard.E5.Flex" - eq: - ${instance_pool_shape} - "VM.Standard.A1.Flex" @@ -650,6 +662,9 @@ variables: - eq: - ${instance_pool_shape} - "VM.Standard.E4.Flex" + - eq: + - ${instance_pool_shape} + - "VM.Standard.E5.Flex" - eq: - ${instance_pool_shape} - "VM.Standard.A1.Flex" @@ -676,6 +691,9 @@ variables: - eq: - ${instance_pool_shape} - "VM.Standard.E4.Flex" + - eq: + - ${instance_pool_shape} + - "VM.Standard.E5.Flex" - eq: - ${instance_pool_shape} - "VM.Standard.A1.Flex" @@ -1348,6 +1366,9 @@ variables: - eq: - ${login_shape} - "VM.Standard.E4.Flex" + - eq: + - ${login_shape} + - "VM.Standard.E5.Flex" - eq: - ${login_shape} - "VM.Optimized3.Flex" @@ -1394,6 +1415,9 @@ variables: - eq: - ${login_shape} - "VM.Standard.E4.Flex" + - eq: + - ${login_shape} + - "VM.Standard.E5.Flex" - eq: - ${login_shape} - "VM.Standard.A1.Flex" @@ -1421,6 +1445,9 @@ variables: - eq: - ${login_shape} - "VM.Standard.E4.Flex" + - eq: + - ${login_shape} + - "VM.Standard.E5.Flex" - eq: - ${login_shape} - "VM.Standard.A1.Flex" From 136b8862428a05f4b11d4d07fd2543bafc608f57 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 27 Nov 2023 13:33:25 -0700 Subject: [PATCH 47/68] Fix indent --- 
bin/bastion.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/bastion.sh b/bin/bastion.sh index b0f2d15f..2f1de7ad 100644 --- a/bin/bastion.sh +++ b/bin/bastion.sh @@ -51,7 +51,7 @@ if [ $ID == "ol" ] || [ $ID == "centos" ] ; then fi sudo yum-config-manager --add-repo https://rpm.releases.hashicorp.com/RHEL/hashicorp.repo sudo yum install -y terraform - sudo python3 -m pip install oci-cli --upgrade + sudo python3 -m pip install oci-cli --upgrade elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then From a2fb7f5a361ca3b5732efccb31a0b443a9ae896d Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 28 Nov 2023 12:34:51 -0700 Subject: [PATCH 48/68] Fix OCI-CLI install on OL79 --- bin/bastion.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/bastion.sh b/bin/bastion.sh index 2f1de7ad..501cc559 100644 --- a/bin/bastion.sh +++ b/bin/bastion.sh @@ -43,14 +43,14 @@ if [ $ID == "ol" ] || [ $ID == "centos" ] ; then sudo mkdir /etc/ansible sudo ln -s /usr/local/bin/ansible-playbook /bin/ansible-playbook sudo ln -s /usr/local/bin/ansible /bin/ansible - sudo python3 -m pip install -U pip - sudo python3 -m pip install netaddr --upgrade - sudo python3 -m pip install setuptools_rust --upgrade - sudo python3 -m pip install requests --upgrade - sudo python3 -m pip install urllib3 --upgrade fi sudo yum-config-manager --add-repo https://rpm.releases.hashicorp.com/RHEL/hashicorp.repo sudo yum install -y terraform + sudo python3 -m pip install -U pip + sudo python3 -m pip install netaddr --upgrade + sudo python3 -m pip install setuptools_rust --upgrade + sudo python3 -m pip install requests --upgrade + sudo python3 -m pip install urllib3 --upgrade sudo python3 -m pip install oci-cli --upgrade From 94649f24882722674d91a7985415c70c1a9f2cd8 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 30 Nov 2023 19:02:26 -0700 Subject: [PATCH 49/68] Update PIP3 on all nodes --- playbooks/roles/packages/tasks/centos-7.yml | 10 
+++++++++- playbooks/roles/packages/tasks/debian.yml | 7 +++++++ playbooks/roles/packages/tasks/el-7.yml | 8 ++++++++ playbooks/roles/packages/tasks/ol-7.yml | 8 ++++++++ playbooks/roles/packages/tasks/ol-8.yml | 12 +++++++++++- playbooks/roles/packages/tasks/ubuntu-2204.yml | 10 +++++++++- playbooks/roles/packages/tasks/ubuntu.yml | 10 +++++++++- 7 files changed, 61 insertions(+), 4 deletions(-) diff --git a/playbooks/roles/packages/tasks/centos-7.yml b/playbooks/roles/packages/tasks/centos-7.yml index 30a8dace..bfccaf3c 100644 --- a/playbooks/roles/packages/tasks/centos-7.yml +++ b/playbooks/roles/packages/tasks/centos-7.yml @@ -9,4 +9,12 @@ - python3-pip package_state: latest include_role: - name: safe_yum \ No newline at end of file + name: safe_yum + +- name: Upgrade Pip3 + become: true + pip: + name: [pip] + state: latest + executable: pip3 + ignore_errors: yes \ No newline at end of file diff --git a/playbooks/roles/packages/tasks/debian.yml b/playbooks/roles/packages/tasks/debian.yml index d3911656..b6808410 100644 --- a/playbooks/roles/packages/tasks/debian.yml +++ b/playbooks/roles/packages/tasks/debian.yml @@ -13,3 +13,10 @@ include_role: name: safe_yum +- name: Upgrade Pip3 + become: true + pip: + name: [pip] + state: latest + executable: pip3 + ignore_errors: yes \ No newline at end of file diff --git a/playbooks/roles/packages/tasks/el-7.yml b/playbooks/roles/packages/tasks/el-7.yml index 793cd912..d0c23143 100755 --- a/playbooks/roles/packages/tasks/el-7.yml +++ b/playbooks/roles/packages/tasks/el-7.yml @@ -13,6 +13,14 @@ name: safe_yum ignore_errors: true +- name: Upgrade Pip3 + become: true + pip: + name: [pip] + state: latest + executable: pip3 + ignore_errors: yes + - name: install oci-cli latest version become: true pip: diff --git a/playbooks/roles/packages/tasks/ol-7.yml b/playbooks/roles/packages/tasks/ol-7.yml index f3380d49..7159eee4 100644 --- a/playbooks/roles/packages/tasks/ol-7.yml +++ b/playbooks/roles/packages/tasks/ol-7.yml @@ -16,6 
+16,14 @@ include_role: name: safe_yum +- name: Upgrade Pip3 + become: true + pip: + name: [pip] + state: latest + executable: pip3 + ignore_errors: yes + - name: install oci-cli latest version become: true pip: diff --git a/playbooks/roles/packages/tasks/ol-8.yml b/playbooks/roles/packages/tasks/ol-8.yml index b3733379..ad1d9877 100644 --- a/playbooks/roles/packages/tasks/ol-8.yml +++ b/playbooks/roles/packages/tasks/ol-8.yml @@ -17,6 +17,15 @@ name: safe_yum ignore_errors: true + +- name: Upgrade Pip3 + become: true + pip: + name: [pip] + state: latest + executable: pip3 + ignore_errors: yes + - name: install oci-cli latest version become: true pip: @@ -24,4 +33,5 @@ state: latest executable: pip3 ignore_errors: yes - when: ('bastion' in group_names) \ No newline at end of file + when: ('bastion' in group_names) + diff --git a/playbooks/roles/packages/tasks/ubuntu-2204.yml b/playbooks/roles/packages/tasks/ubuntu-2204.yml index a3b9541a..cb11d80b 100644 --- a/playbooks/roles/packages/tasks/ubuntu-2204.yml +++ b/playbooks/roles/packages/tasks/ubuntu-2204.yml @@ -20,4 +20,12 @@ package_state: latest include_role: name: safe_yum - ignore_errors: true \ No newline at end of file + ignore_errors: true + + - name: Upgrade Pip3 + become: true + pip: + name: [pip] + state: latest + executable: pip3 + ignore_errors: yes \ No newline at end of file diff --git a/playbooks/roles/packages/tasks/ubuntu.yml b/playbooks/roles/packages/tasks/ubuntu.yml index 408e6075..8c091071 100644 --- a/playbooks/roles/packages/tasks/ubuntu.yml +++ b/playbooks/roles/packages/tasks/ubuntu.yml @@ -14,4 +14,12 @@ package_state: latest include_role: name: safe_yum - ignore_errors: true \ No newline at end of file + ignore_errors: true + + - name: Upgrade Pip3 + become: true + pip: + name: [pip] + state: latest + executable: pip3 + ignore_errors: yes \ No newline at end of file From 8dde227478ae33446de6d9f9f17b8e68f2e926b1 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 4 Dec 2023 18:22:02 
-0700 Subject: [PATCH 50/68] Update cloud-agent to keep all params in config --- playbooks/roles/oci-cloud-agent/tasks/el.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/oci-cloud-agent/tasks/el.yml b/playbooks/roles/oci-cloud-agent/tasks/el.yml index 3376bd62..5b4993ed 100644 --- a/playbooks/roles/oci-cloud-agent/tasks/el.yml +++ b/playbooks/roles/oci-cloud-agent/tasks/el.yml @@ -17,14 +17,14 @@ - name: Create configuration for core - modify params set_fact: - mydata: "{{ mydata | combine(newdata) }}" + mydata: "{{ mydata | combine(newdata, recursive=True) }}" vars: newdata: plugins: osms: disabled: true when: agent_file.stat.exists - + - name: Write back to a file copy: content: '{{ mydata | to_nice_yaml }}' From 7a26c6d480cc99efc838f23985dfde542b078d78 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 5 Dec 2023 15:43:31 -0700 Subject: [PATCH 51/68] Propagate Cloud agnet to new clusters --- autoscaling/tf_init/bastion_update.tf | 3 ++- .../tf_init/cluster-network-configuration.tf | 26 ++++++++++++++++++- autoscaling/tf_init/inventory.tpl | 3 ++- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/autoscaling/tf_init/bastion_update.tf b/autoscaling/tf_init/bastion_update.tf index 3205d2d2..efba3245 100755 --- a/autoscaling/tf_init/bastion_update.tf +++ b/autoscaling/tf_init/bastion_update.tf @@ -72,7 +72,8 @@ resource "local_file" "inventory" { bastion_username = var.bastion_username, compute_username = var.compute_username, pam = var.pam, - sacct_limits = var.sacct_limits + sacct_limits = var.sacct_limits, + use_compute_agent=var.use_compute_agent }) filename = "${local.bastion_path}/inventory" } diff --git a/autoscaling/tf_init/cluster-network-configuration.tf b/autoscaling/tf_init/cluster-network-configuration.tf index e4f75bd7..9b1d0972 100755 --- a/autoscaling/tf_init/cluster-network-configuration.tf +++ b/autoscaling/tf_init/cluster-network-configuration.tf @@ -18,8 +18,32 @@ resource 
"oci_core_instance_configuration" "cluster-network-instance_configurati user_data = base64encode(data.template_file.config.rendered) } agent_config { - is_management_disabled = true + + are_all_plugins_disabled = false + is_management_disabled = true + is_monitoring_disabled = false + + plugins_config { + desired_state = "DISABLED" + name = "OS Management Service Agent" + } + dynamic plugins_config { + + for_each = var.use_compute_agent ? ["ENABLED"] : ["DISABLED"] + content { + name = "Compute HPC RDMA Authentication" + desired_state = plugins_config.value + } + } + dynamic plugins_config { + for_each = var.use_compute_agent ? ["ENABLED"] : ["DISABLED"] + content { + name = "Compute HPC RDMA Auto-Configuration" + desired_state = plugins_config.value + } + } + } shape = var.cluster_network_shape source_details { source_type = "image" diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index 261c1e17..873ff0b0 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -68,4 +68,5 @@ latency_check=${latency_check} compute_username=${compute_username} bastion_username=${bastion_username} pam = ${pam} -sacct_limits=${sacct_limits} \ No newline at end of file +sacct_limits=${sacct_limits} +use_compute_agent=${use_compute_agent} \ No newline at end of file From a39df423bdab36467b8409725eac28e8825e314a Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 5 Dec 2023 15:47:43 -0700 Subject: [PATCH 52/68] Change RDMA CIDR --- schema.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/schema.yaml b/schema.yaml index be0526d1..c3559e3a 100755 --- a/schema.yaml +++ b/schema.yaml @@ -999,8 +999,8 @@ variables: rdma_subnet: type: string title: "RDMA subnet IP range" - default: "192.168.0.0/16" - description: "Must be at least the same size as private subnet for HPC and at least 16 times the size of the private subnet for GPUs" + default: "10.224.0.0/12" + description: "Must be at least the 
same size as private subnet for HPC and at least 16 times the size of the private subnet for GPUs, currently cannnot be modified with the compute agent" required: true private_subnet: type: string From 518fcd6732d30d80209cb8c1c46092f2a3547671 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 6 Dec 2023 14:47:52 -0700 Subject: [PATCH 53/68] Update to the new images --- variables.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/variables.tf b/variables.tf index 9c64389a..c8b8afaf 100755 --- a/variables.tf +++ b/variables.tf @@ -91,10 +91,10 @@ variable "marketplace_version_id" { "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" - "HPC_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-2023.05.18-0" - "HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.6.8.1-2023.05.18-0" - "GPU_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.05.18-0" - "GPU_OL8" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.05.18-0" + "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-2023.12.04-0" + "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-2023.12.04-0" + "GPU_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2023.12.04-0" + "GPU_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2023.12.04-0" } } From 0c802d6a18820b3453bc2479b3415ca93eb04f81 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 6 Dec 2023 14:49:18 -0700 Subject: [PATCH 54/68] Chnage comment about compute_agent --- schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema.yaml b/schema.yaml index c3559e3a..9008fb83 100755 --- a/schema.yaml +++ b/schema.yaml @@ -749,7 +749,7 @@ variables: use_compute_agent: type: boolean title: "use compute agent" - description: "Select if your image has the OCA agent rather than the oci-cn-auth package. The marketplace image does not use the compute agent for now." 
+ description: "Select if your image has the OCA agent rather than the oci-cn-auth package. The new marketplace images need the compute agent enabled." default: true compute_image_compartment: From cb5897c4f76dd580f01eaba283693772382ff299 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Wed, 6 Dec 2023 17:20:33 -0800 Subject: [PATCH 55/68] added NCCL bash script for H100 and updated the mpirun parameters on the H100 NCCL sbatch script --- samples/gpu/nccl_run_allreduce_H100.sbatch | 29 +++++--- samples/gpu/nccl_run_allreduce_H100.sh | 87 ++++++++++++++++++++++ 2 files changed, 106 insertions(+), 10 deletions(-) create mode 100644 samples/gpu/nccl_run_allreduce_H100.sh diff --git a/samples/gpu/nccl_run_allreduce_H100.sbatch b/samples/gpu/nccl_run_allreduce_H100.sbatch index 164751b8..f1652e81 100644 --- a/samples/gpu/nccl_run_allreduce_H100.sbatch +++ b/samples/gpu/nccl_run_allreduce_H100.sbatch @@ -7,7 +7,7 @@ export PMI_DEBUG=1 -cd /nfs/scratch +cd /nfs/cluster mkdir $SLURM_JOB_ID cd $SLURM_JOB_ID @@ -30,8 +30,6 @@ if [[ "$mpivars_path" == "" ]]; then source $mpivars_path -export NCCL_DEBUG=WARN - shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` if [ $shape == \"BM.GPU.H100.8\" ] then @@ -43,17 +41,28 @@ fi mpirun --mca pml ucx \ --bind-to numa \ + -npernode 8 \ --mca coll ^hcoll \ + -x NCCL_CROSS_NIC=0 \ + -x NCCL_SOCKET_NTHREADS=16 \ -x NCCL_DEBUG=WARN \ - -x NCCL_IB_SL=0 \ - -x NCCL_IB_TC=41 \ + -x NCCL_CUMEM_ENABLE=0 \ + -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \ -x NCCL_IB_QPS_PER_CONNECTION=16 \ - -x UCX_TLS=tcp \ - -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ + -x NCCL_IB_GID_INDEX=3 \ + -x NCCL_IB_TC=41 \ + -x NCCL_IB_SL=0 \ + -x NCCL_IB_TIMEOUT=22 \ + -x NCCL_NET_PLUGIN=none \ -x HCOLL_ENABLE_MCAST_ALL=0 \ -x coll_hcoll_enable=0 \ - -x NCCL_IB_GID_INDEX=3 \ - -x NCCL_ALGO=Auto \ + -x UCX_TLS=tcp \ + -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ + -x RX_QUEUE_LEN=8192 \ + -x IB_RX_QUEUE_LEN=8192 \ + -x 
NCCL_SOCKET_IFNAME=eth0 \ + -x NCCL_ALGO=auto \ + -x NCCL_IGNORE_CPU_AFFINITY=1 \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ -x NCCL_TOPO_FILE=~/H100-topology.xml \ - --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 \ No newline at end of file + --np $np --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 \ No newline at end of file diff --git a/samples/gpu/nccl_run_allreduce_H100.sh b/samples/gpu/nccl_run_allreduce_H100.sh new file mode 100644 index 00000000..520f125d --- /dev/null +++ b/samples/gpu/nccl_run_allreduce_H100.sh @@ -0,0 +1,87 @@ +#!/bin/bash +set -e + +# number of times to run the nccl test to stress the GPUs and RDMA network. This is different from -n iterations parameter of nccl allreduce which is set below using $iter +max=$1 + +# This assume, the hostfile passed is already ordered based on their rackId +if [ -n "$2" ]; then + hostfile=$2 +else + hostfile="/etc/opt/oci-hpc/hostfile.tcp" +fi + +echo INPUTFILE +cat $hostfile + +# The number of GPUs to use for the test. Has to be multiplier of 8. If not passed, all GPUs will be used. 
+if [ -n "$3" ]; then + np=$3 +else + np=$((`less $hostfile | wc -l` * 8 )) +fi + +logfile="nccl_run_allreduce.sh.log" + +for x in $(seq 1 1 $max) +do + + echo $x + echo $x >> $logfile + date >> $logfile + + hostfile=$hostfile; np=$np ; iter=20; + + mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + + if [[ "$mpivars_path" == "" ]]; then + mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh` + fi + + if [[ "$mpivars_path" == "" ]]; then + echo "Could not find MPIPATH"; exit; fi + + source $mpivars_path + + first_node=`head $hostfile -n 1` + shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape` + if [ $shape == \"BM.GPU.H100.8\" ] + then + var_UCX_NET_DEVICES=eth0 + var_NCCL_IB_HCA="=mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17" + else + echo "Use the appropriate nccl test run script for non H100 nodes" + fi + + mpirun --mca pml ucx \ + --bind-to numa \ + -npernode 8 \ + --mca coll ^hcoll \ + -x NCCL_CROSS_NIC=0 \ + -x NCCL_SOCKET_NTHREADS=16 \ + -x NCCL_DEBUG=WARN \ + -x NCCL_CUMEM_ENABLE=0 \ + -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \ + -x NCCL_IB_QPS_PER_CONNECTION=16 \ + -x NCCL_IB_GID_INDEX=3 \ + -x NCCL_IB_TC=41 \ + -x NCCL_IB_SL=0 \ + -x NCCL_IB_TIMEOUT=22 \ + -x NCCL_NET_PLUGIN=none \ + -x HCOLL_ENABLE_MCAST_ALL=0 \ + -x coll_hcoll_enable=0 \ + -x UCX_TLS=tcp \ + -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ + -x RX_QUEUE_LEN=8192 \ + -x IB_RX_QUEUE_LEN=8192 \ + -x NCCL_SOCKET_IFNAME=eth0 \ + -x NCCL_ALGO=auto \ + -x NCCL_IGNORE_CPU_AFFINITY=1 \ + -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ + -x NCCL_TOPO_FILE=~/H100-topology.xml \ + --np $np --hostfile $hostfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 >> $logfile + + tail -n 32 $logfile + + +done \ No newline at end of file From 76d17bc3730a2e07e2f932083f8b50917b0c85cf Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Wed, 6 Dec 2023 
17:24:07 -0800 Subject: [PATCH 56/68] reverted the number of processes to be from slurm variables for the sbatch script --- samples/gpu/nccl_run_allreduce_H100.sbatch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/gpu/nccl_run_allreduce_H100.sbatch b/samples/gpu/nccl_run_allreduce_H100.sbatch index f1652e81..830870c4 100644 --- a/samples/gpu/nccl_run_allreduce_H100.sbatch +++ b/samples/gpu/nccl_run_allreduce_H100.sbatch @@ -65,4 +65,4 @@ fi -x NCCL_IGNORE_CPU_AFFINITY=1 \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ -x NCCL_TOPO_FILE=~/H100-topology.xml \ - --np $np --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 \ No newline at end of file + --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 \ No newline at end of file From 96c33ffd358814379b3d1c15f35ec02f07885c5e Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 7 Dec 2023 13:00:32 -0700 Subject: [PATCH 57/68] Fix custom memory of compuet nodes --- schema.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/schema.yaml b/schema.yaml index 9008fb83..037e685b 100755 --- a/schema.yaml +++ b/schema.yaml @@ -702,7 +702,8 @@ variables: - "VM.Standard3.Flex" - and: - ${instance_pool_custom_memory} - - ${cluster_network} + - not: + - ${cluster_network} required: true node_count: From e023cd4c4add9c1b3ff37a349b8f3f2333a30b56 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Fri, 8 Dec 2023 16:42:27 -0700 Subject: [PATCH 58/68] Fix autoscaling Cron job --- playbooks/roles/cron/tasks/el.yml | 2 +- playbooks/roles/cron/tasks/ubuntu.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/cron/tasks/el.yml b/playbooks/roles/cron/tasks/el.yml index 0697325b..77434ba4 100755 --- a/playbooks/roles/cron/tasks/el.yml +++ b/playbooks/roles/cron/tasks/el.yml @@ -28,7 +28,7 @@ name: 
slurm autoscaling minute: "#*" user: '{{ ansible_user }}' - job: "/opt/oci-hpc/autoscaling/crontab/autoscale_slurm.sh >> /opt/oci-hpc/logs/crontab_slurm_`date '+\\%Y\\%m\\%d'`.log 2>&1" + job: "#/opt/oci-hpc/autoscaling/crontab/autoscale_slurm.sh >> /opt/oci-hpc/logs/crontab_slurm_`date '+\\%Y\\%m\\%d'`.log 2>&1" when: not autoscaling | bool diff --git a/playbooks/roles/cron/tasks/ubuntu.yml b/playbooks/roles/cron/tasks/ubuntu.yml index c69f9baf..834fef81 100644 --- a/playbooks/roles/cron/tasks/ubuntu.yml +++ b/playbooks/roles/cron/tasks/ubuntu.yml @@ -64,7 +64,7 @@ name: slurm autoscaling minute: "#*" user: '{{ ansible_user }}' - job: "/opt/oci-hpc/autoscaling/crontab/autoscale_slurm.sh >> /opt/oci-hpc/logs/crontab_slurm_`date '+\\%Y\\%m\\%d'`.log 2>&1" + job: "#/opt/oci-hpc/autoscaling/crontab/autoscale_slurm.sh >> /opt/oci-hpc/logs/crontab_slurm_`date '+\\%Y\\%m\\%d'`.log 2>&1" when: not autoscaling | bool - name: Create a commented Slurm monitoring cron file under /etc/cron.d From 9b598e1ef948f7cf68f1c2348c89031d28966e34 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 11 Dec 2023 14:14:47 -0700 Subject: [PATCH 59/68] Update to latest image --- variables.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/variables.tf b/variables.tf index c8b8afaf..ed689acf 100755 --- a/variables.tf +++ b/variables.tf @@ -91,10 +91,10 @@ variable "marketplace_version_id" { "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" - "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-2023.12.04-0" - "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-2023.12.04-0" - "GPU_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2023.12.04-0" - "GPU_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2023.12.04-0" + "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-2023.12.04-1" + "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-2023.12.04-1" + "GPU_OL7" = 
"OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2023.12.04-1" + "GPU_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2023.12.04-1" } } From 34b358742594168f41765ba4b026ad34bb53fd4b Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 11 Dec 2023 14:15:20 -0700 Subject: [PATCH 60/68] Fix login and backup Add Block Volume --- autoscaling/tf_init/bastion_update.tf | 2 ++ autoscaling/tf_init/inventory.tpl | 2 ++ bastion.tf | 5 ++++- conf/variables.tpl | 2 ++ inventory.tpl | 2 ++ locals.tf | 2 ++ playbooks/site.yml | 23 ++++++++++++++++------- slurm_ha.tf | 5 ++++- 8 files changed, 34 insertions(+), 9 deletions(-) diff --git a/autoscaling/tf_init/bastion_update.tf b/autoscaling/tf_init/bastion_update.tf index efba3245..07351310 100755 --- a/autoscaling/tf_init/bastion_update.tf +++ b/autoscaling/tf_init/bastion_update.tf @@ -56,6 +56,8 @@ resource "local_file" "inventory" { bastion_block = var.bastion_block, scratch_nfs_type = local.scratch_nfs_type, bastion_mount_ip = var.bastion_mount_ip, + backup_mount_ip = var.backup_mount_ip, + login_mount_ip = var.login_mount_ip, cluster_mount_ip = local.mount_ip, cluster_name = local.cluster_name, shape = var.cluster_network ? 
var.cluster_network_shape : var.instance_pool_shape, diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index 873ff0b0..7dffcf0b 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -43,6 +43,8 @@ spack = ${spack} bastion_block = ${bastion_block} scratch_nfs_type = ${scratch_nfs_type} bastion_mount_ip = ${bastion_mount_ip} +backup_mount_ip = ${backup_mount_ip} +login_mount_ip = ${login_mount_ip} cluster_mount_ip = ${cluster_mount_ip} autoscaling = true force = no diff --git a/bastion.tf b/bastion.tf index f6b65db4..7646bcc9 100644 --- a/bastion.tf +++ b/bastion.tf @@ -266,6 +266,8 @@ resource "null_resource" "cluster" { bastion_block = var.bastion_block, scratch_nfs_type = local.scratch_nfs_type, bastion_mount_ip = local.bastion_mount_ip, + backup_mount_ip = local.backup_mount_ip, + login_mount_ip = local.login_mount_ip, cluster_mount_ip = local.mount_ip, autoscaling = var.autoscaling, cluster_name = local.cluster_name, @@ -390,6 +392,8 @@ resource "null_resource" "cluster" { bastion_block = var.bastion_block, scratch_nfs_type = local.scratch_nfs_type, bastion_mount_ip = local.bastion_mount_ip, + backup_mount_ip = local.backup_mount_ip, + login_mount_ip = local.login_mount_ip, cluster_mount_ip = local.mount_ip, scratch_nfs_type_cluster = var.scratch_nfs_type_cluster, scratch_nfs_type_pool = var.scratch_nfs_type_pool, @@ -403,7 +407,6 @@ resource "null_resource" "cluster" { use_cluster_nfs = var.use_cluster_nfs, cluster_nfs_path = var.cluster_nfs_path, bastion_block = var.bastion_block, - bastion_mount_ip = local.bastion_mount_ip, home_nfs = var.home_nfs, create_fss = var.create_fss, home_fss = var.home_fss, diff --git a/conf/variables.tpl b/conf/variables.tpl index 9e55fb2f..8d0823fe 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -101,6 +101,8 @@ variable "ssh_cidr" {default="${ssh_cidr}"} variable "bastion_block" {default = "${bastion_block}"} variable "bastion_mount_ip" {default = 
"${bastion_mount_ip}"} +variable "backup_mount_ip" {default = "${backup_mount_ip}"} +variable "login_mount_ip" {default = "${login_mount_ip}"} variable "home_nfs" { default = ${home_nfs} } variable "home_fss" { default = ${home_fss} } variable "latency_check" { default = ${latency_check} } diff --git a/inventory.tpl b/inventory.tpl index 2bfd85d0..c7c1c1cf 100755 --- a/inventory.tpl +++ b/inventory.tpl @@ -37,6 +37,8 @@ spack = ${spack} bastion_block = ${bastion_block} scratch_nfs_type = ${scratch_nfs_type} bastion_mount_ip = ${bastion_mount_ip} +backup_mount_ip = ${backup_mount_ip} +login_mount_ip = ${login_mount_ip} cluster_mount_ip = ${cluster_mount_ip} autoscaling = ${autoscaling} cluster_name = ${cluster_name} diff --git a/locals.tf b/locals.tf index 5866193e..ea008c84 100755 --- a/locals.tf +++ b/locals.tf @@ -44,6 +44,8 @@ locals { is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [local.instance_pool_ocpus]:[] bastion_mount_ip = var.bastion_block ? element(concat(oci_core_volume_attachment.bastion_volume_attachment.*.ipv4, [""]), 0) : "none" + backup_mount_ip = var.bastion_block && var.slurm_ha ? element(concat(oci_core_volume_attachment.backup_volume_attachment.*.ipv4, [""]), 0) : "none" + login_mount_ip = var.login_block ? element(concat(oci_core_volume_attachment.login_volume_attachment.*.ipv4, [""]), 0) : "none" scratch_nfs_type = var.cluster_network ? 
var.scratch_nfs_type_cluster : var.scratch_nfs_type_pool diff --git a/playbooks/site.yml b/playbooks/site.yml index 975fdbbf..47b7ac22 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -104,6 +104,8 @@ options: "" lock: "none" when: add_nfs|bool and home_fss|bool + - include_role: + name: passwords - hosts: bastion @@ -163,9 +165,7 @@ export_name: "cluster" local_path: "/export/cluster" iscsi_ip: "{{ bastion_mount_ip }}" - tasks: - - include_role: - name: passwords + tasks: - include_role: name: iscsi when: bastion_block|default(false)|bool @@ -175,17 +175,26 @@ - include_role: name: mysql -- hosts: slurm_backup, login +- hosts: slurm_backup become: true vars: - iscsi_ip: "{{ bastion_mount_ip }}" + iscsi_ip: "{{ backup_mount_ip }}" + local_path: "{{cluster_nfs_path}}" tasks: - - include_role: - name: passwords - include_role: name: iscsi when: bastion_block|default(false)|bool +- hosts: login + become: true + vars: + iscsi_ip: "{{ login_mount_ip }}" + local_path: "{{cluster_nfs_path}}" + tasks: + - include_role: + name: iscsi + when: login_block|default(false)|bool + - hosts: nfs become: true vars: diff --git a/slurm_ha.tf b/slurm_ha.tf index 6db5a614..9aefd711 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -230,6 +230,8 @@ resource "null_resource" "cluster_backup" { bastion_block = var.bastion_block, scratch_nfs_type = local.scratch_nfs_type, bastion_mount_ip = local.bastion_mount_ip, + backup_mount_ip = local.backup_mount_ip, + login_mount_ip = local.login_mount_ip, cluster_mount_ip = local.mount_ip, autoscaling = var.autoscaling, cluster_name = local.cluster_name, @@ -354,6 +356,8 @@ resource "null_resource" "cluster_backup" { bastion_block = var.bastion_block, scratch_nfs_type = local.scratch_nfs_type, bastion_mount_ip = local.bastion_mount_ip, + backup_mount_ip = local.backup_mount_ip, + login_mount_ip = local.login_mount_ip, cluster_mount_ip = local.mount_ip, scratch_nfs_type_cluster = var.scratch_nfs_type_cluster, scratch_nfs_type_pool = 
var.scratch_nfs_type_pool, @@ -367,7 +371,6 @@ resource "null_resource" "cluster_backup" { use_cluster_nfs = var.use_cluster_nfs, cluster_nfs_path = var.cluster_nfs_path, bastion_block = var.bastion_block, - bastion_mount_ip = local.bastion_mount_ip, home_nfs = var.home_nfs, create_fss = var.create_fss, home_fss = var.home_fss, From ce8813a21d21bbc0fe8d93f24ce869ad541395f3 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 11 Dec 2023 17:43:44 -0700 Subject: [PATCH 61/68] Add fixes for block volumes on login and backup --- autoscaling/tf_init/bastion_update.tf | 1 + autoscaling/tf_init/inventory.tpl | 1 + bastion.tf | 3 ++- conf/variables.tpl | 1 + inventory.tpl | 1 + playbooks/site.yml | 4 ++-- slurm_ha.tf | 3 ++- 7 files changed, 10 insertions(+), 4 deletions(-) diff --git a/autoscaling/tf_init/bastion_update.tf b/autoscaling/tf_init/bastion_update.tf index 07351310..b179e645 100755 --- a/autoscaling/tf_init/bastion_update.tf +++ b/autoscaling/tf_init/bastion_update.tf @@ -54,6 +54,7 @@ resource "local_file" "inventory" { spack = var.spack, ldap = var.ldap, bastion_block = var.bastion_block, + login_block = var.login_block, scratch_nfs_type = local.scratch_nfs_type, bastion_mount_ip = var.bastion_mount_ip, backup_mount_ip = var.backup_mount_ip, diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index 7dffcf0b..397a7514 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -41,6 +41,7 @@ pyxis = ${pyxis} enroot = ${enroot} spack = ${spack} bastion_block = ${bastion_block} +login_block = ${login_block} scratch_nfs_type = ${scratch_nfs_type} bastion_mount_ip = ${bastion_mount_ip} backup_mount_ip = ${backup_mount_ip} diff --git a/bastion.tf b/bastion.tf index 7646bcc9..431bd4f3 100644 --- a/bastion.tf +++ b/bastion.tf @@ -264,6 +264,7 @@ resource "null_resource" "cluster" { spack = var.spack, ldap = var.ldap, bastion_block = var.bastion_block, + login_block = var.login_block, scratch_nfs_type 
= local.scratch_nfs_type, bastion_mount_ip = local.bastion_mount_ip, backup_mount_ip = local.backup_mount_ip, @@ -390,6 +391,7 @@ resource "null_resource" "cluster" { spack = var.spack, ldap = var.ldap, bastion_block = var.bastion_block, + login_block = var.login_block, scratch_nfs_type = local.scratch_nfs_type, bastion_mount_ip = local.bastion_mount_ip, backup_mount_ip = local.backup_mount_ip, @@ -406,7 +408,6 @@ resource "null_resource" "cluster" { ssh_cidr = var.ssh_cidr, use_cluster_nfs = var.use_cluster_nfs, cluster_nfs_path = var.cluster_nfs_path, - bastion_block = var.bastion_block, home_nfs = var.home_nfs, create_fss = var.create_fss, home_fss = var.home_fss, diff --git a/conf/variables.tpl b/conf/variables.tpl index 8d0823fe..67db5415 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -99,6 +99,7 @@ variable "cluster_block_volume_performance" {default="${cluster_block_volume_per variable "ssh_cidr" {default="${ssh_cidr}"} variable "bastion_block" {default = "${bastion_block}"} +variable "login_block" {default = "${login_block}"} variable "bastion_mount_ip" {default = "${bastion_mount_ip}"} variable "backup_mount_ip" {default = "${backup_mount_ip}"} diff --git a/inventory.tpl b/inventory.tpl index c7c1c1cf..a7e8cc5b 100755 --- a/inventory.tpl +++ b/inventory.tpl @@ -35,6 +35,7 @@ slurm = ${slurm} rack_aware = ${rack_aware} spack = ${spack} bastion_block = ${bastion_block} +login_block = ${login_block} scratch_nfs_type = ${scratch_nfs_type} bastion_mount_ip = ${bastion_mount_ip} backup_mount_ip = ${backup_mount_ip} diff --git a/playbooks/site.yml b/playbooks/site.yml index 47b7ac22..54dd5b76 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -179,7 +179,7 @@ become: true vars: iscsi_ip: "{{ backup_mount_ip }}" - local_path: "{{cluster_nfs_path}}" + local_path: "/mnt/block" tasks: - include_role: name: iscsi @@ -189,7 +189,7 @@ become: true vars: iscsi_ip: "{{ login_mount_ip }}" - local_path: "{{cluster_nfs_path}}" + local_path: "/mnt/block" 
tasks: - include_role: name: iscsi diff --git a/slurm_ha.tf b/slurm_ha.tf index 9aefd711..2ea8f88a 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -228,6 +228,7 @@ resource "null_resource" "cluster_backup" { spack = var.spack, ldap = var.ldap, bastion_block = var.bastion_block, + login_block = var.login_block, scratch_nfs_type = local.scratch_nfs_type, bastion_mount_ip = local.bastion_mount_ip, backup_mount_ip = local.backup_mount_ip, @@ -354,6 +355,7 @@ resource "null_resource" "cluster_backup" { spack = var.spack, ldap = var.ldap, bastion_block = var.bastion_block, + login_block = var.login_block, scratch_nfs_type = local.scratch_nfs_type, bastion_mount_ip = local.bastion_mount_ip, backup_mount_ip = local.backup_mount_ip, @@ -370,7 +372,6 @@ resource "null_resource" "cluster_backup" { ssh_cidr = var.ssh_cidr, use_cluster_nfs = var.use_cluster_nfs, cluster_nfs_path = var.cluster_nfs_path, - bastion_block = var.bastion_block, home_nfs = var.home_nfs, create_fss = var.create_fss, home_fss = var.home_fss, From 6744d33dde63c952903010967333b7d485fbf990 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 12 Dec 2023 18:44:20 -0700 Subject: [PATCH 62/68] Update images for create_cluster --- conf/variables.tpl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/conf/variables.tpl b/conf/variables.tpl index 67db5415..9f31860f 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -53,10 +53,10 @@ variable "marketplace_version_id" { "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" - "HPC_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-2023.05.18-0" - "HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.6.8.1-2023.05.18-0" - "GPU_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.05.18-0" - "GPU_OL8" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.05.18-0" + "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-2023.12.04-1" + "HPC_OL8" = 
"OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-2023.12.04-1" + "GPU_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2023.12.04-2" + "GPU_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2023.12.04-2" } } @@ -102,7 +102,6 @@ variable "bastion_block" {default = "${bastion_block}"} variable "login_block" {default = "${login_block}"} variable "bastion_mount_ip" {default = "${bastion_mount_ip}"} -variable "backup_mount_ip" {default = "${backup_mount_ip}"} variable "login_mount_ip" {default = "${login_mount_ip}"} variable "home_nfs" { default = ${home_nfs} } variable "home_fss" { default = ${home_fss} } From 4baea28aadc61117f982b67eba94a88fbf2e300d Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 12 Dec 2023 18:44:47 -0700 Subject: [PATCH 63/68] Update backup to mount the same BV as bastion --- autoscaling/tf_init/bastion_update.tf | 1 - autoscaling/tf_init/inventory.tpl | 1 - bastion.tf | 3 +-- inventory.tpl | 1 - locals.tf | 1 - playbooks/roles/iscsi/tasks/el.yml | 1 + playbooks/site.yml | 4 ++-- slurm_ha.tf | 19 ++++--------------- variables.tf | 4 ++-- 9 files changed, 10 insertions(+), 25 deletions(-) diff --git a/autoscaling/tf_init/bastion_update.tf b/autoscaling/tf_init/bastion_update.tf index b179e645..d4154c2e 100755 --- a/autoscaling/tf_init/bastion_update.tf +++ b/autoscaling/tf_init/bastion_update.tf @@ -57,7 +57,6 @@ resource "local_file" "inventory" { login_block = var.login_block, scratch_nfs_type = local.scratch_nfs_type, bastion_mount_ip = var.bastion_mount_ip, - backup_mount_ip = var.backup_mount_ip, login_mount_ip = var.login_mount_ip, cluster_mount_ip = local.mount_ip, cluster_name = local.cluster_name, diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index 397a7514..146d5cce 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -44,7 +44,6 @@ bastion_block = ${bastion_block} login_block = ${login_block} scratch_nfs_type = ${scratch_nfs_type} bastion_mount_ip = 
${bastion_mount_ip} -backup_mount_ip = ${backup_mount_ip} login_mount_ip = ${login_mount_ip} cluster_mount_ip = ${cluster_mount_ip} autoscaling = true diff --git a/bastion.tf b/bastion.tf index 431bd4f3..c8df39cb 100644 --- a/bastion.tf +++ b/bastion.tf @@ -15,6 +15,7 @@ resource "oci_core_volume_attachment" "bastion_volume_attachment" { instance_id = oci_core_instance.bastion.id display_name = "${local.cluster_name}-bastion-volume-attachment" device = "/dev/oracleoci/oraclevdb" + is_shareable = true } resource "oci_core_volume_backup_policy" "bastion_boot_volume_backup_policy" { @@ -267,7 +268,6 @@ resource "null_resource" "cluster" { login_block = var.login_block, scratch_nfs_type = local.scratch_nfs_type, bastion_mount_ip = local.bastion_mount_ip, - backup_mount_ip = local.backup_mount_ip, login_mount_ip = local.login_mount_ip, cluster_mount_ip = local.mount_ip, autoscaling = var.autoscaling, @@ -394,7 +394,6 @@ resource "null_resource" "cluster" { login_block = var.login_block, scratch_nfs_type = local.scratch_nfs_type, bastion_mount_ip = local.bastion_mount_ip, - backup_mount_ip = local.backup_mount_ip, login_mount_ip = local.login_mount_ip, cluster_mount_ip = local.mount_ip, scratch_nfs_type_cluster = var.scratch_nfs_type_cluster, diff --git a/inventory.tpl b/inventory.tpl index a7e8cc5b..1d1586c2 100755 --- a/inventory.tpl +++ b/inventory.tpl @@ -38,7 +38,6 @@ bastion_block = ${bastion_block} login_block = ${login_block} scratch_nfs_type = ${scratch_nfs_type} bastion_mount_ip = ${bastion_mount_ip} -backup_mount_ip = ${backup_mount_ip} login_mount_ip = ${login_mount_ip} cluster_mount_ip = ${cluster_mount_ip} autoscaling = ${autoscaling} diff --git a/locals.tf b/locals.tf index ea008c84..9c791ed3 100755 --- a/locals.tf +++ b/locals.tf @@ -44,7 +44,6 @@ locals { is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [local.instance_pool_ocpus]:[] bastion_mount_ip = var.bastion_block ? 
element(concat(oci_core_volume_attachment.bastion_volume_attachment.*.ipv4, [""]), 0) : "none" - backup_mount_ip = var.bastion_block && var.slurm_ha ? element(concat(oci_core_volume_attachment.backup_volume_attachment.*.ipv4, [""]), 0) : "none" login_mount_ip = var.login_block ? element(concat(oci_core_volume_attachment.login_volume_attachment.*.ipv4, [""]), 0) : "none" scratch_nfs_type = var.cluster_network ? var.scratch_nfs_type_cluster : var.scratch_nfs_type_pool diff --git a/playbooks/roles/iscsi/tasks/el.yml b/playbooks/roles/iscsi/tasks/el.yml index 00f7c4ba..9fbe71bd 100755 --- a/playbooks/roles/iscsi/tasks/el.yml +++ b/playbooks/roles/iscsi/tasks/el.yml @@ -23,6 +23,7 @@ filesystem: dev: '{{ target["devicenodes"][0] }}' fstype: xfs + when: not ('slurm_backup' in group_names) - name: Mount local volume mount: diff --git a/playbooks/site.yml b/playbooks/site.yml index 54dd5b76..3bb6f837 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -178,8 +178,8 @@ - hosts: slurm_backup become: true vars: - iscsi_ip: "{{ backup_mount_ip }}" - local_path: "/mnt/block" + iscsi_ip: "{{ bastion_mount_ip }}" + local_path: "/mnt/nfs_backup" tasks: - include_role: name: iscsi diff --git a/slurm_ha.tf b/slurm_ha.tf index 2ea8f88a..bc3d04cd 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -1,20 +1,11 @@ -resource "oci_core_volume" "backup_volume" { - count = var.bastion_block && var.slurm_ha ? 1 : 0 - availability_domain = var.bastion_ad - compartment_id = var.targetCompartment - display_name = "${local.cluster_name}-backup-volume" - size_in_gbs = var.bastion_block_volume_size - vpus_per_gb = split(".", var.bastion_block_volume_performance)[0] -} - - resource "oci_core_volume_attachment" "backup_volume_attachment" { count = var.bastion_block && var.slurm_ha ? 
1 : 0 attachment_type = "iscsi" - volume_id = oci_core_volume.backup_volume[0].id + volume_id = oci_core_volume.bastion_volume[0].id instance_id = oci_core_instance.backup[0].id display_name = "${local.cluster_name}-backup-volume-attachment" device = "/dev/oracleoci/oraclevdb" + is_shareable = true } resource "oci_core_instance" "backup" { @@ -60,7 +51,7 @@ resource "oci_core_instance" "backup" { resource "null_resource" "backup" { count = var.slurm_ha ? 1 : 0 - depends_on = [oci_core_instance.backup, oci_core_volume_attachment.backup_volume_attachment ] + depends_on = [oci_core_instance.backup] triggers = { backup = oci_core_instance.backup[0].id } @@ -186,7 +177,7 @@ resource "null_resource" "backup" { } resource "null_resource" "cluster_backup" { count = var.slurm_ha ? 1 : 0 - depends_on = [null_resource.backup, oci_core_compute_cluster.compute_cluster, oci_core_cluster_network.cluster_network, oci_core_instance.backup, oci_core_volume_attachment.backup_volume_attachment ] + depends_on = [null_resource.backup, oci_core_compute_cluster.compute_cluster, oci_core_cluster_network.cluster_network, oci_core_instance.backup ] triggers = { cluster_instances = join(", ", local.cluster_instances_names) } @@ -231,7 +222,6 @@ resource "null_resource" "cluster_backup" { login_block = var.login_block, scratch_nfs_type = local.scratch_nfs_type, bastion_mount_ip = local.bastion_mount_ip, - backup_mount_ip = local.backup_mount_ip, login_mount_ip = local.login_mount_ip, cluster_mount_ip = local.mount_ip, autoscaling = var.autoscaling, @@ -358,7 +348,6 @@ resource "null_resource" "cluster_backup" { login_block = var.login_block, scratch_nfs_type = local.scratch_nfs_type, bastion_mount_ip = local.bastion_mount_ip, - backup_mount_ip = local.backup_mount_ip, login_mount_ip = local.login_mount_ip, cluster_mount_ip = local.mount_ip, scratch_nfs_type_cluster = var.scratch_nfs_type_cluster, diff --git a/variables.tf b/variables.tf index ed689acf..b505993a 100755 --- a/variables.tf +++ 
b/variables.tf @@ -93,8 +93,8 @@ variable "marketplace_version_id" { "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-2023.12.04-1" "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-2023.12.04-1" - "GPU_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2023.12.04-1" - "GPU_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2023.12.04-1" + "GPU_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2023.12.04-2" + "GPU_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2023.12.04-2" } } From 1ca15463af0bbd2b8e8ac58b557a75d980421027 Mon Sep 17 00:00:00 2001 From: anoopna Date: Wed, 3 Jan 2024 19:46:08 +0530 Subject: [PATCH 64/68] fixed error handling --- playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml | 2 +- playbooks/roles/destroy_unreachable/tasks/slurm.yml | 2 +- playbooks/roles/slurm/tasks/destroy-rack-aware.yml | 2 +- playbooks/roles/slurm/tasks/destroy.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml b/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml index 4471c98c..afe0cd62 100644 --- a/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml +++ b/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml @@ -249,7 +249,7 @@ - name: change Node Status become: true command: "scontrol update nodename={{ item }} state=future reason=terminating" - ignore_errors: force + ignore_errors: true ignore_unreachable: True with_items: "{{unreachable_slurm_nodes}}" delegate_to: 127.0.0.1 diff --git a/playbooks/roles/destroy_unreachable/tasks/slurm.yml b/playbooks/roles/destroy_unreachable/tasks/slurm.yml index e06e77a3..6249b5ca 100644 --- a/playbooks/roles/destroy_unreachable/tasks/slurm.yml +++ b/playbooks/roles/destroy_unreachable/tasks/slurm.yml @@ -139,7 +139,7 @@ - name: change Node Status become: true command: "scontrol update nodename={{ item }} state=future 
reason=terminating" - ignore_errors: force + ignore_errors: true ignore_unreachable: True with_items: "{{unreachable_slurm_nodes}}" delegate_to: 127.0.0.1 diff --git a/playbooks/roles/slurm/tasks/destroy-rack-aware.yml b/playbooks/roles/slurm/tasks/destroy-rack-aware.yml index dc36daf7..7f1e8846 100755 --- a/playbooks/roles/slurm/tasks/destroy-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/destroy-rack-aware.yml @@ -2,7 +2,7 @@ - name: change Node Status become: true command: "scontrol update nodename={{ ansible_hostname }} state=future reason=terminating" - ignore_errors: force + ignore_errors: true ignore_unreachable: True delegate_to: 127.0.0.1 diff --git a/playbooks/roles/slurm/tasks/destroy.yml b/playbooks/roles/slurm/tasks/destroy.yml index 406b3b5b..5c58d085 100755 --- a/playbooks/roles/slurm/tasks/destroy.yml +++ b/playbooks/roles/slurm/tasks/destroy.yml @@ -112,7 +112,7 @@ - name: change Node Status become: true command: "scontrol update nodename={{ ansible_hostname }} state=future reason=terminating" - ignore_errors: force + ignore_errors: true ignore_unreachable: True delegate_to: 127.0.0.1 From 4d417e636a847ea943de384d7add6ae23b81caf7 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 3 Jan 2024 18:46:11 -0700 Subject: [PATCH 65/68] Latest images --- conf/variables.tpl | 8 ++++---- variables.tf | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/conf/variables.tpl b/conf/variables.tpl index 9f31860f..d7ebab10 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -53,10 +53,10 @@ variable "marketplace_version_id" { "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" - "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-2023.12.04-1" - "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-2023.12.04-1" - "GPU_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2023.12.04-2" - "GPU_OL8" = 
"OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2023.12.04-2" + "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-2024.01.02-0" + "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-2024.01.02-0" + "GPU_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2024.01.02-0" + "GPU_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2024.01.02-0" } } diff --git a/variables.tf b/variables.tf index b505993a..190be78c 100755 --- a/variables.tf +++ b/variables.tf @@ -91,10 +91,10 @@ variable "marketplace_version_id" { "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" - "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-2023.12.04-1" - "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-2023.12.04-1" - "GPU_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2023.12.04-2" - "GPU_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2023.12.04-2" + "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-2024.01.02-0" + "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-2024.01.02-0" + "GPU_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2024.01.02-0" + "GPU_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2024.01.02-0" } } From 6d1faf325a73aa7026e165d843db02b6f8447ac5 Mon Sep 17 00:00:00 2001 From: Clifford Patterson Date: Fri, 5 Jan 2024 10:38:28 -0600 Subject: [PATCH 66/68] adds initramfs rebuild for ubuntu in localdisk --- playbooks/roles/localdisk/tasks/common.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/playbooks/roles/localdisk/tasks/common.yml b/playbooks/roles/localdisk/tasks/common.yml index 558ae7af..09ec1e5a 100755 --- a/playbooks/roles/localdisk/tasks/common.yml +++ b/playbooks/roles/localdisk/tasks/common.yml @@ -94,4 +94,8 @@ recurse: no when: - one_lv | bool - - lv_count.stdout == '0' \ No newline at end of file + - lv_count.stdout == '0' + +- name: rebuild initramfs if ubuntu + shell: update-initramfs -k all -u + 
when: ansible_facts['distribution'] == "Ubuntu" From 261e250d081abca39a501cf98f5e0f2398d704de Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Fri, 5 Jan 2024 09:59:05 -0700 Subject: [PATCH 67/68] Change to latest OL8 --- conf/variables.tpl | 4 ++-- variables.tf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/variables.tpl b/conf/variables.tpl index d7ebab10..96dd18d3 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -54,9 +54,9 @@ variable "marketplace_version_id" { "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-2024.01.02-0" - "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-2024.01.02-0" + "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-2024.01.02-1" "GPU_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2024.01.02-0" - "GPU_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2024.01.02-0" + "GPU_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2024.01.02-1" } } diff --git a/variables.tf b/variables.tf index 190be78c..20c4d9ca 100755 --- a/variables.tf +++ b/variables.tf @@ -92,9 +92,9 @@ variable "marketplace_version_id" { "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-2024.01.02-0" - "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-2024.01.02-0" + "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-2024.01.02-1" "GPU_OL7" = "OracleLinux-7-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2024.01.02-0" - "GPU_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2024.01.02-0" + "GPU_OL8" = "OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-2024.01.02-1" } } From 5b6dd344b71568161ab819c92c8b2b9e2703f0a9 Mon Sep 17 00:00:00 2001 From: Bruno Garbaccio Date: Thu, 23 Nov 2023 09:20:51 +0100 Subject: [PATCH 68/68] update queue status in autoscale_slurm.sh Adding option "-r" in getJobs() to get all jobs in squeue in case of a 
job array --- autoscaling/crontab/autoscale_slurm.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autoscaling/crontab/autoscale_slurm.sh b/autoscaling/crontab/autoscale_slurm.sh index bc5ce204..9882b44c 100755 --- a/autoscaling/crontab/autoscale_slurm.sh +++ b/autoscaling/crontab/autoscale_slurm.sh @@ -41,7 +41,7 @@ def getTopology(clusterName): # Get the list of Jobs in all states def getJobs(): # changing the position of Dependency as it is giving blank instead of null. to handle that, putting it at the end. - out = subprocess.Popen(['squeue','-O','STATE,JOBID,FEATURE:100,NUMNODES,Partition,UserName,Dependency'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True) + out = subprocess.Popen(['squeue','-r','-O','STATE,JOBID,FEATURE:100,NUMNODES,Partition,UserName,Dependency'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True) stdout,stderr = out.communicate() return stdout.split("\n")[1:] @@ -433,4 +433,4 @@ try: except Exception: traceback.print_exc() -os.remove(lockfile) \ No newline at end of file +os.remove(lockfile)