From 5e11faec9dd8e74b9ef86884212e480de4b72110 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 20 Jul 2023 14:53:41 +0000 Subject: [PATCH 1/4] use dynamic normal nodes --- docker-entrypoint.sh | 2 +- slurm-cluster-chart/files/slurm.conf | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 23ad303..467fb9d 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -78,7 +78,7 @@ then echo "-- slurmctld is now active ..." echo "---> Starting the Slurm Node Daemon (slurmd) ..." - exec /usr/sbin/slurmd -F -Dvvv + exec /usr/sbin/slurmd -Z -Dvvv fi if [ "$1" = "login" ] diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index 4c072a7..711ce71 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -52,7 +52,6 @@ CommunicationParameters=NoAddrCache # NODES MaxNodeCount=10 -NodeName=slurmd-[0-9] State=FUTURE # PARTITIONS PartitionName=all Default=yes Nodes=ALL From 556cc99e0b6fba3f40290ec87995a38376bf066f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 20 Jul 2023 14:54:27 +0000 Subject: [PATCH 2/4] don't use DNS for slurmd addresses --- slurm-cluster-chart/files/slurm.conf | 3 --- 1 file changed, 3 deletions(-) diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index 711ce71..5c9dc4f 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -47,9 +47,6 @@ AccountingStorageType=accounting_storage/slurmdbd AccountingStorageHost=slurmdbd AccountingStoragePort=6819 # -SlurmctldParameters=cloud_dns,cloud_reg_addrs -CommunicationParameters=NoAddrCache - # NODES MaxNodeCount=10 From 104546b276d4b7a67c13fd792cbaa06468d985ad Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 20 Jul 2023 14:54:54 +0000 Subject: [PATCH 3/4] make slurmd pods daemonsets with host networking --- .../templates/slurmd-deployment.yaml | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/slurm-cluster-chart/templates/slurmd-deployment.yaml b/slurm-cluster-chart/templates/slurmd-deployment.yaml index e973e3b..7b11ef2 100644 --- a/slurm-cluster-chart/templates/slurmd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmd-deployment.yaml @@ -1,5 +1,5 @@ apiVersion: apps/v1 -kind: StatefulSet +kind: DaemonSet metadata: creationTimestamp: null labels: @@ -7,12 +7,10 @@ metadata: app.kubernetes.io/component: slurmd name: slurmd spec: - replicas: {{ .Values.replicas.slurmd }} selector: matchLabels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmd - serviceName: slurmd template: metadata: creationTimestamp: null @@ -20,14 +18,6 @@ spec: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmd spec: - topologySpreadConstraints: - - maxSkew: 1 - whenUnsatisfiable: ScheduleAnyway - topologyKey: kubernetes.io/hostname - labelSelector: - matchLabels: - app.kubernetes.io/name: slurm - app.kubernetes.io/component: slurmd containers: - args: - slurmd @@ -47,6 +37,8 @@ spec: subPath: munge.key securityContext: privileged: true + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet dnsConfig: searches: - slurmd.default.svc.cluster.local From f08d4c4d3c61877b4085843952a4e51577e0d99b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 20 Jul 2023 15:10:57 +0000 Subject: [PATCH 4/4] use dynamic normal node image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index cd9d34d..56f4da4 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:c12d04e +sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:104546b replicas: slurmd: 2