Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replaced mounted kubeconfig with service account #30

Merged
merged 11 commits into from
Aug 17, 2023
3 changes: 0 additions & 3 deletions generate-secrets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,3 @@ kubectl create secret generic munge-key-secret \
--from-literal=munge.key=$(dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64 -w 0) \
-o yaml | \
kubectl apply -f -

cp $KUBECONFIG slurm-cluster-chart/files/kubeconfig
echo "copied $KUBECONFIG into slurm-cluster-chart/files/"
6 changes: 3 additions & 3 deletions image/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ LABEL org.opencontainers.image.source="https://github.com/stackhpc/slurm-docker-
ARG SLURM_TAG=slurm-23.02
ARG GOSU_VERSION=1.11

COPY kubernetes.repo /etc/yum.repos.d/kubernetes.repo

RUN set -ex \
&& yum makecache \
&& yum -y update \
Expand Down Expand Up @@ -42,6 +44,7 @@ RUN set -ex \
hwloc-devel \
openssh-server \
apptainer \
kubectl \
&& yum clean all \
&& rm -rf /var/cache/yum

Expand Down Expand Up @@ -91,9 +94,6 @@ RUN mkdir /etc/sysconfig/slurm \
&& useradd -u 1000 rocky \
&& usermod -p '*' rocky # unlocks account but sets no password

COPY kubernetes.repo /etc/yum.repos.d/kubernetes.repo
RUN dnf install -y kubectl

VOLUME /etc/slurm
COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
COPY --chown=slurm:slurm --chmod=744 k8s-slurmd-* /usr/local/bin/
Expand Down
4 changes: 0 additions & 4 deletions image/docker-entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,6 @@ then
echo "---> Setting ownership for state directory ..."
chown slurm:slurm /var/spool/slurmctld

echo "---> Copying Kubeconfig ..."
install -o slurm -g slurm -m u=rwX,go= -d /var/lib/slurmctld/
install -o slurm -g slurm -m u=r,go= /tmp/kubeconfig /var/lib/slurmctld/

echo "---> Starting the Slurm Controller Daemon (slurmctld) ..."
if /usr/sbin/slurmctld -V | grep -q '17.02' ; then
exec gosu slurm /usr/sbin/slurmctld -D "${@:2}"
Expand Down
13 changes: 9 additions & 4 deletions image/k8s-slurmd-create
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
#!/usr/bin/bash

export KUBECONFIG=/var/lib/slurmctld/kubeconfig
echo "$(date) Resume invoked $0 $*" &>> /var/log/slurm/power_save.log

echo "$(date) Resume invoked $0 $*" >> /var/log/slurm/power_save.log
APISERVER=https://kubernetes.default.svc
SERVICEACCOUNT=/var/run/secrets/kubernetes.io/serviceaccount
NAMESPACE=$(cat ${SERVICEACCOUNT}/namespace)
TOKEN=$(cat ${SERVICEACCOUNT}/token)
CACERT=${SERVICEACCOUNT}/ca.crt

hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doesn't depend on defined nodes
for host in $hosts
do
sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | kubectl create -f -
done
( sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | \
kubectl --server $APISERVER --token $TOKEN --certificate-authority $CACERT create -f - )
done
10 changes: 7 additions & 3 deletions image/k8s-slurmd-delete
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
#!/usr/bin/bash

export KUBECONFIG=/var/lib/slurmctld/kubeconfig

echo "$(date) Suspend invoked $0 $*" >> /var/log/slurm/power_save.log

APISERVER=https://kubernetes.default.svc
SERVICEACCOUNT=/var/run/secrets/kubernetes.io/serviceaccount
NAMESPACE=$(cat ${SERVICEACCOUNT}/namespace)
TOKEN=$(cat ${SERVICEACCOUNT}/token)
CACERT=${SERVICEACCOUNT}/ca.crt

hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doesn't depend on defined nodes
for host in $hosts
do
kubectl delete pod $host
kubectl --server $APISERVER --token $TOKEN --certificate-authority $CACERT delete pod $host
done
8 changes: 0 additions & 8 deletions slurm-cluster-chart/templates/kubeconfig.yml

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# RBAC objects that let the slurmctld pod manage slurmd pods through the
# Kubernetes API using its mounted service-account token instead of a
# copied kubeconfig.

# Identity referenced by `serviceAccountName` in the slurmctld StatefulSet.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: slurm-autoscaler-account
# Mount the token/CA/namespace files under
# /var/run/secrets/kubernetes.io/serviceaccount in pods using this account.
automountServiceAccountToken: true

---

# Namespaced permissions on pods; the resume/suspend scripts run
# `kubectl create -f -` and `kubectl delete pod` with this account's token.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: slurm-autoscaler-role
rules:
  - apiGroups: [""] # "" indicates the core API group
    resources: ["pods"]
    # Only real API request verbs are listed. "apply" is a kubectl
    # operation, not an RBAC verb; it is authorized via get/create/patch.
    verbs: ["get", "create", "patch", "delete", "list", "watch"]

---

# Grants the Role above to the service account, in the release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: slurm-autoscaler-rolebinding
subjects:
  - kind: ServiceAccount
    name: slurm-autoscaler-account
roleRef:
  kind: Role
  name: slurm-autoscaler-role
  apiGroup: rbac.authorization.k8s.io
8 changes: 1 addition & 7 deletions slurm-cluster-chart/templates/slurmctld-statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ spec:
app.kubernetes.io/name: slurm
app.kubernetes.io/component: slurmctld
spec:
serviceAccountName: slurm-autoscaler-account
containers:
- args:
- slurmctld
Expand All @@ -38,9 +39,6 @@ spec:
subPath: munge.key
- mountPath: /var/spool/slurmctld
name: slurmctld-state
- mountPath: /tmp/kubeconfig
name: kubeconfig-secret
subPath: kubeconfig
dnsConfig:
searches:
- slurmd.default.svc.cluster.local
Expand All @@ -63,7 +61,3 @@ spec:
secret:
secretName: {{ .Values.secrets.mungeKey }}
defaultMode: 0400
- name: kubeconfig-secret
secret:
secretName: kubeconfig-secret
defaultMode: 0400
2 changes: 1 addition & 1 deletion slurm-cluster-chart/values.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:9e4598e
slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:a731c60

replicas:
slurmd: 2
Expand Down