Add an Intel MPI variant of the pi example.

Adds examples/pi/intel-entrypoint.sh, examples/pi/intel.Dockerfile and
examples/pi/pi-intel.yaml, and adjusts examples/pi/pi.yaml (mpirun flag
"-np" becomes "-n", resource limits are lowered).

Review notes:
  * The line breaks and indentation of this patch had been collapsed; they
    were reconstructed following shell/Dockerfile/YAML conventions. Confirm
    the indentation against the original commit before applying.
  * intel-entrypoint.sh and one comment in intel.Dockerfile were revised
    during review, so the blob ids on their "index" lines may no longer
    match the content below.

diff --git a/examples/pi/intel-entrypoint.sh b/examples/pi/intel-entrypoint.sh
new file mode 100755
index 000000000..00033cf81
--- /dev/null
+++ b/examples/pi/intel-entrypoint.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+#
+# Entrypoint for the Intel MPI example image.
+# Sources the oneAPI environment script when it exists. When
+# K_MPI_JOB_ROLE is "launcher", it first tries to resolve $HOSTNAME and
+# every entry of /etc/mpi/hostfile with nslookup (retrying with backoff).
+# Finally it replaces itself with the command given as arguments.
+
+set_intel_vars=/opt/intel/oneapi/setvars.sh
+if [ -f "$set_intel_vars" ]; then
+  source "$set_intel_vars"
+fi
+
+# Retry nslookup on a host name, doubling the delay between attempts.
+# Arguments: $1 - host name to resolve
+# Outputs:   progress messages on stdout
+# Returns:   0 once the name resolves, 1 when max_retry retries failed
+function resolve_host() {
+  local host="$1"
+  local max_retry=5
+  local counter=0
+  local backoff=0.1
+  until nslookup "$host" > /dev/null
+  do
+    if [ "$counter" -eq "$max_retry" ]; then
+      echo "Couldn't resolve $host"
+      return 1
+    fi
+    sleep "$backoff"
+    echo "Couldn't resolve $host... Retrying"
+    ((counter++))
+    # bash has no floating-point arithmetic, so awk doubles the delay.
+    backoff=$(awk -v b="$backoff" 'BEGIN { print b + b }')
+  done
+  echo "Resolved $host"
+}
+
+if [ "$K_MPI_JOB_ROLE" == "launcher" ]; then
+  resolve_host "$HOSTNAME"
+  # -r keeps backslashes literal; the extra test handles a last line
+  # without a trailing newline.
+  while read -r host || [ -n "$host" ]
+  do
+    # Skip blank lines instead of running nslookup with an empty name.
+    [ -n "$host" ] || continue
+    resolve_host "$host"
+  done < /etc/mpi/hostfile
+fi
+
+exec "$@"
diff --git a/examples/pi/intel.Dockerfile b/examples/pi/intel.Dockerfile
new file mode 100644
index 000000000..26e05f0c5
--- /dev/null
+++ b/examples/pi/intel.Dockerfile
@@ -0,0 +1,45 @@
+FROM bash AS downloader
+
+RUN wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB -O key.PUB
+
+
+FROM debian:buster
+
+COPY --from=downloader key.PUB /tmp/key.PUB
+
+# Install Intel oneAPI keys.
+RUN apt update \
+    && apt install -y --no-install-recommends gnupg2 ca-certificates \
+    && apt-key add /tmp/key.PUB \
+    && rm /tmp/key.PUB \
+    && echo "deb https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list \
+    && apt remove -y gnupg2 ca-certificates \
+    && apt autoremove -y \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN apt update \
+    && apt install -y --no-install-recommends \
+        openssh-server \
+        openssh-client \
+        dnsutils \
+        libstdc++-8-dev binutils \
+        intel-oneapi-compiler-dpcpp-cpp \
+        intel-oneapi-mpi \
+        intel-oneapi-mpi-devel \
+    && rm -rf /var/lib/apt/lists/*
+
+# Add privilege separation directory to run sshd as root.
+RUN mkdir -p /var/run/sshd
+# Add capability to run sshd as non-root.
+RUN setcap CAP_NET_BIND_SERVICE=+eip /usr/sbin/sshd
+
+RUN useradd -m mpiuser
+WORKDIR /home/mpiuser
+COPY intel-entrypoint.sh /entrypoint.sh
+ENTRYPOINT ["/entrypoint.sh"]
+COPY --chown=mpiuser sshd_config .sshd_config
+RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config
+
+ENV I_MPI_CC=clang I_MPI_CXX=clang++
+COPY pi.cc /src/pi.cc
+RUN bash -c "source /opt/intel/oneapi/setvars.sh && mpicxx /src/pi.cc -o /home/mpiuser/pi"
\ No newline at end of file
diff --git a/examples/pi/pi-intel.yaml b/examples/pi/pi-intel.yaml
new file mode 100644
index 000000000..070a24c5b
--- /dev/null
+++ b/examples/pi/pi-intel.yaml
@@ -0,0 +1,52 @@
+apiVersion: kubeflow.org/v2beta1
+kind: MPIJob
+metadata:
+  name: pi
+spec:
+  slotsPerWorker: 1
+  cleanPodPolicy: Running
+  sshAuthMountPath: /home/mpiuser/.ssh
+  mpiImplementation: Intel
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      template:
+        spec:
+          containers:
+          - image: docker.io/kubeflow/mpi-pi:intel
+            imagePullPolicy: Always
+            name: mpi-launcher
+            securityContext:
+              runAsUser: 1000
+            args:
+            - mpirun
+            - -n
+            - "2"
+            - /home/mpiuser/pi
+            resources:
+              limits:
+                cpu: 1
+                memory: 1Gi
+    Worker:
+      replicas: 2
+      template:
+        spec:
+          containers:
+          - image: docker.io/kubeflow/mpi-pi:intel
+            imagePullPolicy: Always
+            name: mpi-worker
+            securityContext:
+              runAsUser: 1000
+              capabilities:
+                add:
+                - NET_BIND_SERVICE
+            command:
+            args:
+            - /usr/sbin/sshd
+            - -De
+            - -f
+            - /home/mpiuser/.sshd_config
+            resources:
+              limits:
+                cpu: 1
+                memory: 1Gi
diff --git a/examples/pi/pi.yaml b/examples/pi/pi.yaml
index 3e9c98ef0..819d3678e 100644
--- a/examples/pi/pi.yaml
+++ b/examples/pi/pi.yaml
@@ -19,13 +19,13 @@ spec:
             command:
             - mpirun
             args:
-            - -np
+            - -n
             - "2"
             - /home/mpiuser/pi
             resources:
               limits:
                 cpu: 1
-                memory: 2Gi
+                memory: 1Gi
     Worker:
       replicas: 2
       template:
@@ -46,5 +46,5 @@ spec:
             - /home/mpiuser/.sshd_config
             resources:
               limits:
-                cpu: 2
-                memory: 4Gi
+                cpu: 1
+                memory: 1Gi