Add support for Intel MPI #389

Merged: 2 commits, Aug 3, 2021
36 changes: 36 additions & 0 deletions examples/pi/intel-entrypoint.sh
@@ -0,0 +1,36 @@
#!/usr/bin/env bash

set_intel_vars=/opt/intel/oneapi/setvars.sh
if [ -f "$set_intel_vars" ]; then
  source "$set_intel_vars"
fi

function resolve_host() {
I think this can still cause issues sometimes: there is a window when the launcher can resolve its own hostname but the workers cannot. This usually happens to me on the second run when I schedule two runs in a row.

It should be fine though, since this is only an example

@alculquicondor (Collaborator, Author) on Jul 29, 2021:

Are you sure that was the problem?
When I only had the check for the launcher, I was getting flaky startups. Now that I have checks for all the workers as well, the job starts every time. I couldn't really debug what was going on with just the check for the launcher, as Hydra doesn't log the output of the ssh calls :(

  host="$1"
  check="nslookup $host"
  max_retry=5
  counter=0
  backoff=0.1
  until $check > /dev/null
  do
    if [ "$counter" -eq "$max_retry" ]; then
      echo "Couldn't resolve $host"
      return
    fi
    sleep "$backoff"
    echo "Couldn't resolve $host... Retrying"
    ((counter++))
    backoff=$(awk "BEGIN {print $backoff + $backoff}")
  done
  echo "Resolved $host"
}

if [ "$K_MPI_JOB_ROLE" == "launcher" ]; then
  resolve_host "$HOSTNAME"
  while read -r host; do
    resolve_host "$host"
  done < /etc/mpi/hostfile
fi

exec "$@"
57 changes: 57 additions & 0 deletions examples/pi/intel.Dockerfile
@@ -0,0 +1,57 @@
FROM bash AS downloader

RUN wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB -O key.PUB


FROM debian:buster AS base

COPY --from=downloader key.PUB /tmp/key.PUB

# Install Intel oneAPI keys.
RUN apt update \
  && apt install -y --no-install-recommends gnupg2 ca-certificates \
  && apt-key add /tmp/key.PUB \
  && rm /tmp/key.PUB \
  && echo "deb https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list \
  && apt remove -y gnupg2 ca-certificates \
  && apt autoremove -y \
  && rm -rf /var/lib/apt/lists/*


FROM base AS builder

RUN apt update \
  && apt install -y --no-install-recommends \
    libstdc++-8-dev binutils \
    intel-oneapi-compiler-dpcpp-cpp \
    intel-oneapi-mpi-devel \
  && rm -rf /var/lib/apt/lists/*

ENV I_MPI_CC=clang I_MPI_CXX=clang++
COPY pi.cc /src/pi.cc
RUN bash -c "source /opt/intel/oneapi/setvars.sh && mpicxx /src/pi.cc -o /pi"


FROM base

RUN apt update \
  && apt install -y --no-install-recommends \
    openssh-server \
    openssh-client \
    dnsutils \
    intel-oneapi-mpi \
  && rm -rf /var/lib/apt/lists/*

# Add privilege separation directory to run sshd as root.
RUN mkdir -p /var/run/sshd
# Add capability to run sshd as non-root.
RUN setcap CAP_NET_BIND_SERVICE=+eip /usr/sbin/sshd

RUN useradd -m mpiuser
WORKDIR /home/mpiuser
COPY intel-entrypoint.sh /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
COPY --chown=mpiuser sshd_config .sshd_config
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config

COPY --from=builder /pi /home/mpiuser/pi
52 changes: 52 additions & 0 deletions examples/pi/pi-intel.yaml
@@ -0,0 +1,52 @@
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: pi
spec:
  slotsPerWorker: 1
  cleanPodPolicy: Running
  sshAuthMountPath: /home/mpiuser/.ssh
  mpiImplementation: Intel
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
          - image: docker.io/kubeflow/mpi-pi:intel
            imagePullPolicy: Always
            name: mpi-launcher
            securityContext:
              runAsUser: 1000
            args:
            - mpirun
            - -n
            - "2"
            - /home/mpiuser/pi
            resources:
              limits:
                cpu: 1
                memory: 1Gi
    Worker:
      replicas: 2
      template:
        spec:
          containers:
          - image: docker.io/kubeflow/mpi-pi:intel
            imagePullPolicy: Always
            name: mpi-worker
            securityContext:
              runAsUser: 1000
              capabilities:
                add:
                - NET_BIND_SERVICE
            command:
            - /usr/sbin/sshd
            args:
            - -De
            - -f
            - /home/mpiuser/.sshd_config
            resources:
              limits:
                cpu: 1
                memory: 1Gi
8 changes: 4 additions & 4 deletions examples/pi/pi.yaml
@@ -19,13 +19,13 @@ spec:
           command:
           - mpirun
           args:
-          - -np
+          - -n
           - "2"
           - /home/mpiuser/pi
           resources:
             limits:
               cpu: 1
-              memory: 2Gi
+              memory: 1Gi
Worker:
replicas: 2
template:
@@ -46,5 +46,5 @@ spec:
           - /home/mpiuser/.sshd_config
           resources:
             limits:
-              cpu: 2
-              memory: 4Gi
+              cpu: 1
+              memory: 1Gi
3 changes: 3 additions & 0 deletions manifests/base/crd.yaml
@@ -106,6 +106,9 @@ spec:
             description: "Defines which Pods must be deleted after the Job completes"
           sshAuthMountPath:
             type: string
+          mpiImplementation:
+            type: string
+            enum: ["OpenMPI", "Intel"]
           mpiReplicaSpecs:
             type: object
             properties:
3 changes: 3 additions & 0 deletions v2/pkg/apis/kubeflow/v2beta1/default.go
@@ -59,6 +59,9 @@ func SetDefaults_MPIJob(mpiJob *MPIJob) {
 	if mpiJob.Spec.SSHAuthMountPath == "" {
 		mpiJob.Spec.SSHAuthMountPath = "/root/.ssh"
 	}
+	if mpiJob.Spec.MPIImplementation == "" {
+		mpiJob.Spec.MPIImplementation = MPIImplementationOpenMPI
+	}
 
 	// set default to Launcher
 	setDefaultsTypeLauncher(mpiJob.Spec.MPIReplicaSpecs[MPIReplicaTypeLauncher])
35 changes: 20 additions & 15 deletions v2/pkg/apis/kubeflow/v2beta1/default_test.go
@@ -29,25 +29,28 @@ func TestSetDefaults_MPIJob(t *testing.T) {
 		"base defaults": {
 			want: MPIJob{
 				Spec: MPIJobSpec{
-					SlotsPerWorker:   newInt32(1),
-					CleanPodPolicy:   newCleanPodPolicy(common.CleanPodPolicyNone),
-					SSHAuthMountPath: "/root/.ssh",
+					SlotsPerWorker:    newInt32(1),
+					CleanPodPolicy:    newCleanPodPolicy(common.CleanPodPolicyNone),
+					SSHAuthMountPath:  "/root/.ssh",
+					MPIImplementation: MPIImplementationOpenMPI,
 				},
 			},
 		},
 		"base defaults overridden": {
 			job: MPIJob{
 				Spec: MPIJobSpec{
-					SlotsPerWorker:   newInt32(10),
-					CleanPodPolicy:   newCleanPodPolicy(common.CleanPodPolicyRunning),
-					SSHAuthMountPath: "/home/mpiuser/.ssh",
+					SlotsPerWorker:    newInt32(10),
+					CleanPodPolicy:    newCleanPodPolicy(common.CleanPodPolicyRunning),
+					SSHAuthMountPath:  "/home/mpiuser/.ssh",
+					MPIImplementation: MPIImplementationIntel,
 				},
 			},
 			want: MPIJob{
 				Spec: MPIJobSpec{
-					SlotsPerWorker:   newInt32(10),
-					CleanPodPolicy:   newCleanPodPolicy(common.CleanPodPolicyRunning),
-					SSHAuthMountPath: "/home/mpiuser/.ssh",
+					SlotsPerWorker:    newInt32(10),
+					CleanPodPolicy:    newCleanPodPolicy(common.CleanPodPolicyRunning),
+					SSHAuthMountPath:  "/home/mpiuser/.ssh",
+					MPIImplementation: MPIImplementationIntel,
 				},
 			},
 		},
@@ -61,9 +64,10 @@ func TestSetDefaults_MPIJob(t *testing.T) {
 			},
 			want: MPIJob{
 				Spec: MPIJobSpec{
-					SlotsPerWorker:   newInt32(1),
-					CleanPodPolicy:   newCleanPodPolicy(common.CleanPodPolicyNone),
-					SSHAuthMountPath: "/root/.ssh",
+					SlotsPerWorker:    newInt32(1),
+					CleanPodPolicy:    newCleanPodPolicy(common.CleanPodPolicyNone),
+					SSHAuthMountPath:  "/root/.ssh",
+					MPIImplementation: MPIImplementationOpenMPI,
 					MPIReplicaSpecs: map[MPIReplicaType]*common.ReplicaSpec{
 						MPIReplicaTypeLauncher: {
 							Replicas: newInt32(1),
@@ -83,9 +87,10 @@ func TestSetDefaults_MPIJob(t *testing.T) {
 			},
 			want: MPIJob{
 				Spec: MPIJobSpec{
-					SlotsPerWorker:   newInt32(1),
-					CleanPodPolicy:   newCleanPodPolicy(common.CleanPodPolicyNone),
-					SSHAuthMountPath: "/root/.ssh",
+					SlotsPerWorker:    newInt32(1),
+					CleanPodPolicy:    newCleanPodPolicy(common.CleanPodPolicyNone),
+					SSHAuthMountPath:  "/root/.ssh",
+					MPIImplementation: MPIImplementationOpenMPI,
 					MPIReplicaSpecs: map[MPIReplicaType]*common.ReplicaSpec{
 						MPIReplicaTypeWorker: {
 							Replicas: newInt32(0),
11 changes: 11 additions & 0 deletions v2/pkg/apis/kubeflow/v2beta1/types.go
@@ -55,6 +55,10 @@ type MPIJobSpec struct {
 	// SSHAuthMountPath is the directory where SSH keys are mounted.
 	// Defaults to "/root/.ssh".
 	SSHAuthMountPath string `json:"sshAuthMountPath,omitempty"`
+
+	// MPIImplementation is the MPI implementation.
+	// Options are "OpenMPI" (default) and "Intel".
+	MPIImplementation MPIImplementation `json:"mpiImplementation,omitempty"`
 }
 
 // MPIReplicaType is the type for MPIReplica.
@@ -67,3 +71,10 @@ const (
 	// MPIReplicaTypeWorker is the type for worker replicas.
 	MPIReplicaTypeWorker MPIReplicaType = "Worker"
 )
+
+type MPIImplementation string
+
+const (
+	MPIImplementationOpenMPI MPIImplementation = "OpenMPI"
+	MPIImplementationIntel   MPIImplementation = "Intel"
+)
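Stripped of the surrounding API machinery, the new enum plus the defaulting added in default.go behave like this minimal, self-contained sketch (the `spec` struct and `setDefaults` name are stand-ins, not the real MPIJobSpec API):

```go
package main

import "fmt"

// MPIImplementation mirrors the enum introduced in types.go.
type MPIImplementation string

const (
	MPIImplementationOpenMPI MPIImplementation = "OpenMPI"
	MPIImplementationIntel   MPIImplementation = "Intel"
)

// spec is a hypothetical stand-in for MPIJobSpec, just enough for defaulting.
type spec struct {
	MPIImplementation MPIImplementation
}

// setDefaults mirrors the new branch in SetDefaults_MPIJob:
// an unset field becomes OpenMPI; an explicit value is left alone.
func setDefaults(s *spec) {
	if s.MPIImplementation == "" {
		s.MPIImplementation = MPIImplementationOpenMPI
	}
}

func main() {
	a := spec{}
	setDefaults(&a)
	b := spec{MPIImplementation: MPIImplementationIntel}
	setDefaults(&b)
	fmt.Println(a.MPIImplementation, b.MPIImplementation) // prints "OpenMPI Intel"
}
```

Because the field is a string type with `omitempty`, an MPIJob that never sets `mpiImplementation` round-trips as before, which is what keeps this change backward compatible.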
17 changes: 13 additions & 4 deletions v2/pkg/apis/kubeflow/validation/validation.go
@@ -27,10 +27,16 @@ import (
 	kubeflow "github.com/kubeflow/mpi-operator/v2/pkg/apis/kubeflow/v2beta1"
 )
 
-var validCleanPolicies = sets.NewString(
-	string(common.CleanPodPolicyNone),
-	string(common.CleanPodPolicyRunning),
-	string(common.CleanPodPolicyAll))
+var (
+	validCleanPolicies = sets.NewString(
+		string(common.CleanPodPolicyNone),
+		string(common.CleanPodPolicyRunning),
+		string(common.CleanPodPolicyAll))
+
+	validMPIImplementations = sets.NewString(
+		string(kubeflow.MPIImplementationOpenMPI),
+		string(kubeflow.MPIImplementationIntel))
+)
 
 func ValidateMPIJob(job *kubeflow.MPIJob) field.ErrorList {
 	errs := validateMPIJobName(job)
@@ -68,6 +74,9 @@ func validateMPIJobSpec(spec *kubeflow.MPIJobSpec, path *field.Path) field.ErrorList {
 	if spec.SSHAuthMountPath == "" {
 		errs = append(errs, field.Required(path.Child("sshAuthMountPath"), "must have a mount path for SSH credentials"))
 	}
+	if !validMPIImplementations.Has(string(spec.MPIImplementation)) {
+		errs = append(errs, field.NotSupported(path.Child("mpiImplementation"), spec.MPIImplementation, validMPIImplementations.List()))
+	}
 	return errs
 }

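Minus the k8s.io/apimachinery dependency, the validation added here amounts to a set-membership check. This sketch uses a plain map in place of `sets.String` and a formatted error in place of `field.NotSupported`; the function name is illustrative:

```go
package main

import "fmt"

// validMPIImplementations plays the role of the sets.String in validation.go;
// a plain map keeps the sketch dependency-free.
var validMPIImplementations = map[string]bool{
	"OpenMPI": true,
	"Intel":   true,
}

// validateImplementation loosely mirrors the new check in validateMPIJobSpec,
// returning a plain error instead of a field.Error.
func validateImplementation(impl string) error {
	if !validMPIImplementations[impl] {
		return fmt.Errorf("mpiImplementation: unsupported value %q", impl)
	}
	return nil
}

func main() {
	fmt.Println(validateImplementation("Intel"))
	fmt.Println(validateImplementation("MPICH"))
}
```

Note that because defaulting runs before validation, an empty `mpiImplementation` never reaches this check in practice; it has already been rewritten to `OpenMPI`.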