From 8d43989dee5625ffa9a9bfb021b4b68e536a205d Mon Sep 17 00:00:00 2001
From: Tuomas Katila <tuomas.katila@intel.com>
Date: Tue, 9 May 2023 11:03:34 +0300
Subject: [PATCH] Add default Intel MPI env variables to MPIJob

Co-authored-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
---
 pkg/controller.v1/mpi/mpijob.go               | 21 +++++
 pkg/controller.v1/mpi/mpijob_controller.go    | 19 +++++
 .../mpi/mpijob_controller_test.go             | 85 +++++++++++++++++++
 3 files changed, 125 insertions(+)

diff --git a/pkg/controller.v1/mpi/mpijob.go b/pkg/controller.v1/mpi/mpijob.go
index 36dae7a1d8..98c3b574f3 100644
--- a/pkg/controller.v1/mpi/mpijob.go
+++ b/pkg/controller.v1/mpi/mpijob.go
@@ -45,6 +45,7 @@ const (
 	initContainerCpu        = "100m"
 	initContainerEphStorage = "5Gi"
 	initContainerMem        = "512Mi"
+	iMPIDefaultBootstrap    = "rsh"
 )
 
 const (
@@ -218,6 +219,26 @@ func isGPULauncher(mpiJob *kubeflowv1.MPIJob) bool {
 	return false
 }
 
+// hasIntelMPIBootstrapValues returns the existence of I_MPI_HYDRA_BOOTSTRAP
+// and I_MPI_HYDRA_BOOTSTRAP_EXEC values.
+// There are also _EXEC_EXTRA_ARGS and _AUTOFORK under the I_MPI_HYDRA_BOOTSTRAP
+// prefix but those are not checked on purpose.
+func hasIntelMPIBootstrapValues(envs []corev1.EnvVar) (bootstrap, exec bool) {
+	for _, env := range envs {
+		if env.Name == "I_MPI_HYDRA_BOOTSTRAP" {
+			bootstrap = true
+		} else if env.Name == "I_MPI_HYDRA_BOOTSTRAP_EXEC" {
+			exec = true
+		}
+
+		if bootstrap && exec {
+			break
+		}
+	}
+
+	return bootstrap, exec
+}
+
 func defaultReplicaLabels(genericLabels map[string]string, roleLabelVal string) map[string]string {
 	replicaLabels := map[string]string{}
 	for k, v := range genericLabels {
diff --git a/pkg/controller.v1/mpi/mpijob_controller.go b/pkg/controller.v1/mpi/mpijob_controller.go
index 2af6b72565..e8fd6bd835 100644
--- a/pkg/controller.v1/mpi/mpijob_controller.go
+++ b/pkg/controller.v1/mpi/mpijob_controller.go
@@ -1152,6 +1152,25 @@ func (jc *MPIJobReconciler) newLauncher(mpiJob *kubeflowv1.MPIJob, kubectlDelive
 			})
 	}
 
+	// Add default Intel MPI bootstrap variables if not provided by the user.
+	bootstrap, exec := hasIntelMPIBootstrapValues(container.Env)
+	if !bootstrap {
+		container.Env = append(container.Env,
+			corev1.EnvVar{
+				Name:  "I_MPI_HYDRA_BOOTSTRAP",
+				Value: iMPIDefaultBootstrap,
+			},
+		)
+	}
+	if !exec {
+		container.Env = append(container.Env,
+			corev1.EnvVar{
+				Name:  "I_MPI_HYDRA_BOOTSTRAP_EXEC",
+				Value: fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName),
+			},
+		)
+	}
+
 	container.VolumeMounts = append(container.VolumeMounts,
 		corev1.VolumeMount{
 			Name:      kubectlVolumeName,
diff --git a/pkg/controller.v1/mpi/mpijob_controller_test.go b/pkg/controller.v1/mpi/mpijob_controller_test.go
index 651a8e6879..a7ddac2cdc 100644
--- a/pkg/controller.v1/mpi/mpijob_controller_test.go
+++ b/pkg/controller.v1/mpi/mpijob_controller_test.go
@@ -709,6 +709,91 @@ var _ = Describe("MPIJob controller", func() {
 		})
 	})
 
+	Context("Test launcher's Intel MPI handling", func() {
+		It("Should create a launcher job with Intel MPI env variables", func() {
+			By("By creating MPIJobs with and without preset env variables")
+
+			testCases := map[string]struct {
+				envVariables         map[string]string
+				expectedEnvVariables map[string]string
+			}{
+				"withoutIMPIValues": {
+					envVariables: map[string]string{
+						"X_MPI_HYDRA_BOOTSTRAP": "foo",
+					},
+					expectedEnvVariables: map[string]string{
+						"I_MPI_HYDRA_BOOTSTRAP":      iMPIDefaultBootstrap,
+						"I_MPI_HYDRA_BOOTSTRAP_EXEC": fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName),
+					},
+				},
+				"withIMPIBootstrap": {
+					envVariables: map[string]string{
+						"I_MPI_HYDRA_BOOTSTRAP": "RSH",
+					},
+					expectedEnvVariables: map[string]string{
+						"I_MPI_HYDRA_BOOTSTRAP":      "RSH",
+						"I_MPI_HYDRA_BOOTSTRAP_EXEC": fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName),
+					},
+				},
+				"withIMPIBootstrapExec": {
+					envVariables: map[string]string{
+						"I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh",
+					},
+					expectedEnvVariables: map[string]string{
+						"I_MPI_HYDRA_BOOTSTRAP":      iMPIDefaultBootstrap,
+						"I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh",
+					},
+				},
+				"withIMPIBootstrapAndExec": {
+					envVariables: map[string]string{
+						"I_MPI_HYDRA_BOOTSTRAP":      "RSH",
+						"I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh",
+					},
+					expectedEnvVariables: map[string]string{
+						"I_MPI_HYDRA_BOOTSTRAP":      "RSH",
+						"I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh",
+					},
+				},
+			}
+
+			for testName, testCase := range testCases {
+				ctx := context.Background()
+				startTime := metav1.Now()
+				completionTime := metav1.Now()
+
+				jobName := "test-launcher-creation-" + strings.ToLower(testName)
+
+				mpiJob := newMPIJob(jobName, pointer.Int32(1), 1, gpuResourceName, &startTime, &completionTime)
+				Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())
+
+				template := &mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template
+				Expect(len(template.Spec.Containers) == 1).To(BeTrue())
+
+				cont := &template.Spec.Containers[0]
+
+				for k, v := range testCase.envVariables {
+					cont.Env = append(cont.Env,
+						corev1.EnvVar{
+							Name:  k,
+							Value: v,
+						},
+					)
+				}
+
+				launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", false)
+
+				Expect(len(launcher.Spec.Containers) == 1).To(BeTrue())
+				for expectedKey, expectedValue := range testCase.expectedEnvVariables {
+					Expect(launcher.Spec.Containers[0].Env).Should(ContainElements(
+						corev1.EnvVar{
+							Name:  expectedKey,
+							Value: expectedValue,
+						}),
+					)
+				}
+			}
+		})
+	})
 })
 
 func ReplicaStatusMatch(replicaStatuses map[common.ReplicaType]*common.ReplicaStatus,