diff --git a/pkg/controller.v1/mpi/mpijob.go b/pkg/controller.v1/mpi/mpijob.go index a6644831ff..f51f68d9a7 100644 --- a/pkg/controller.v1/mpi/mpijob.go +++ b/pkg/controller.v1/mpi/mpijob.go @@ -45,6 +45,7 @@ const ( initContainerCpu = "100m" initContainerEphStorage = "5Gi" initContainerMem = "512Mi" + iMPIDefaultBootstrap = "rsh" ) const ( @@ -218,6 +219,26 @@ func isGPULauncher(mpiJob *kubeflowv1.MPIJob) bool { return false } +// hasIntelMPIBootstrapValues returns the existence of I_MPI_HYDRA_BOOTSTRAP +// and I_MPI_HYDRA_BOOTSTRAP_EXEC values. +// There are also _EXEC_EXTRA_ARGS and _AUTOFORK under the I_MPI_HYDRA_BOOTSTRAP +// prefix but those are not checked on purpose. +func hasIntelMPIBootstrapValues(envs []corev1.EnvVar) (bootstrap, exec bool) { + for _, env := range envs { + if env.Name == "I_MPI_HYDRA_BOOTSTRAP" { + bootstrap = true + } else if env.Name == "I_MPI_HYDRA_BOOTSTRAP_EXEC" { + exec = true + } + + if bootstrap && exec { + break + } + } + + return bootstrap, exec +} + func defaultReplicaLabels(genericLabels map[string]string, roleLabelVal string) map[string]string { replicaLabels := map[string]string{} for k, v := range genericLabels { diff --git a/pkg/controller.v1/mpi/mpijob_controller.go b/pkg/controller.v1/mpi/mpijob_controller.go index a1a95067fd..6108c644ec 100644 --- a/pkg/controller.v1/mpi/mpijob_controller.go +++ b/pkg/controller.v1/mpi/mpijob_controller.go @@ -1152,6 +1152,25 @@ func (jc *MPIJobReconciler) newLauncher(mpiJob *kubeflowv1.MPIJob, kubectlDelive }) } + // Add default Intel MPI bootstrap variables if not provided by the user. + bootstrap, exec := hasIntelMPIBootstrapValues(container.Env) + if !bootstrap { + container.Env = append(container.Env, + corev1.EnvVar{ + Name: "I_MPI_HYDRA_BOOTSTRAP", + Value: iMPIDefaultBootstrap, + }, + ) + } + if !exec { + container.Env = append(container.Env, + corev1.EnvVar{ + Name: "I_MPI_HYDRA_BOOTSTRAP_EXEC", + Value: fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName), + }, + ) + } + container.VolumeMounts = append(container.VolumeMounts, corev1.VolumeMount{ Name: kubectlVolumeName, diff --git a/pkg/controller.v1/mpi/mpijob_controller_test.go b/pkg/controller.v1/mpi/mpijob_controller_test.go index 6500e67258..bf867deb6d 100644 --- a/pkg/controller.v1/mpi/mpijob_controller_test.go +++ b/pkg/controller.v1/mpi/mpijob_controller_test.go @@ -709,6 +709,91 @@ var _ = Describe("MPIJob controller", func() { }) }) + Context("Test launcher's Intel MPI handling", func() { + It("Should create a launcher job with Intel MPI env variables", func() { + By("By creating MPIJobs with and without preset env variables") + + testCases := map[string]struct { + envVariables map[string]string + expectedEnvVariables map[string]string + }{ + "withoutIMPIValues": { + envVariables: map[string]string{ + "X_MPI_HYDRA_BOOTSTRAP": "foo", + }, + expectedEnvVariables: map[string]string{ + "I_MPI_HYDRA_BOOTSTRAP": iMPIDefaultBootstrap, + "I_MPI_HYDRA_BOOTSTRAP_EXEC": fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName), + }, + }, + "withIMPIBootstrap": { + envVariables: map[string]string{ + "I_MPI_HYDRA_BOOTSTRAP": "RSH", + }, + expectedEnvVariables: map[string]string{ + "I_MPI_HYDRA_BOOTSTRAP": "RSH", + "I_MPI_HYDRA_BOOTSTRAP_EXEC": fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName), + }, + }, + "withIMPIBootstrapExec": { + envVariables: map[string]string{ + "I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh", + }, + expectedEnvVariables: map[string]string{ + "I_MPI_HYDRA_BOOTSTRAP": iMPIDefaultBootstrap, + "I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh", + }, + }, + "withIMPIBootstrapAndExec": { + envVariables: map[string]string{ + "I_MPI_HYDRA_BOOTSTRAP": "RSH", + "I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh", + }, + expectedEnvVariables: map[string]string{ + "I_MPI_HYDRA_BOOTSTRAP": "RSH", + "I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh", + }, + }, + } + + for testName, testCase := range testCases { + ctx := context.Background() + startTime := metav1.Now() + completionTime := metav1.Now() + + jobName := "test-launcher-creation-" + strings.ToLower(testName) + + mpiJob := newMPIJob(jobName, pointer.Int32(1), 1, gpuResourceName, &startTime, &completionTime) + Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed()) + + template := &mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template + Expect(len(template.Spec.Containers) == 1).To(BeTrue()) + + cont := &template.Spec.Containers[0] + + for k, v := range testCase.envVariables { + cont.Env = append(cont.Env, + corev1.EnvVar{ + Name: k, + Value: v, + }, + ) + } + + launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", false) + + Expect(len(launcher.Spec.Containers) == 1).To(BeTrue()) + for expectedKey, expectedValue := range testCase.expectedEnvVariables { + Expect(launcher.Spec.Containers[0].Env).Should(ContainElements( + corev1.EnvVar{ + Name: expectedKey, + Value: expectedValue, + }), + ) + } + } + }) + }) }) func ReplicaStatusMatch(replicaStatuses map[common.ReplicaType]*common.ReplicaStatus,