-
Notifications
You must be signed in to change notification settings - Fork 220
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add support for using Intel MPI(2019.7) and MVAPICH2 #283
Changes from 3 commits
642cb02
b3c5dcc
2be7003
603ec8d
35c16a5
d2dad84
a996d60
d2af63a
a7a3d61
af47f8f
03d687f
ba72818
50aaf5a
aa89133
e4bfa60
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -72,6 +72,11 @@ type MPIJobSpec struct { | |
// active. The policies specified in `RunPolicy` take precedence over | ||
// the following fields: `BackoffLimit` and `ActiveDeadlineSeconds`. | ||
RunPolicy *common.RunPolicy `json:"runPolicy,omitempty"` | ||
|
||
// MPIDistribution specifies name of the mpi framwork which is used | ||
// Deafults to "OpenMPI" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Defaults |
||
// Option includes "OpenMPI", "IntelMPI" and "MPICH" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Option -> Options Perhaps make them all lowercase and separate words with underscores? |
||
MPIDistribution string `json:"mpiDistribution,omitempty"` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we define it as a type instead of a string? (PS, personally, prefer OpenMPI, IntelMPI and MPICH here) type MPIDistribution string
const (
MPIDistributionOpenMPI = "openMPI"
...
) |
||
} | ||
|
||
// MPIReplicaType is the type for MPIReplica. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -958,11 +958,27 @@ func (c *MPIJobController) doUpdateJobStatus(mpiJob *kubeflow.MPIJob) error { | |
// resource. It also sets the appropriate OwnerReferences on the resource so | ||
// handleObject can discover the MPIJob resource that 'owns' it. | ||
func newConfigMap(mpiJob *kubeflow.MPIJob, workerReplicas int32) *corev1.ConfigMap { | ||
kubexec := fmt.Sprintf(`#!/bin/sh | ||
var kubexec string | ||
if mpiJob.Spec.MPIDistribution == "IntelMPI"{ | ||
kubexec = fmt.Sprintf(`#!/bin/sh | ||
set -x | ||
POD_NAME=$3 | ||
shift 3 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you add some notes in the code on the differences? |
||
%s/kubectl exec ${POD_NAME}`, kubectlMountPath) | ||
}else if mpiJob.Spec.MPIDistribution == "MPICH" { | ||
kubexec = fmt.Sprintf(`#!/bin/sh | ||
set -x | ||
POD_NAME=$2 | ||
shift 2 | ||
%s/kubectl exec ${POD_NAME}`, kubectlMountPath) | ||
}else { | ||
kubexec = fmt.Sprintf(`#!/bin/sh | ||
set -x | ||
POD_NAME=$1 | ||
shift | ||
%s/kubectl exec ${POD_NAME}`, kubectlMountPath) | ||
} | ||
|
||
if len(mpiJob.Spec.MainContainer) > 0 { | ||
kubexec = fmt.Sprintf("%s --container %s", kubexec, mpiJob.Spec.MainContainer) | ||
} | ||
|
@@ -975,7 +991,11 @@ shift | |
} | ||
var buffer bytes.Buffer | ||
for i := 0; i < int(workerReplicas); i++ { | ||
buffer.WriteString(fmt.Sprintf("%s%s-%d slots=%d\n", mpiJob.Name, workerSuffix, i, slots)) | ||
if mpiJob.Spec.MPIDistribution == "IntelMPI" || mpiJob.Spec.MPIDistribution == "MPICH" { | ||
buffer.WriteString(fmt.Sprintf("%s%s-%d:%d\n", mpiJob.Name, workerSuffix, i, slots)) | ||
}else{ | ||
buffer.WriteString(fmt.Sprintf("%s%s-%d slots=%d\n", mpiJob.Name, workerSuffix, i, slots)) | ||
} | ||
} | ||
|
||
return &corev1.ConfigMap{ | ||
|
@@ -1278,13 +1298,25 @@ func (c *MPIJobController) newLauncher(mpiJob *kubeflow.MPIJob, kubectlDeliveryI | |
return nil | ||
} | ||
container := podSpec.Spec.Containers[0] | ||
var mpiRshExecPathEnvName string | ||
var mpiHostfilePathEnvName string | ||
if mpiJob.Spec.MPIDistribution == "IntelMPI" { | ||
mpiRshExecPathEnvName = "I_MPI_HYDRA_BOOTSTRAP_EXEC" | ||
mpiHostfilePathEnvName = "I_MPI_HYDRA_HOST_FILE" | ||
}else if mpiJob.Spec.MPIDistribution == "MPICH" { | ||
mpiRshExecPathEnvName = "HYDRA_LAUNCHER_EXEC" | ||
mpiHostfilePathEnvName = "HYDRA_HOST_FILE" | ||
}else{ | ||
mpiRshExecPathEnvName = "OMPI_MCA_plm_rsh_agent" | ||
mpiHostfilePathEnvName = "OMPI_MCA_orte_default_hostfile" | ||
} | ||
container.Env = append(container.Env, | ||
corev1.EnvVar{ | ||
Name: "OMPI_MCA_plm_rsh_agent", | ||
Name: mpiRshExecPathEnvName, | ||
Value: fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName), | ||
}, | ||
corev1.EnvVar{ | ||
Name: "OMPI_MCA_orte_default_hostfile", | ||
Name: mpiHostfilePathEnvName, | ||
Value: fmt.Sprintf("%s/%s", configMountPath, hostfileName), | ||
}, | ||
// We overwrite these environment variables so that users will not | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
mpi -> MPI