kubeflow · k8s-ci-robot · Jan 21, 2019 · Jan 20, 2019 · Jan 21, 2019
diff --git a/cmd/mpi-operator/main.go b/cmd/mpi-operator/main.go
@@ -99,13 +99,13 @@ func init() {
 		&gpusPerNode,
 		"gpus-per-node",
 		1,
-		"The maximum number of GPUs available per node. Note that this will be ignored if the GPU resources are explicitly specified in the MPIJob pod spec.")
+		"(Deprecated. This will be overwritten by MPIJobSpec) The maximum number of GPUs available per node. Note that this will be ignored if the GPU resources are explicitly specified in the MPIJob pod spec.")
 	flag.StringVar(&kubectlDeliveryImage, "kubectl-delivery-image", "", "The container image used to deliver the kubectl binary.")
 	flag.StringVar(&namespace, "namespace", "", "The namespace used to obtain the listers.")
 	flag.IntVar(
 		&processingUnitsPerNode,
 		"processing-units-per-node",
 		1,
-		"The maximum number of processing units available per node. Note that this will be ignored if the processing resources are explicitly specified in the MPIJob pod spec.")
-	flag.StringVar(&processingResourceType, "processing-resource-type", "nvidia.com/gpu", "The compute resource name, e.g. 'nvidia.com/gpu' or 'cpu'.")
+		"(Deprecated. This will be overwritten by MPIJobSpec) The maximum number of processing units available per node. Note that this will be ignored if the processing resources are explicitly specified in the MPIJob pod spec.")
+	flag.StringVar(&processingResourceType, "processing-resource-type", "nvidia.com/gpu", "(Deprecated. This will be overwritten by MPIJobSpec) The compute resource name, e.g. 'nvidia.com/gpu' or 'cpu'.")
 }
diff --git a/deploy/0-crd.yaml b/deploy/0-crd.yaml
@@ -38,6 +38,11 @@ spec:
                 description: Defaults to the number of processing units per worker
                 type: integer
                 minimum: 1
+              gpusPerNode:
+                title: The maximum number of GPUs available per node
+                description: Defaults to the number of GPUs per worker
+                type: integer
+                minimum: 1
             required:
             - gpus
           - properties:
@@ -58,18 +63,37 @@ spec:
                 description: Defaults to the number of processing units per worker
                 type: integer
                 minimum: 1
+              processingUnitsPerNode:
+                title: The maximum number of processing units available per node
+                description: Defaults to the number of processing units per worker
+                type: integer
+                minimum: 1
+              processingResourceType:
+                title: The processing resource type, e.g. 'nvidia.com/gpu' or 'cpu'
+                description: Defaults to 'nvidia.com/gpu'
+                type: string
+                enum:
+                  - nvidia.com/gpu
+                  - cpu
             required:
             - processingUnits
           - properties:
               replicas:
                 title: Total number of replicas
-                description: The GPU resource limit should be specified for each replica
+                description: The processing resource limit should be specified for each replica
                 type: integer
                 minimum: 1
               slotsPerWorker:
                 title: The number of slots per worker used in hostfile
                 description: Defaults to the number of processing units per worker
                 type: integer
                 minimum: 1
+              processingResourceType:
+                title: The processing resource type, e.g. 'nvidia.com/gpu' or 'cpu'
+                description: Defaults to 'nvidia.com/gpu'
+                type: string
+                enum:
+                  - nvidia.com/gpu
+                  - cpu
             required:
             - replicas
diff --git a/pkg/apis/kubeflow/v1alpha1/types.go b/pkg/apis/kubeflow/v1alpha1/types.go
@@ -44,11 +44,28 @@ type MPIJobSpec struct {
 	// +optional
 	GPUs *int32 `json:"gpus,omitempty"`
 
+	// The maximum number of GPUs available per node.
+	// Note that this will be ignored if the GPU resources are explicitly
+	// specified in the MPIJob pod spec.
+	// This is deprecated in favor of the `ProcessingUnitsPerNode` field.
+	GPUsPerNode *int32 `json:"gpusPerNode,omitempty"`
+
 	// Specifies the desired number of processing units the MPIJob should run on.
 	// Mutually exclusive with the `Replicas` field.
 	// +optional
 	ProcessingUnits *int32 `json:"processingUnits,omitempty"`
 
+	// The maximum number of processing units available per node.
+	// Note that this will be ignored if the processing resources are explicitly
+	// specified in the MPIJob pod spec.
+	// +optional
+	ProcessingUnitsPerNode *int32 `json:"processingUnitsPerNode,omitempty"`
+
+	// The processing resource type, e.g. 'nvidia.com/gpu' or 'cpu'.
+	// Defaults to 'nvidia.com/gpu'.
+	// +optional
+	ProcessingResourceType string `json:"processingResourceType,omitempty"`
+
 	// Specifies the number of slots per worker used in hostfile.
 	// Defaults to the number of processing units per worker.
 	// +optional
@@ -71,7 +88,7 @@ type MPIJobSpec struct {
 	ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"`
 
 	// Specifies the desired number of replicas the MPIJob should run on.
-	// The `PodSpec` should specify the number of GPUs.
+	// The `PodSpec` should specify the number of processing units.
 	// Mutually exclusive with the `GPUs` or `ProcessingUnits` fields.
 	// +optional
 	Replicas *int32 `json:"replicas,omitempty"`

diff --git a/pkg/apis/kubeflow/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow/v1alpha1/zz_generated.deepcopy.go
diff --git a/pkg/controllers/mpi_job_controller.go b/pkg/controllers/mpi_job_controller.go
@@ -415,7 +415,22 @@ func (c *MPIJobController) syncHandler(key string) error {
 	// We're done if the launcher either succeeded or failed.
 	done := launcher != nil && (launcher.Status.Succeeded == 1 || launcher.Status.Failed == 1)
 
-	workerReplicas, processingUnitsPerWorker, err := allocateProcessingUnits(mpiJob, c.gpusPerNode, c.processingUnitsPerNode, c.processingResourceType, done)
+	// TODO (terrytangyuan): Remove these flags from main.go in the next major release
+	// and update deploy/*.yaml. Until then, values set in MPIJobSpec override the flags.
+	var gpusPerNode = c.gpusPerNode
+	var processingUnitsPerNode = c.processingUnitsPerNode
+	var processingResourceType = c.processingResourceType
+	if mpiJob.Spec.GPUsPerNode != nil {
+		gpusPerNode = int(*mpiJob.Spec.GPUsPerNode)
+	}
+	if mpiJob.Spec.ProcessingUnitsPerNode != nil {
+		processingUnitsPerNode = int(*mpiJob.Spec.ProcessingUnitsPerNode)
+	}
+	if mpiJob.Spec.ProcessingResourceType != "" {
+		processingResourceType = mpiJob.Spec.ProcessingResourceType
+	}
+
+	workerReplicas, processingUnitsPerWorker, err := allocateProcessingUnits(mpiJob, gpusPerNode, processingUnitsPerNode, processingResourceType, done)
 	if err != nil {
 		runtime.HandleError(err)
 		return nil
@@ -443,7 +458,7 @@ func (c *MPIJobController) syncHandler(key string) error {
 		}
 	}
 
-	worker, err := c.getOrCreateWorkerStatefulSet(mpiJob, workerReplicas, processingUnitsPerWorker, c.processingResourceType)
+	worker, err := c.getOrCreateWorkerStatefulSet(mpiJob, workerReplicas, processingUnitsPerWorker, processingResourceType)
 	if err != nil {
 		return err
 	}