Move processing unit specific flags to MPIJobSpec

kubeflow · Jan 20, 2019 · 036c760 · 036c760
1 parent cbc53f6
commit 036c760
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 5 deletions.
diff --git a/cmd/mpi-operator/main.go b/cmd/mpi-operator/main.go
@@ -99,13 +99,13 @@ func init() {
 		&gpusPerNode,
 		"gpus-per-node",
 		1,
-		"The maximum number of GPUs available per node. Note that this will be ignored if the GPU resources are explicitly specified in the MPIJob pod spec.")
+		"(Deprecated. This will be overwritten by MPIJobSpec) The maximum number of GPUs available per node. Note that this will be ignored if the GPU resources are explicitly specified in the MPIJob pod spec.")
 	flag.StringVar(&kubectlDeliveryImage, "kubectl-delivery-image", "", "The container image used to deliver the kubectl binary.")
 	flag.StringVar(&namespace, "namespace", "", "The namespace used to obtain the listers.")
 	flag.IntVar(
 		&processingUnitsPerNode,
 		"processing-units-per-node",
 		1,
-		"The maximum number of processing units available per node. Note that this will be ignored if the processing resources are explicitly specified in the MPIJob pod spec.")
-	flag.StringVar(&processingResourceType, "processing-resource-type", "nvidia.com/gpu", "The compute resource name, e.g. 'nvidia.com/gpu' or 'cpu'.")
+		"(Deprecated. This will be overwritten by MPIJobSpec) The maximum number of processing units available per node. Note that this will be ignored if the processing resources are explicitly specified in the MPIJob pod spec.")
+	flag.StringVar(&processingResourceType, "processing-resource-type", "nvidia.com/gpu", "(Deprecated. This will be overwritten by MPIJobSpec) The compute resource name, e.g. 'nvidia.com/gpu' or 'cpu'.")
 }
diff --git a/deploy/0-crd.yaml b/deploy/0-crd.yaml
@@ -38,6 +38,11 @@ spec:
                 description: Defaults to the number of processing units per worker
                 type: integer
                 minimum: 1
+              gpusPerNode:
+                title: The maximum number of GPUs available per node
+                description: Defaults to the number of GPUs per worker
+                type: integer
+                minimum: 1
             required:
             - gpus
           - properties:
@@ -58,18 +63,37 @@ spec:
                 description: Defaults to the number of processing units per worker
                 type: integer
                 minimum: 1
+              processingUnitsPerNode:
+                title: The maximum number of processing units available per node
+                description: Defaults to the number of processing units per worker
+                type: integer
+                minimum: 1
+              processingResourceType:
+                title: The processing resource type, e.g. 'nvidia.com/gpu' or 'cpu'
+                description: Defaults to 'nvidia.com/gpu'
+                type: string
+                enum:
+                  - nvidia.com/gpu
+                  - cpu
             required:
             - processingUnits
           - properties:
               replicas:
                 title: Total number of replicas
-                description: The GPU resource limit should be specified for each replica
+                description: The processing resource limit should be specified for each replica
                 type: integer
                 minimum: 1
               slotsPerWorker:
                 title: The number of slots per worker used in hostfile
                 description: Defaults to the number of processing units per worker
                 type: integer
                 minimum: 1
+              processingResourceType:
+                title: The processing resource type, e.g. 'nvidia.com/gpu' or 'cpu'
+                description: Defaults to 'nvidia.com/gpu'
+                type: string
+                enum:
+                  - nvidia.com/gpu
+                  - cpu
             required:
             - replicas
diff --git a/pkg/apis/kubeflow/v1alpha1/types.go b/pkg/apis/kubeflow/v1alpha1/types.go
@@ -44,11 +44,28 @@ type MPIJobSpec struct {
 	// +optional
 	GPUs *int32 `json:"gpus,omitempty"`
 
+	// The maximum number of GPUs available per node.
+	// Note that this will be ignored if the GPU resources are explicitly
+	// specified in the MPIJob pod spec.
+	// This is deprecated in favor of `ProcessingUnitsPerNode` field.
+	GPUsPerNode *int32 `json:"gpusPerNode,omitempty"`
+
 	// Specifies the desired number of processing units the MPIJob should run on.
 	// Mutually exclusive with the `Replicas` field.
 	// +optional
 	ProcessingUnits *int32 `json:"processingUnits,omitempty"`
 
+	// The maximum number of processing units available per node.
+	// Note that this will be ignored if the processing resources are explicitly
+	// specified in the MPIJob pod spec.
+	// +optional
+	ProcessingUnitsPerNode *int32 `json:"processingUnitsPerNode,omitempty"`
+
+	// The processing resource type, e.g. 'nvidia.com/gpu' or 'cpu'.
+	// Defaults to 'nvidia.com/gpu'
+	// +optional
+	ProcessingResourceType string `json:"processingResourceType,omitempty"`
+
 	// Specifies the number of slots per worker used in hostfile.
 	// Defaults to the number of processing units per worker.
 	// +optional
@@ -71,7 +88,7 @@ type MPIJobSpec struct {
 	ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"`
 
 	// Specifies the desired number of replicas the MPIJob should run on.
-	// The `PodSpec` should specify the number of GPUs.
+	// The `PodSpec` should specify the number of processing units.
 	// Mutually exclusive with the `GPUs` or `ProcessingUnits` fields.
 	// +optional
 	Replicas *int32 `json:"replicas,omitempty"`

diff --git a/pkg/apis/kubeflow/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow/v1alpha1/zz_generated.deepcopy.go
diff --git a/pkg/controllers/mpi_job_controller.go b/pkg/controllers/mpi_job_controller.go
@@ -415,6 +415,18 @@ func (c *MPIJobController) syncHandler(key string) error {
 	// We're done if the launcher either succeeded or failed.
 	done := launcher != nil && (launcher.Status.Succeeded == 1 || launcher.Status.Failed == 1)
 
+	// TODO (terrytangyuan): Remove these flags from main.go for next major release
+	// and update deploy/*.yaml
+	if mpiJob.Spec.GPUsPerNode != nil {
+		c.gpusPerNode = int(*mpiJob.Spec.GPUsPerNode)
+	}
+	if mpiJob.Spec.ProcessingUnitsPerNode != nil {
+		c.processingUnitsPerNode = int(*mpiJob.Spec.ProcessingUnitsPerNode)
+	}
+	if mpiJob.Spec.ProcessingResourceType != "" {
+		c.processingResourceType = mpiJob.Spec.ProcessingResourceType
+	}
+
 	workerReplicas, processingUnitsPerWorker, err := allocateProcessingUnits(mpiJob, c.gpusPerNode, c.processingUnitsPerNode, c.processingResourceType, done)
 	if err != nil {
 		runtime.HandleError(err)