Skip to content

Commit

Permalink
Move processing unit specific flags to MPIJobSpec
Browse files Browse the repository at this point in the history
  • Loading branch information
terrytangyuan committed Jan 20, 2019
1 parent cbc53f6 commit 036c760
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 5 deletions.
6 changes: 3 additions & 3 deletions cmd/mpi-operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,13 @@ func init() {
&gpusPerNode,
"gpus-per-node",
1,
"The maximum number of GPUs available per node. Note that this will be ignored if the GPU resources are explicitly specified in the MPIJob pod spec.")
"(Deprecated. This will be overwritten by MPIJobSpec) The maximum number of GPUs available per node. Note that this will be ignored if the GPU resources are explicitly specified in the MPIJob pod spec.")
flag.StringVar(&kubectlDeliveryImage, "kubectl-delivery-image", "", "The container image used to deliver the kubectl binary.")
flag.StringVar(&namespace, "namespace", "", "The namespace used to obtain the listers.")
flag.IntVar(
&processingUnitsPerNode,
"processing-units-per-node",
1,
"The maximum number of processing units available per node. Note that this will be ignored if the processing resources are explicitly specified in the MPIJob pod spec.")
flag.StringVar(&processingResourceType, "processing-resource-type", "nvidia.com/gpu", "The compute resource name, e.g. 'nvidia.com/gpu' or 'cpu'.")
"(Deprecated. This will be overwritten by MPIJobSpec) The maximum number of processing units available per node. Note that this will be ignored if the processing resources are explicitly specified in the MPIJob pod spec.")
flag.StringVar(&processingResourceType, "processing-resource-type", "nvidia.com/gpu", "(Deprecated. This will be overwritten by MPIJobSpec) The compute resource name, e.g. 'nvidia.com/gpu' or 'cpu'.")
}
26 changes: 25 additions & 1 deletion deploy/0-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ spec:
description: Defaults to the number of processing units per worker
type: integer
minimum: 1
gpusPerNode:
title: The maximum number of GPUs available per node
description: Defaults to the number of GPUs per worker
type: integer
minimum: 1
required:
- gpus
- properties:
Expand All @@ -58,18 +63,37 @@ spec:
description: Defaults to the number of processing units per worker
type: integer
minimum: 1
processingUnitsPerNode:
title: The maximum number of processing units available per node
description: Defaults to the number of processing units per worker
type: integer
minimum: 1
processingResourceType:
title: The processing resource type, e.g. 'nvidia.com/gpu' or 'cpu'
description: Defaults to 'nvidia.com/gpu'
type: string
enum:
- nvidia.com/gpu
- cpu
required:
- processingUnits
- properties:
replicas:
title: Total number of replicas
description: The GPU resource limit should be specified for each replica
description: The processing resource limit should be specified for each replica
type: integer
minimum: 1
slotsPerWorker:
title: The number of slots per worker used in hostfile
description: Defaults to the number of processing units per worker
type: integer
minimum: 1
processingResourceType:
title: The processing resource type, e.g. 'nvidia.com/gpu' or 'cpu'
description: Defaults to 'nvidia.com/gpu'
type: string
enum:
- nvidia.com/gpu
- cpu
required:
- replicas
19 changes: 18 additions & 1 deletion pkg/apis/kubeflow/v1alpha1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,28 @@ type MPIJobSpec struct {
// +optional
GPUs *int32 `json:"gpus,omitempty"`

// The maximum number of GPUs available per node.
// Note that this will be ignored if the GPU resources are explicitly
// specified in the MPIJob pod spec.
// This is deprecated in favor of `ProcessingUnitsPerNode` field.
GPUsPerNode *int32 `json:"gpusPerNode,omitempty"`

// Specifies the desired number of processing units the MPIJob should run on.
// Mutually exclusive with the `Replicas` field.
// +optional
ProcessingUnits *int32 `json:"processingUnits,omitempty"`

// The maximum number of processing units available per node.
// Note that this will be ignored if the processing resources are explicitly
// specified in the MPIJob pod spec.
// +optional
ProcessingUnitsPerNode *int32 `json:"processingUnitsPerNode,omitempty"`

// The processing resource type, e.g. 'nvidia.com/gpu' or 'cpu'.
// Defaults to 'nvidia.com/gpu'
// +optional
ProcessingResourceType string `json:"processingResourceType,omitempty"`

// Specifies the number of slots per worker used in hostfile.
// Defaults to the number of processing units per worker.
// +optional
Expand All @@ -71,7 +88,7 @@ type MPIJobSpec struct {
ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"`

// Specifies the desired number of replicas the MPIJob should run on.
// The `PodSpec` should specify the number of GPUs.
// The `PodSpec` should specify the number of processing units.
// Mutually exclusive with the `GPUs` or `ProcessingUnits` fields.
// +optional
Replicas *int32 `json:"replicas,omitempty"`
Expand Down
10 changes: 10 additions & 0 deletions pkg/apis/kubeflow/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions pkg/controllers/mpi_job_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,18 @@ func (c *MPIJobController) syncHandler(key string) error {
// We're done if the launcher either succeeded or failed.
done := launcher != nil && (launcher.Status.Succeeded == 1 || launcher.Status.Failed == 1)

// TODO (terrytangyuan): Remove these flags from main.go for next major release
// and update deploy/*.yaml
if mpiJob.Spec.GPUsPerNode != nil {
c.gpusPerNode = int(*mpiJob.Spec.GPUsPerNode)
}
if mpiJob.Spec.ProcessingUnitsPerNode != nil {
c.processingUnitsPerNode = int(*mpiJob.Spec.ProcessingUnitsPerNode)
}
if mpiJob.Spec.ProcessingResourceType != "" {
c.processingResourceType = mpiJob.Spec.ProcessingResourceType
}

workerReplicas, processingUnitsPerWorker, err := allocateProcessingUnits(mpiJob, c.gpusPerNode, c.processingUnitsPerNode, c.processingResourceType, done)
if err != nil {
runtime.HandleError(err)
Expand Down

0 comments on commit 036c760

Please sign in to comment.