Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move processing unit specific flags to MPIJobSpec #85

Merged
merged 2 commits into from
Jan 21, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions cmd/mpi-operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,13 @@ func init() {
&gpusPerNode,
"gpus-per-node",
1,
"The maximum number of GPUs available per node. Note that this will be ignored if the GPU resources are explicitly specified in the MPIJob pod spec.")
"(Deprecated. This will be overwritten by MPIJobSpec) The maximum number of GPUs available per node. Note that this will be ignored if the GPU resources are explicitly specified in the MPIJob pod spec.")
flag.StringVar(&kubectlDeliveryImage, "kubectl-delivery-image", "", "The container image used to deliver the kubectl binary.")
flag.StringVar(&namespace, "namespace", "", "The namespace used to obtain the listers.")
flag.IntVar(
&processingUnitsPerNode,
"processing-units-per-node",
1,
"The maximum number of processing units available per node. Note that this will be ignored if the processing resources are explicitly specified in the MPIJob pod spec.")
flag.StringVar(&processingResourceType, "processing-resource-type", "nvidia.com/gpu", "The compute resource name, e.g. 'nvidia.com/gpu' or 'cpu'.")
"(Deprecated. This will be overwritten by MPIJobSpec) The maximum number of processing units available per node. Note that this will be ignored if the processing resources are explicitly specified in the MPIJob pod spec.")
flag.StringVar(&processingResourceType, "processing-resource-type", "nvidia.com/gpu", "(Deprecated. This will be overwritten by MPIJobSpec) The compute resource name, e.g. 'nvidia.com/gpu' or 'cpu'.")
}
26 changes: 25 additions & 1 deletion deploy/0-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ spec:
description: Defaults to the number of processing units per worker
type: integer
minimum: 1
gpusPerNode:
title: The maximum number of GPUs available per node
description: Defaults to the number of GPUs per worker
type: integer
minimum: 1
required:
- gpus
- properties:
Expand All @@ -58,18 +63,37 @@ spec:
description: Defaults to the number of processing units per worker
type: integer
minimum: 1
processingUnitsPerNode:
title: The maximum number of processing units available per node
description: Defaults to the number of processing units per worker
type: integer
minimum: 1
processingResourceType:
title: The processing resource type, e.g. 'nvidia.com/gpu' or 'cpu'
description: Defaults to 'nvidia.com/gpu'
type: string
enum:
- nvidia.com/gpu
- cpu
required:
- processingUnits
- properties:
replicas:
title: Total number of replicas
description: The GPU resource limit should be specified for each replica
description: The processing resource limit should be specified for each replica
type: integer
minimum: 1
slotsPerWorker:
title: The number of slots per worker used in hostfile
description: Defaults to the number of processing units per worker
type: integer
minimum: 1
processingResourceType:
title: The processing resource type, e.g. 'nvidia.com/gpu' or 'cpu'
description: Defaults to 'nvidia.com/gpu'
type: string
enum:
- nvidia.com/gpu
- cpu
required:
- replicas
19 changes: 18 additions & 1 deletion pkg/apis/kubeflow/v1alpha1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,28 @@ type MPIJobSpec struct {
// +optional
GPUs *int32 `json:"gpus,omitempty"`

// The maximum number of GPUs available per node.
// Note that this will be ignored if the GPU resources are explicitly
// specified in the MPIJob pod spec.
// This is deprecated in favor of `ProcessingUnitsPerNode` field.
GPUsPerNode *int32 `json:"gpusPerNode,omitempty"`

// Specifies the desired number of processing units the MPIJob should run on.
// Mutually exclusive with the `Replicas` field.
// +optional
ProcessingUnits *int32 `json:"processingUnits,omitempty"`

// The maximum number of processing units available per node.
// Note that this will be ignored if the processing resources are explicitly
// specified in the MPIJob pod spec.
// +optional
ProcessingUnitsPerNode *int32 `json:"processingUnitsPerNode,omitempty"`

// The processing resource type, e.g. 'nvidia.com/gpu' or 'cpu'.
// Defaults to 'nvidia.com/gpu'
// +optional
ProcessingResourceType string `json:"processingResourceType,omitempty"`

// Specifies the number of slots per worker used in hostfile.
// Defaults to the number of processing units per worker.
// +optional
Expand All @@ -71,7 +88,7 @@ type MPIJobSpec struct {
ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"`

// Specifies the desired number of replicas the MPIJob should run on.
// The `PodSpec` should specify the number of GPUs.
// The `PodSpec` should specify the number of processing units.
// Mutually exclusive with the `GPUs` or `ProcessingUnits` fields.
// +optional
Replicas *int32 `json:"replicas,omitempty"`
Expand Down
10 changes: 10 additions & 0 deletions pkg/apis/kubeflow/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 17 additions & 2 deletions pkg/controllers/mpi_job_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,22 @@ func (c *MPIJobController) syncHandler(key string) error {
// We're done if the launcher either succeeded or failed.
done := launcher != nil && (launcher.Status.Succeeded == 1 || launcher.Status.Failed == 1)

workerReplicas, processingUnitsPerWorker, err := allocateProcessingUnits(mpiJob, c.gpusPerNode, c.processingUnitsPerNode, c.processingResourceType, done)
// TODO (terrytangyuan): Remove these flags from main.go for next major release
// and update deploy/*.yaml
var gpusPerNode = c.gpusPerNode
var processingUnitsPerNode = c.processingUnitsPerNode
var processingResourceType = c.processingResourceType
if mpiJob.Spec.GPUsPerNode != nil {
gpusPerNode = int(*mpiJob.Spec.GPUsPerNode)
}
if mpiJob.Spec.ProcessingUnitsPerNode != nil {
processingUnitsPerNode = int(*mpiJob.Spec.ProcessingUnitsPerNode)
}
if mpiJob.Spec.ProcessingResourceType != "" {
processingResourceType = mpiJob.Spec.ProcessingResourceType
}

workerReplicas, processingUnitsPerWorker, err := allocateProcessingUnits(mpiJob, gpusPerNode, processingUnitsPerNode, processingResourceType, done)
if err != nil {
runtime.HandleError(err)
return nil
Expand Down Expand Up @@ -443,7 +458,7 @@ func (c *MPIJobController) syncHandler(key string) error {
}
}

worker, err := c.getOrCreateWorkerStatefulSet(mpiJob, workerReplicas, processingUnitsPerWorker, c.processingResourceType)
worker, err := c.getOrCreateWorkerStatefulSet(mpiJob, workerReplicas, processingUnitsPerWorker, processingResourceType)
if err != nil {
return err
}
Expand Down