Skip to content

Commit

Permalink
Adopting coschduling plugin (#1724)
Browse files Browse the repository at this point in the history
* Add scheduler plugin for coscheduling

Signed-off-by: Yuki Iwai <[email protected]>
Co-authored-by: Wang Zhang <[email protected]>
Signed-off-by: Yuki Iwai <[email protected]>

* generate codes

Signed-off-by: Yuki Iwai <[email protected]>

* Add a custom Watcher for scheduler-plugins PodGroup to controllers

Signed-off-by: Yuki Iwai <[email protected]>

* Fix predicates of paddlepaddle-controller for scheduling.volcano.sh/v1beta1 PodGroup

Signed-off-by: Yuki Iwai <[email protected]>

* Upgrade kubeflow/common version to v0.4.6 with HPA changes

Signed-off-by: Yuki Iwai <[email protected]>

Signed-off-by: Yuki Iwai <[email protected]>
Co-authored-by: Wang Zhang <[email protected]>
  • Loading branch information
tenzen-y and zw0610 authored Jan 21, 2023
1 parent ed33bd9 commit 0126d96
Show file tree
Hide file tree
Showing 55 changed files with 789 additions and 262 deletions.
21 changes: 17 additions & 4 deletions cmd/training-operator.v1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"flag"
"fmt"
"os"
"strings"

"go.uber.org/zap/zapcore"
"k8s.io/apimachinery/pkg/runtime"
Expand All @@ -29,8 +30,11 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
schedulerpluginsv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned"

"github.com/kubeflow/common/pkg/controller.v1/common"
commonutil "github.com/kubeflow/common/pkg/util"
kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
"github.com/kubeflow/training-operator/pkg/config"
Expand All @@ -47,6 +51,7 @@ func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
utilruntime.Must(kubeflowv1.AddToScheme(scheme))
utilruntime.Must(v1beta1.AddToScheme(scheme))
utilruntime.Must(schedulerpluginsv1alpha1.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme
}

Expand All @@ -56,7 +61,6 @@ func main() {
var leaderElectionID string
var probeAddr string
var enabledSchemes controllerv1.EnabledSchemes
var enableGangScheduling bool
var gangSchedulerName string
var namespace string
var monitoringPort int
Expand All @@ -69,8 +73,7 @@ func main() {
flag.StringVar(&leaderElectionID, "leader-election-id", "1ca428e5.training-operator.kubeflow.org", "The ID for leader election.")
flag.Var(&enabledSchemes, "enable-scheme", "Enable scheme(s) as --enable-scheme=tfjob --enable-scheme=pytorchjob, case insensitive."+
" Now supporting TFJob, PyTorchJob, MXNetJob, XGBoostJob, PaddleJob. By default, all supported schemes will be enabled.")
flag.BoolVar(&enableGangScheduling, "enable-gang-scheduling", false, "Set true to enable gang scheduling")
flag.StringVar(&gangSchedulerName, "gang-scheduler-name", "volcano", "The scheduler to gang-schedule kubeflow jobs, defaults to volcano")
flag.StringVar(&gangSchedulerName, "gang-scheduler-name", "none", "The scheduler to gang-schedule kubeflow jobs, defaults to none")
flag.StringVar(&namespace, "namespace", os.Getenv(commonutil.EnvKubeflowNamespace), "The namespace to monitor kubeflow jobs. If unset, it monitors all namespaces cluster-wide."+
"If set, it only monitors kubeflow jobs in the given namespace.")
flag.IntVar(&monitoringPort, "monitoring-port", 9443, "Endpoint port for displaying monitoring metrics. "+
Expand Down Expand Up @@ -110,6 +113,16 @@ func main() {
os.Exit(1)
}

// Prepare GangSchedulingSetupFunc
gangSchedulingSetupFunc := common.GenNonGangSchedulerSetupFunc()
if strings.EqualFold(gangSchedulerName, string(common.GangSchedulerVolcano)) {
cfg := mgr.GetConfig()
volcanoClientSet := volcanoclient.NewForConfigOrDie(cfg)
gangSchedulingSetupFunc = common.GenVolcanoSetupFunc(volcanoClientSet)
} else if strings.EqualFold(gangSchedulerName, string(common.GangSchedulerSchedulerPlugins)) {
gangSchedulingSetupFunc = common.GenSchedulerPluginsSetupFunc(mgr.GetClient())
}

// TODO: We need a general manager. all rest reconciler addsToManager
// Based on the user configuration, we start different controllers
if enabledSchemes.Empty() {
Expand All @@ -122,7 +135,7 @@ func main() {
"scheme not supported", "scheme", s)
os.Exit(1)
}
if err = setupFunc(mgr, enableGangScheduling, controllerThreads); err != nil {
if err = setupFunc(mgr, gangSchedulingSetupFunc, controllerThreads); err != nil {
setupLog.Error(err, "unable to create controller", "controller", s)
os.Exit(1)
}
Expand Down
11 changes: 6 additions & 5 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ go 1.19

require (
github.com/go-logr/logr v1.2.3
github.com/kubeflow/common v0.4.5
github.com/kubeflow/common v0.4.6
github.com/onsi/ginkgo/v2 v2.1.6
github.com/onsi/gomega v1.20.1
github.com/prometheus/client_golang v1.12.2
Expand All @@ -18,6 +18,7 @@ require (
k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1
k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed
sigs.k8s.io/controller-runtime v0.13.0
sigs.k8s.io/scheduler-plugins v0.24.9
sigs.k8s.io/yaml v1.3.0
volcano.sh/apis v1.2.0-k8s1.19.6
)
Expand Down Expand Up @@ -68,11 +69,11 @@ require (
go.uber.org/multierr v1.8.0 // indirect
golang.org/x/crypto v0.0.0-20220525230936-793ad666bf5e // indirect
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4 // indirect
golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect
golang.org/x/net v0.3.1-0.20221206200815-1e63c2f08a10 // indirect
golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 // indirect
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f // indirect
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 // indirect
golang.org/x/text v0.3.7 // indirect
golang.org/x/sys v0.3.0 // indirect
golang.org/x/term v0.3.0 // indirect
golang.org/x/text v0.5.0 // indirect
golang.org/x/time v0.0.0-20220609170525-579cf78fd858 // indirect
golang.org/x/tools v0.1.12 // indirect
gomodules.xyz/jsonpatch/v2 v2.2.0 // indirect
Expand Down
22 changes: 12 additions & 10 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -327,8 +327,8 @@ github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kubeflow/common v0.4.5 h1:W7p+s/4Za1UzIgKP2Z6ormEvsUVHykeaXaOuu8+UgpI=
github.com/kubeflow/common v0.4.5/go.mod h1:di43u2m7DyuwnRDb7Kwz1nmA/nhpjnQ+K+gWCV/SPZk=
github.com/kubeflow/common v0.4.6 h1:yzJf/HEdS6ginD0GlVkgbOFie0Sp66VdGjXidAGZIlk=
github.com/kubeflow/common v0.4.6/go.mod h1:43MAof/uhpJA2C0urynqatE3oKFQc7m2HLmJty7waqY=
github.com/mailru/easyjson v0.0.0-20160728113105-d5b7844b561a/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
Expand Down Expand Up @@ -555,8 +555,8 @@ golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96b
golang.org/x/net v0.0.0-20210503060351-7fd8e65b6420/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20210525063256-abc453219eb5/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b h1:PxfKdU9lEEDYjdIzOtC4qFWgkU2rGHdKlKowJSMN9h0=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.3.1-0.20221206200815-1e63c2f08a10 h1:Frnccbp+ok2GkUS2tC84yAq/U9Vg+0sIO7aRL3T4Xnc=
golang.org/x/net v0.3.1-0.20221206200815-1e63c2f08a10/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
Expand Down Expand Up @@ -648,11 +648,11 @@ golang.org/x/sys v0.0.0-20210823070655-63515b42dcdf/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20210908233432-aa78b53d3365/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220412211240-33da011f77ad/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f h1:v4INt8xihDGvnrfjMDVXGxw9wrfxYyCjk0KbXjhR55s=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.3.0 h1:w8ZOecv6NaNa/zC8944JTU3vz4u6Lagfk4RPQxv92NQ=
golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 h1:JGgROgKl9N8DuW20oFS5gxc+lE67/N3FcwmBPMe7ArY=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.3.0 h1:qoo4akIqOcDME5bhc/NgxUdovd6BSS2uMsVjB56q1xI=
golang.org/x/term v0.3.0/go.mod h1:q750SLmJuPmVoN1blW3UFBPREJfb1KmY3vwxfr+nFDA=
golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
Expand All @@ -661,8 +661,8 @@ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.5.0 h1:OLmvp0KP+FVG99Ct/qFiL/Fhk4zp4QQnZ7b2U+5piUM=
golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
Expand Down Expand Up @@ -954,6 +954,8 @@ sigs.k8s.io/controller-runtime v0.13.0 h1:iqa5RNciy7ADWnIc8QxCbOX5FEKVR3uxVxKHRM
sigs.k8s.io/controller-runtime v0.13.0/go.mod h1:Zbz+el8Yg31jubvAEyglRZGdLAjplZl+PgtYNI6WNTI=
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN1p0AC/kzH07hu3NE+k=
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
sigs.k8s.io/scheduler-plugins v0.24.9 h1:9oGtwk6uh7mZMCX8+O+PipQzBiRq9d2+E3xq1cn7zbc=
sigs.k8s.io/scheduler-plugins v0.24.9/go.mod h1:0u2b/0SwY2ozDhOD/f1S3e5IbStoDFLUK8yP5dJTaQ8=
sigs.k8s.io/structured-merge-diff/v4 v4.0.1/go.mod h1:bJZC9H9iH24zzfZ/41RGcq60oK1F7G282QMXDPYydCw=
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE=
sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E=
Expand Down
10 changes: 9 additions & 1 deletion hack/python-sdk/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,11 @@
"format": "int32"
},
"labelSelector": {
"description": "A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.",
"description": "Deprecated: Use Selector instead",
"$ref": "#/definitions/v1.LabelSelector"
},
"selector": {
"description": "A Selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty Selector matches all objects. A null Selector matches no objects.",
"type": "string"
},
"succeeded": {
Expand Down Expand Up @@ -773,6 +777,10 @@
},
"queue": {
"type": "string"
},
"scheduleTimeoutSeconds": {
"type": "integer",
"format": "int32"
}
}
}
Expand Down
6 changes: 6 additions & 0 deletions manifests/base/cluster-role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,9 @@ rules:
- horizontalpodautoscalers
verbs:
- "*"
- apiGroups:
- scheduling.sigs.k8s.io
resources:
- podgroups
verbs:
- "*"
56 changes: 52 additions & 4 deletions manifests/base/crds/kubeflow.org_mpijobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7397,6 +7397,9 @@ spec:
type: string
queue:
type: string
scheduleTimeoutSeconds:
format: int32
type: integer
type: object
ttlSecondsAfterFinished:
description: TTLSecondsAfterFinished is the TTL to clean up jobs.
Expand Down Expand Up @@ -7476,10 +7479,55 @@ spec:
format: int32
type: integer
labelSelector:
description: A label selector is a label query over a set of
resources. The result of matchLabels and matchExpressions
are ANDed. An empty label selector matches all objects. A
null label selector matches no objects.
description: 'Deprecated: Use Selector instead'
properties:
matchExpressions:
description: matchExpressions is a list of label selector
requirements. The requirements are ANDed.
items:
description: A label selector requirement is a selector
that contains values, a key, and an operator that relates
the key and values.
properties:
key:
description: key is the label key that the selector
applies to.
type: string
operator:
description: operator represents a key's relationship
to a set of values. Valid operators are In, NotIn,
Exists and DoesNotExist.
type: string
values:
description: values is an array of string values.
If the operator is In or NotIn, the values array
must be non-empty. If the operator is Exists or
DoesNotExist, the values array must be empty. This
array is replaced during a strategic merge patch.
items:
type: string
type: array
required:
- key
- operator
type: object
type: array
matchLabels:
additionalProperties:
type: string
description: matchLabels is a map of {key,value} pairs.
A single {key,value} in the matchLabels map is equivalent
to an element of matchExpressions, whose key field is
"key", the operator is "In", and the values array contains
only "value". The requirements are ANDed.
type: object
type: object
x-kubernetes-map-type: atomic
selector:
description: A Selector is a label query over a set of resources.
The result of matchLabels and matchExpressions are ANDed.
An empty Selector matches all objects. A null Selector matches
no objects.
type: string
succeeded:
description: The number of pods which reached phase Succeeded.
Expand Down
56 changes: 52 additions & 4 deletions manifests/base/crds/kubeflow.org_mxjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7397,6 +7397,9 @@ spec:
type: string
queue:
type: string
scheduleTimeoutSeconds:
format: int32
type: integer
type: object
ttlSecondsAfterFinished:
description: TTLSecondsAfterFinished is the TTL to clean up jobs.
Expand Down Expand Up @@ -7472,10 +7475,55 @@ spec:
format: int32
type: integer
labelSelector:
description: A label selector is a label query over a set of
resources. The result of matchLabels and matchExpressions
are ANDed. An empty label selector matches all objects. A
null label selector matches no objects.
description: 'Deprecated: Use Selector instead'
properties:
matchExpressions:
description: matchExpressions is a list of label selector
requirements. The requirements are ANDed.
items:
description: A label selector requirement is a selector
that contains values, a key, and an operator that relates
the key and values.
properties:
key:
description: key is the label key that the selector
applies to.
type: string
operator:
description: operator represents a key's relationship
to a set of values. Valid operators are In, NotIn,
Exists and DoesNotExist.
type: string
values:
description: values is an array of string values.
If the operator is In or NotIn, the values array
must be non-empty. If the operator is Exists or
DoesNotExist, the values array must be empty. This
array is replaced during a strategic merge patch.
items:
type: string
type: array
required:
- key
- operator
type: object
type: array
matchLabels:
additionalProperties:
type: string
description: matchLabels is a map of {key,value} pairs.
A single {key,value} in the matchLabels map is equivalent
to an element of matchExpressions, whose key field is
"key", the operator is "In", and the values array contains
only "value". The requirements are ANDed.
type: object
type: object
x-kubernetes-map-type: atomic
selector:
description: A Selector is a label query over a set of resources.
The result of matchLabels and matchExpressions are ANDed.
An empty Selector matches all objects. A null Selector matches
no objects.
type: string
succeeded:
description: The number of pods which reached phase Succeeded.
Expand Down
Loading

0 comments on commit 0126d96

Please sign in to comment.