kubeflow · google-oss-prow · Dec 21, 2022 · Nov 2, 2022 · Nov 2, 2022 · Nov 2, 2022
diff --git a/Makefile b/Makefile
@@ -1,7 +1,7 @@
 # Image URL to use all building/pushing image targets
 IMG ?= kubeflow/training-operator:latest
-# Produce CRDs that work back to Kubernetes 1.11 (no version conversion)
-CRD_OPTIONS ?= "crd:trivialVersions=true,preserveUnknownFields=false,generateEmbeddedObjectMeta=true"
+# CRD generation options
+CRD_OPTIONS ?= "crd:generateEmbeddedObjectMeta=true"
 
 # Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set)
 ifeq (,$(shell go env GOBIN))
@@ -109,7 +109,7 @@ PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))
 
 CONTROLLER_GEN = $(shell pwd)/bin/controller-gen
 controller-gen: ## Download controller-gen locally if necessary.
-	GOBIN=$(PROJECT_DIR)/bin go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.6.0
+	GOBIN=$(PROJECT_DIR)/bin go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.10.0
 
 KUSTOMIZE = $(shell pwd)/bin/kustomize
 kustomize: ## Download kustomize locally if necessary.

diff --git a/docs/api/kubeflow.org_v1_generated.asciidoc b/docs/api/kubeflow.org_v1_generated.asciidoc
@@ -55,7 +55,7 @@ Package v1 contains API Schema definitions for the kubeflow.org v1 API group
 | *`standalone`* __boolean__ | Start a local standalone rendezvous backend that is represented by a C10d TCP store on port 29400. Useful when launching single-node, multi-worker job. If specified --rdzv_backend, --rdzv_endpoint, --rdzv_id are auto-assigned; any explicitly set values are ignored.
 | *`nProcPerNode`* __integer__ | Number of workers per node; supported values: [auto, cpu, gpu, int].
 | *`maxRestarts`* __integer__ | 
-| *`metrics`* __link:https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#metricspec-v2beta2-autoscaling[$$MetricSpec$$] array__ | Metrics contains the specifications which are used to calculate the desired replica count (the maximum replica count across all metrics will be used).  The desired replica count is calculated with multiplying the ratio between the target value and the current value by the current number of pods. Ergo, metrics used must decrease as the pod count is increased, and vice-versa.  See the individual metric source types for more information about how each type of metric must respond. If not set, the HPA will not be created.
+| *`metrics`* __link:https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.23/#metricspec-v2-autoscaling[$$MetricSpec$$] array__ | Metrics contains the specifications which are used to calculate the desired replica count (the maximum replica count across all metrics will be used).  The desired replica count is calculated with multiplying the ratio between the target value and the current value by the current number of pods. Ergo, metrics used must decrease as the pod count is increased, and vice-versa.  See the individual metric source types for more information about how each type of metric must respond. If not set, the HPA will not be created.
 |===
 
 
@@ -213,7 +213,7 @@ MXJobSpec defines the desired state of MXJob
 | *`minReplicas`* __integer__ | minReplicas is the lower limit for the number of replicas to which the training job can scale down.  It defaults to null.
 | *`maxReplicas`* __integer__ | upper limit for the number of pods that can be set by the autoscaler; cannot be smaller than MinReplicas, defaults to null.
 | *`maxRestarts`* __integer__ | MaxRestarts is the limit for restart times of pods in elastic mode.
-| *`metrics`* __link:https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#metricspec-v2beta2-autoscaling[$$MetricSpec$$] array__ | Metrics contains the specifications which are used to calculate the desired replica count (the maximum replica count across all metrics will be used).  The desired replica count is calculated with multiplying the ratio between the target value and the current value by the current number of pods. Ergo, metrics used must decrease as the pod count is increased, and vice-versa.  See the individual metric source types for more information about how each type of metric must respond. If not set, the HPA will not be created.
+| *`metrics`* __link:https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.23/#metricspec-v2-autoscaling[$$MetricSpec$$] array__ | Metrics contains the specifications which are used to calculate the desired replica count (the maximum replica count across all metrics will be used).  The desired replica count is calculated with multiplying the ratio between the target value and the current value by the current number of pods. Ergo, metrics used must decrease as the pod count is increased, and vice-versa.  See the individual metric source types for more information about how each type of metric must respond. If not set, the HPA will not be created.
 |===
 
 

diff --git a/examples/pytorch/elastic/imagenet/Dockerfile b/examples/pytorch/elastic/imagenet/Dockerfile
@@ -1,22 +1,13 @@
-ARG BASE_IMAGE=pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime
+ARG BASE_IMAGE=pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime
 FROM $BASE_IMAGE
 
-# install utilities and dependencies
-RUN pip install classy-vision
-
 WORKDIR /workspace
 
 # download imagenet tiny for data
 RUN apt-get -q update && apt-get -q install -y wget unzip
 RUN wget -q http://cs231n.stanford.edu/tiny-imagenet-200.zip && unzip -q tiny-imagenet-200.zip -d data && rm tiny-imagenet-200.zip
 
 COPY . ./examples
-RUN chmod -R u+x ./examples/bin
-ENV PATH=/workspace/examples/bin:${PATH}
-
-# create a template classy project in /workspace/classy_vision
-# (see https://classyvision.ai/#quickstart)
-RUN classy-project classy_vision
 
 USER root
 ENTRYPOINT ["python", "-m", "torch.distributed.run"]

diff --git a/examples/pytorch/elastic/imagenet/imagenet.yaml b/examples/pytorch/elastic/imagenet/imagenet.yaml
@@ -6,8 +6,15 @@ spec:
   elasticPolicy:
     rdzvBackend: c10d
     minReplicas: 1
-    maxReplicas: 2
+    maxReplicas: 3
     maxRestarts: 100
+    metrics:
+      - type: Resource
+        resource:
+          name: cpu
+          target:
+            type: Utilization
+            averageUtilization: 80
   pytorchReplicaSpecs:
     Worker:
       replicas: 2
@@ -16,8 +23,11 @@ spec:
         spec:
           containers:
             - name: pytorch
-              image: kubeflow/pytorch-elastic-example-imagenet:1.0.0-sigterm
+              image: quay.io/johnugeorge/pytorch-elastic-example-imagenet:0.1
               imagePullPolicy: IfNotPresent
+              resources:
+                requests:
+                  cpu: 4
               env:
               - name: LOGLEVEL
                 value: DEBUG
@@ -27,7 +37,7 @@ spec:
                 - torch.distributed.run
                 - /workspace/examples/imagenet.py
                 - "--arch=resnet18"
-                - "--epochs=20"
+                - "--epochs=1"
                 - "--batch-size=32"
                 - "--workers=0"
                 - "/workspace/data/tiny-imagenet-200"
diff --git a/go.mod b/go.mod
@@ -4,7 +4,7 @@ go 1.19
 
 require (
 	github.com/go-logr/logr v1.2.3
-	github.com/kubeflow/common v0.4.3
+	github.com/kubeflow/common v0.4.4
 	github.com/onsi/ginkgo/v2 v2.1.6
 	github.com/onsi/gomega v1.20.1
 	github.com/prometheus/client_golang v1.12.2

diff --git a/go.sum b/go.sum
@@ -327,8 +327,8 @@ github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
-github.com/kubeflow/common v0.4.3 h1:vVoOMNPOZK4wzZvQ4rsRLvC3SDi+J1fVKNHSXC/QRvU=
-github.com/kubeflow/common v0.4.3/go.mod h1:Qb/5aON7/OWVkN8OnjRqqT0i8X/XzMekRIZ8lkLosj4=
+github.com/kubeflow/common v0.4.4 h1:QG9IgOHIsaS1bq1DtfW/yxtKkBTl5sTjRCyNicjdo10=
+github.com/kubeflow/common v0.4.4/go.mod h1:di43u2m7DyuwnRDb7Kwz1nmA/nhpjnQ+K+gWCV/SPZk=
 github.com/mailru/easyjson v0.0.0-20160728113105-d5b7844b561a/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
 github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
 github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=

diff --git a/hack/python-sdk/swagger.json b/hack/python-sdk/swagger.json
@@ -24,7 +24,7 @@
           "type": "array",
           "items": {
             "default": {},
-            "$ref": "#/definitions/k8s.io.api.autoscaling.v2beta2.MetricSpec"
+            "$ref": "#/definitions/k8s.io.api.autoscaling.v2.MetricSpec"
           }
         },
         "minReplicas": {
@@ -253,7 +253,7 @@
           "type": "array",
           "items": {
             "default": {},
-            "$ref": "#/definitions/k8s.io.api.autoscaling.v2beta2.MetricSpec"
+            "$ref": "#/definitions/k8s.io.api.autoscaling.v2.MetricSpec"
           }
         },
         "minReplicas": {
@@ -715,7 +715,7 @@
         },
         "labelSelector": {
           "description": "A label selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty label selector matches all objects. A null label selector matches no objects.",
-          "$ref": "#/definitions/v1.LabelSelector"
+          "type": "string"
         },
         "succeeded": {
           "description": "The number of pods which reached phase Succeeded.",

diff --git a/manifests/base/cluster-role.yaml b/manifests/base/cluster-role.yaml
@@ -85,3 +85,9 @@ rules:
       - podgroups
     verbs:
       - "*"
+  - apiGroups:
+      - autoscaling
+    resources:
+      - horizontalpodautoscalers
+    verbs:
+      - "*"