Skip to content

Commit

Permalink
CARRY:implement e2e happy path (kubeflow#2)
Browse files Browse the repository at this point in the history
Signed-off-by: ted chang <[email protected]>
  • Loading branch information
tedhtchang authored and astefanutti committed Apr 5, 2024
1 parent 66329f0 commit ed82cdb
Show file tree
Hide file tree
Showing 12 changed files with 968 additions and 38 deletions.
36 changes: 24 additions & 12 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@ go 1.20
require (
github.com/go-logr/logr v1.4.1
github.com/google/go-cmp v0.6.0
github.com/onsi/ginkgo/v2 v2.14.0
github.com/onsi/gomega v1.30.0
github.com/onsi/ginkgo/v2 v2.15.0
github.com/onsi/gomega v1.31.1
github.com/project-codeflare/codeflare-common v0.0.0-20240404131119-10cc4982cd0e
github.com/prometheus/client_golang v1.18.0
github.com/sirupsen/logrus v1.9.0
github.com/stretchr/testify v1.8.4
Expand All @@ -25,57 +26,68 @@ require (
)

require (
github.com/aymerick/douceur v0.2.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cenkalti/backoff/v4 v4.2.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/evanphx/json-patch v5.6.0+incompatible // indirect
github.com/evanphx/json-patch/v5 v5.8.0 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/go-logr/zapr v1.3.0 // indirect
github.com/go-openapi/jsonpointer v0.19.6 // indirect
github.com/go-openapi/jsonpointer v0.20.0 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.22.3 // indirect
github.com/go-openapi/swag v0.22.4 // indirect
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang-jwt/jwt/v4 v4.5.0 // indirect
github.com/golang/glog v1.0.0 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/imdario/mergo v0.3.13 // indirect
github.com/google/pprof v0.0.0-20230323073829-e72429f035bd // indirect
github.com/google/uuid v1.3.1 // indirect
github.com/gorilla/css v1.0.0 // indirect
github.com/imdario/mergo v0.3.16 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect
github.com/microcosm-cc/bluemonday v1.0.18 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/openshift-online/ocm-sdk-go v0.1.368 // indirect
github.com/openshift/api v0.0.0-20230213134911-7ba313770556 // indirect
github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/project-codeflare/appwrapper v0.7.0 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/common v0.45.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
github.com/ray-project/kuberay/ray-operator v0.0.0-20231016183545-097828931d15 // indirect
github.com/spf13/pflag v1.0.5 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/exp v0.0.0-20220722155223-a9213eeb770e // indirect
golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect
golang.org/x/mod v0.14.0 // indirect
golang.org/x/net v0.19.0 // indirect
golang.org/x/net v0.20.0 // indirect
golang.org/x/oauth2 v0.12.0 // indirect
golang.org/x/sys v0.16.0 // indirect
golang.org/x/term v0.15.0 // indirect
golang.org/x/term v0.16.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/time v0.3.0 // indirect
golang.org/x/tools v0.16.1 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/appengine v1.6.8 // indirect
google.golang.org/protobuf v1.33.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/apiextensions-apiserver v0.29.0 // indirect
k8s.io/component-base v0.29.0 // indirect
k8s.io/component-base v0.29.1 // indirect
k8s.io/gengo v0.0.0-20230829151522-9cce18d56c01 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
Expand Down
649 changes: 627 additions & 22 deletions go.sum

Large diffs are not rendered by default.

10 changes: 6 additions & 4 deletions hack/swagger/go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
module github.com/kubeflow/training-operator/hack/swagger

go 1.20
go 1.21

toolchain go1.22.1

require (
github.com/kubeflow/training-operator v0.0.0-00010101000000-000000000000
Expand All @@ -13,9 +15,9 @@ replace github.com/kubeflow/training-operator => ../../
require (
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/go-logr/logr v1.4.1 // indirect
github.com/go-openapi/jsonpointer v0.19.6 // indirect
github.com/go-openapi/jsonpointer v0.20.0 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.22.3 // indirect
github.com/go-openapi/swag v0.22.4 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
Expand All @@ -27,7 +29,7 @@ require (
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/sirupsen/logrus v1.9.0 // indirect
golang.org/x/net v0.19.0 // indirect
golang.org/x/net v0.20.0 // indirect
golang.org/x/sys v0.16.0 // indirect
golang.org/x/text v0.14.0 // indirect
google.golang.org/protobuf v1.33.0 // indirect
Expand Down
3 changes: 3 additions & 0 deletions hack/swagger/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ github.com/go-logr/logr v1.3.0/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ4
github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE=
github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs=
github.com/go-openapi/jsonpointer v0.20.0/go.mod h1:6PGzBjjIIumbLYysB73Klnms1mwnU4G3YHOECG3CedA=
github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE=
github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k=
github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g=
github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
github.com/go-openapi/swag v0.22.4/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
Expand Down Expand Up @@ -83,6 +85,7 @@ golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwY
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U=
golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
Expand Down
23 changes: 23 additions & 0 deletions test/e2e/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"model_name_or_path": "bigscience/bloom-560m",
"training_data_path": "/etc/config/twitter_complaints_small.json",
"output_dir": "/tmp/out",
"num_train_epochs": 1.0,
"per_device_train_batch_size": 4,
"per_device_eval_batch_size": 4,
"gradient_accumulation_steps": 4,
"evaluation_strategy": "no",
"save_strategy": "epoch",
"learning_rate": 1e-5,
"weight_decay": 0.0,
"lr_scheduler_type": "cosine",
"logging_steps": 1.0,
"packing": false,
"include_tokens_per_second": true,
"response_template": "\n### Label:",
"dataset_text_field": "output",
"use_flash_attn": false,
"torch_dtype": "float32",
"peft_method": "pt",
"tokenizer_name_or_path": "bigscience/bloom"
}
147 changes: 147 additions & 0 deletions test/e2e/kfto_kueue_sft_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
/*
Copyright 2023.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
"testing"

"github.com/onsi/gomega"
. "github.com/onsi/gomega"
. "github.com/project-codeflare/codeflare-common/support"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
)

func PytorchJob(t Test, namespace, name string) func(g gomega.Gomega) *kftov1.PyTorchJob {
return func(g gomega.Gomega) *kftov1.PyTorchJob {
job, err := t.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Get(t.Ctx(), name, metav1.GetOptions{})
g.Expect(err).NotTo(gomega.HaveOccurred())
return job
}
}

func PytorchJobCondition(job *kftov1.PyTorchJob) string {
if len(job.Status.Conditions) == 0 {
return ""
}
return job.Status.Conditions[len(job.Status.Conditions)-1].Reason
}

func TestPytorchjobWithSFTtrainer(t *testing.T) {
test := With(t)
test.T().Parallel()

// Create a namespace
namespace := test.NewTestNamespace()
config := &corev1.ConfigMap{
TypeMeta: metav1.TypeMeta{
APIVersion: corev1.SchemeGroupVersion.String(),
Kind: "ConfigMap",
},
ObjectMeta: metav1.ObjectMeta{
Name: "my-config",
Namespace: namespace.Name,
Labels: map[string]string{
"kueue.x-k8s.io/queue-name": "lq-trainer",
},
},
BinaryData: map[string][]byte{
"config.json": ReadFile(test, "config.json"),
"twitter_complaints_small.json": ReadFile(test, "twitter_complaints_small.json"),
},
Immutable: Ptr(true),
}

config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())
test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name)

tuningJob := &kftov1.PyTorchJob{
TypeMeta: metav1.TypeMeta{
APIVersion: corev1.SchemeGroupVersion.String(),
Kind: "PyTorchJob",
},
ObjectMeta: metav1.ObjectMeta{
Name: "kfto-sft",
Namespace: namespace.Name,
},
Spec: kftov1.PyTorchJobSpec{
PyTorchReplicaSpecs: map[kftov1.ReplicaType]*kftov1.ReplicaSpec{
"Master": &kftov1.ReplicaSpec{
Replicas: Ptr(int32(1)),
RestartPolicy: "Never",
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "pytorch",
Image: "quay.io/tedchang/sft-trainer:dev",
ImagePullPolicy: corev1.PullIfNotPresent,
Command: []string{"python", "/app/launch_training.py"},
Env: []corev1.EnvVar{
{
Name: "SFT_TRAINER_CONFIG_JSON_PATH",
Value: "/etc/config/config.json",
},
},
VolumeMounts: []corev1.VolumeMount{
{
Name: "config-volume",
MountPath: "/etc/config",
},
},
},
},
Volumes: []corev1.Volume{
{
Name: "config-volume",
VolumeSource: corev1.VolumeSource{
ConfigMap: &corev1.ConfigMapVolumeSource{
LocalObjectReference: corev1.LocalObjectReference{
Name: "my-config",
},
Items: []corev1.KeyToPath{
{
Key: "config.json",
Path: "config.json",
},
{
Key: "twitter_complaints_small.json",
Path: "twitter_complaints_small.json",
},
},
},
},
},
},
},
},
},
},
},
}

tuningJob, err = test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace.Name).Create(test.Ctx(), tuningJob, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())
test.T().Logf("Created PytorchJob %s/%s successfully", tuningJob.Namespace, tuningJob.Name)

test.Eventually(PytorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutLong).Should(WithTransform(PytorchJobCondition, Equal("PyTorchJobSucceeded")))
test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
}
15 changes: 15 additions & 0 deletions test/e2e/kind-cluster-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
image: kindest/node:v1.25.3@sha256:f52781bc0d7a19fb6c405c2af83abfeb311f130707a0e219175677e366cc45d1
extraPortMappings:
- containerPort: 80
hostPort: 80
protocol: TCP
kubeadmConfigPatches:
- |
kind: InitConfiguration
nodeRegistration:
kubeletExtraArgs:
node-labels: "ingress-ready=true"
29 changes: 29 additions & 0 deletions test/e2e/kind.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash

# Copyright 2022 IBM, Red Hat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -euo pipefail
: "${INGRESS_NGINX_VERSION:=controller-v1.9.6}"

BASE_DIR=`pwd`
TEST_DIR="${BASE_DIR}/test/e2e"

echo "Creating KinD cluster"
kind create cluster --name training-operator-cluster --config ${TEST_DIR}/kind-cluster-config.yaml

echo "Deploying Ingress controller into KinD cluster"
curl https://raw.githubusercontent.com/kubernetes/ingress-nginx/"${INGRESS_NGINX_VERSION}"/deploy/static/provider/kind/deploy.yaml | sed "s/--publish-status-address=localhost/--report-node-internal-ip-address\\n - --status-update-interval=10/g" | kubectl apply -f -
kubectl annotate ingressclass nginx "ingressclass.kubernetes.io/is-default-class=true"
kubectl -n ingress-nginx wait --timeout=300s --for=condition=Available deployments --all
28 changes: 28 additions & 0 deletions test/e2e/kueue-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
name: "cpu-flavor"
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
name: "cq-small"
spec:
namespaceSelector: {} # match all.
resourceGroups:
- coveredResources: ["cpu", "memory"]
flavors:
- name: "cpu-flavor"
resources:
- name: "cpu"
nominalQuota: 5
- name: "memory"
nominalQuota: 20Gi
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
name: lq-trainer
namespace: default
spec:
clusterQueue: cq-small
Loading

0 comments on commit ed82cdb

Please sign in to comment.