[kueue] Wait for webhooks setup before ready. #1674

Closed · wants to merge 4 commits

22 changes: 17 additions & 5 deletions cmd/kueue/main.go
@@ -18,7 +18,9 @@ package main

 import (
     "context"
+    "errors"
     "flag"
+    "net/http"
     "os"

     // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
@@ -162,12 +164,13 @@ func main() {
     debugger.NewDumper(cCache, queues).ListenForSignal(ctx)

     serverVersionFetcher := setupServerVersionFetcher(mgr, kubeConfig)
+    webhooksStartedChan := make(chan struct{})

-    setupProbeEndpoints(mgr)
+    setupProbeEndpoints(mgr, webhooksStartedChan)
     // Cert won't be ready until manager starts, so start a goroutine here which
     // will block until the cert is ready before setting up the controllers.
     // Controllers who register after manager starts will start directly.
-    go setupControllers(mgr, cCache, queues, certsReady, &cfg, serverVersionFetcher)
+    go setupControllers(mgr, cCache, queues, certsReady, &cfg, serverVersionFetcher, webhooksStartedChan)

     go func() {
         queues.CleanUpOnContext(ctx)
@@ -218,7 +221,7 @@ func setupIndexes(ctx context.Context, mgr ctrl.Manager, cfg *configapi.Configur
     return jobframework.SetupIndexes(ctx, mgr.GetFieldIndexer(), opts...)
 }

-func setupControllers(mgr ctrl.Manager, cCache *cache.Cache, queues *queue.Manager, certsReady chan struct{}, cfg *configapi.Configuration, serverVersionFetcher *kubeversion.ServerVersionFetcher) {
+func setupControllers(mgr ctrl.Manager, cCache *cache.Cache, queues *queue.Manager, certsReady chan struct{}, cfg *configapi.Configuration, serverVersionFetcher *kubeversion.ServerVersionFetcher, webhooksStartedChan chan struct{}) {
     // The controllers won't work until the webhooks are operating, and the webhook won't work until the
     // certs are all in place.
     cert.WaitForCertsReady(setupLog, certsReady)
@@ -267,18 +270,27 @@ func setupControllers(mgr ctrl.Manager, cCache *cache.Cache, queues *queue.Manag
         setupLog.Error(err, "Unable to create controller or webhook", "kubernetesVersion", serverVersionFetcher.GetServerVersion())
         os.Exit(1)
     }
+
+    close(webhooksStartedChan)
     // +kubebuilder:scaffold:builder
 }

 // setupProbeEndpoints registers the health endpoints
-func setupProbeEndpoints(mgr ctrl.Manager) {
+func setupProbeEndpoints(mgr ctrl.Manager, webhooksStartedChan chan struct{}) {
     defer setupLog.Info("Probe endpoints are configured on healthz and readyz")

     if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
         setupLog.Error(err, "unable to set up health check")
         os.Exit(1)
     }
-    if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
+    if err := mgr.AddReadyzCheck("readyz", func(_ *http.Request) error {
+        select {
+        case <-webhooksStartedChan:
+            return nil
+        default:
+            return errors.New("webhooks not ready")
+        }
+    }); err != nil {
         setupLog.Error(err, "unable to set up ready check")
         os.Exit(1)
     }
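
Taken together, the main.go changes gate the manager's readyz probe on a channel that setupControllers closes once every webhook is registered, so the pod only reports Ready when admission requests can actually be served. A minimal, runnable sketch of this close-once pattern (identifiers are illustrative, not Kueue's):

package main

import (
    "errors"
    "fmt"
    "net/http"
    "time"
)

func main() {
    webhooksStarted := make(chan struct{})

    // readyz has the shape of a controller-runtime checker: it fails until
    // the channel is closed, then succeeds for every subsequent caller.
    readyz := func(_ *http.Request) error {
        select {
        case <-webhooksStarted:
            return nil
        default:
            return errors.New("webhooks not ready")
        }
    }

    fmt.Println(readyz(nil)) // "webhooks not ready" while setup is in flight

    go func() {
        time.Sleep(10 * time.Millisecond) // stand-in for webhook registration
        close(webhooksStarted)            // closing once flips readiness for all callers
    }()

    time.Sleep(50 * time.Millisecond)
    fmt.Println(readyz(nil)) // <nil>: the probe now reports ready
}

Closing the channel, rather than sending on it, is what makes the signal broadcast-safe: every current and future receive unblocks, and the close happens exactly once.
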
16 changes: 16 additions & 0 deletions hack/multikueue/manager-cluster.kind.yaml
@@ -1,2 +1,18 @@
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+- role: control-plane
+  kubeadmConfigPatches:
+  - |
+    kind: ClusterConfiguration
+    apiVersion: kubeadm.k8s.io/v1beta3
+    scheduler:
+      extraArgs:
+        v: "2"
+    controllerManager:
+      extraArgs:
+        v: "2"
+    apiServer:
+      extraArgs:
+        enable-aggregator-routing: "true"
+        v: "5"
16 changes: 16 additions & 0 deletions hack/multikueue/worker-cluster.kind.yaml
@@ -2,3 +2,19 @@ kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
networking:
  apiServerAddress: "FILLED_AT_RUNTIME"
+nodes:
+- role: control-plane
+  kubeadmConfigPatches:
+  - |
+    kind: ClusterConfiguration
+    apiVersion: kubeadm.k8s.io/v1beta3
+    scheduler:
+      extraArgs:
+        v: "2"
+    controllerManager:
+      extraArgs:
+        v: "2"
+    apiServer:
+      extraArgs:
+        enable-aggregator-routing: "true"
+        v: "5"
4 changes: 2 additions & 2 deletions test/util/constants.go
@@ -26,8 +26,8 @@ const (
     // such as running pods to completion.
     LongTimeout = 45 * time.Second
     // StartupTimeout is meant to be used for waiting for Kueue to startup, given
-    // that cert updates can take up to 2 minutes to propagate to the filesystem.
-    StartUpTimeout = 3 * time.Minute
+    // that cert updates can take up to 3 minutes to propagate to the filesystem.
+    StartUpTimeout = 4 * time.Minute
     ConsistentDuration = time.Second * 3
     Interval = time.Millisecond * 250
)
20 changes: 14 additions & 6 deletions test/util/e2e.go
@@ -65,11 +65,19 @@ func CreateVisibilityClient(user string) visibilityv1alpha1.VisibilityV1alpha1In
     return visibilityClient
 }

-func KueueReadyForTesting(ctx context.Context, client client.Client) {
+func KueueReadyForTesting(ctx context.Context, c client.Client) {
Contributor: Can we delete this function if we are waiting for the "webhooks setup before ready"?

Contributor (Author): No, this is running in the test binary.

Contributor: But shouldn't the test binary wait until Kueue is healthy/ready, rather than trying to periodically create the items? If we still need to periodically create items, I don't see much gain in delaying the healthy status.

Contributor (Author): It's just another way of checking the same thing. In theory you can check whether all the kueue-controller-manager pods are ready instead.

Contributor: I see, but this involves creating/deleting objects, something users may not want to do in prod. I think it is better if e2e tests correspond more closely to production use. Also, the creation/deletion takes more time, so it may cover some user issues. If we go with this approach, I would suggest waiting for the deployment to be ready, which can be done like this:

kubectl wait --for=condition=available --timeout=3m deployment/kueue-controller-manager -n kueue-system

     // To verify that webhooks are ready, let's create a simple resourceflavor
-    resourceKueue := utiltesting.MakeResourceFlavor("default").Obj()
-    gomega.Eventually(func() error {
-        return client.Create(context.Background(), resourceKueue)
-    }, StartUpTimeout, Interval).Should(gomega.Succeed())
-    ExpectResourceFlavorToBeDeleted(ctx, client, resourceKueue, true)
Contributor: Why change this?

Contributor (Author): The reporting is hard to follow with that; in the MultiKueue case you don't know which of the clusters is not ready.

resourceKueue := utiltesting.MakeResourceFlavor("e2e-prepare").Obj()
gomega.EventuallyWithOffset(1, func() error {
return c.Create(ctx, resourceKueue)
}, StartUpTimeout, Interval).Should(gomega.Succeed(), "Cannot create the flavor")

gomega.EventuallyWithOffset(1, func() error {
oldRf := &kueue.ResourceFlavor{}
err := c.Get(ctx, client.ObjectKeyFromObject(resourceKueue), oldRf)
if err != nil {
return err
}
return c.Delete(ctx, oldRf)
}, LongTimeout, Interval).Should(utiltesting.BeNotFoundError(), "Cannot delete the flavor")
}
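
For reference, the reviewer's suggested alternative, waiting for the kueue-controller-manager Deployment to report Available instead of creating and deleting a probe object, could look roughly like this in Go. This is a hedged sketch, not part of the PR; the helper name WaitForKueueAvailable and the polling parameters are illustrative assumptions:

package util

import (
    "context"
    "time"

    appsv1 "k8s.io/api/apps/v1"
    corev1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/util/wait"
    "sigs.k8s.io/controller-runtime/pkg/client"
)

// WaitForKueueAvailable polls until the kueue-controller-manager Deployment
// reports the Available condition, mirroring the reviewer's kubectl wait idea.
func WaitForKueueAvailable(ctx context.Context, c client.Client) error {
    key := client.ObjectKey{Namespace: "kueue-system", Name: "kueue-controller-manager"}
    return wait.PollUntilContextTimeout(ctx, 250*time.Millisecond, 4*time.Minute, true,
        func(ctx context.Context) (bool, error) {
            dep := &appsv1.Deployment{}
            if err := c.Get(ctx, key, dep); err != nil {
                return false, nil // tolerate transient errors while the manager comes up
            }
            for _, cond := range dep.Status.Conditions {
                if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue {
                    return true, nil
                }
            }
            return false, nil
        })
}

Note that this check is only as strong as the readiness probe behind it, which is exactly what the PR strengthens: before the change, the pod could report Ready while its webhooks were still not serving.
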