Use healthcheck cron job to determine cluster readiness #743
@@ -0,0 +1,71 @@ (new file, package healthchecks)

package healthchecks

import (
	"context"
	"fmt"
	"log"

	"github.com/openshift/osde2e/pkg/common/logging"
	batchv1 "k8s.io/api/batch/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/kubernetes"
)

// CheckHealthcheckJob uses the `osd-cluster-ready` healthcheck job to determine cluster readiness. If the cluster
// is not ready, it will return an error.
func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, logger *log.Logger) error {
	logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)

	logger.Print("Checking whether cluster is healthy before proceeding...")

	bv1C := k8sClient.BatchV1()
	namespace := "openshift-monitoring"
	name := "osd-cluster-ready"
	jobs, err := bv1C.Jobs(namespace).List(ctx, metav1.ListOptions{})
	if err != nil {
		return fmt.Errorf("failed listing jobs: %w", err)
	}
	for _, job := range jobs.Items {
		if job.Name != name {
			continue
		}
		if job.Status.Succeeded > 0 {
			logger.Println("Healthcheck job has already succeeded")
			return nil
		}
		logger.Println("Healthcheck job has not yet succeeded, watching...")
	}
	watcher, err := bv1C.Jobs(namespace).Watch(ctx, metav1.ListOptions{
		ResourceVersion: jobs.ResourceVersion,
		FieldSelector:   "metadata.name=osd-cluster-ready",
	})
	if err != nil {
		return fmt.Errorf("failed watching job: %w", err)
	}
	for {
		select {
		case event := <-watcher.ResultChan():
			switch event.Type {
			case watch.Added:
				fallthrough
			case watch.Modified:
				job := event.Object.(*batchv1.Job)
				if job.Status.Succeeded > 0 {
					return nil
				}
				if job.Status.Failed > 0 {
					return fmt.Errorf("cluster readiness job failed")
				}
			case watch.Deleted:
				return fmt.Errorf("cluster readiness job deleted before becoming ready (this should never happen)")
			case watch.Error:
				return fmt.Errorf("watch returned error event: %v", event)
			default:
				logger.Printf("Unrecognized event type while watching for healthcheck job updates: %v", event.Type)
			}
		case <-ctx.Done():
			return fmt.Errorf("healthcheck watch context cancelled while still waiting for success")
		}
	}
}

Review thread attached inside the watch loop, at the success/failure handling:

Reviewer: I think we need to handle "completed but failed" as well. ...But how we handle it is different before vs after openshift/configure-alertmanager-operator#143. Need to think through this some more.

Author: I'll wait to hear your updated thoughts on how this should work before altering this logic.

Reviewer: Talked this through with @jharrington22 just now. We're going to rework the readiness side so the picture is clear. Specific to this case, I think for osde2e's purposes "completed but failed" will translate to "cluster not ready, ain't ever gonna be". But I'm also pretty sure we're going to use prometheus to carry that state... which would mean substantial changes in this PR. Stay tuned -- I hope to have something written up by tomorrow.

Reviewer: Keep an eye on OSD-6646 starting here.

Reviewer: "Completed but failed" should be treated as "cluster not ready, ain't ever gonna be". I'll post more info in a top-level comment.
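The thread leaves "completed but failed" open. If the check stays in this package (rather than moving the state to Prometheus, as the reviewer floats), one way to detect a terminal failure is to inspect the Job's conditions instead of only Status.Failed. This is a minimal sketch, not part of the PR, and jobTerminallyFailed is a hypothetical helper name:

package healthchecks

import (
	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
)

// jobTerminallyFailed reports whether the Job has finished and will not be
// retried, i.e. it carries a JobFailed condition with status True.
// Status.Failed > 0 on its own can also mean a single pod failed while the
// Job controller still has retries left under its backoffLimit.
func jobTerminallyFailed(job *batchv1.Job) bool {
	for _, cond := range job.Status.Conditions {
		if cond.Type == batchv1.JobFailed && cond.Status == corev1.ConditionTrue {
			return true
		}
	}
	return false
}

In the watch loop, a check like this could gate the "cluster readiness job failed" error so it fires only once the Job controller has given up retrying.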
Changes to the e2e suite (package e2e):

@@ -4,6 +4,7 @@ package e2e
import (
	"bytes"
	"compress/gzip"
	"context"
	"encoding/json"
	"encoding/xml"
	"fmt"

@@ -19,6 +20,7 @@ import (
	"github.com/hpcloud/tail"
	junit "github.com/joshdk/go-junit"
	vegeta "github.com/tsenart/vegeta/lib"
	"k8s.io/client-go/kubernetes"

	pd "github.com/PagerDuty/go-pagerduty"
	"github.com/onsi/ginkgo"

@@ -31,6 +33,7 @@ import (
	"github.com/openshift/osde2e/pkg/common/aws"
	"github.com/openshift/osde2e/pkg/common/cluster"
	clusterutil "github.com/openshift/osde2e/pkg/common/cluster"
	"github.com/openshift/osde2e/pkg/common/cluster/healthchecks"
	"github.com/openshift/osde2e/pkg/common/clusterproperties"
	"github.com/openshift/osde2e/pkg/common/config"
	"github.com/openshift/osde2e/pkg/common/events"
@@ -121,13 +124,32 @@ func beforeSuite() bool {
		log.Printf("Error while adding upgrade version property to cluster via OCM: %v", err)
	}

	err = clusterutil.WaitForClusterReady(cluster.ID(), nil) // removed: replaced by the healthcheck-job flow below
	clusterConfig, _, err := clusterutil.ClusterConfig(cluster.ID())
	if err != nil {
		log.Printf("Failed looking up cluster config for healthcheck: %v", err)
	}
	kubeClient, err := kubernetes.NewForConfig(clusterConfig)
	if err != nil {
		log.Printf("Error generating Kube Clientset: %v\n", err)
	}
	duration, err := time.ParseDuration(viper.GetString(config.Tests.ClusterHealthChecksTimeout))
	if err != nil {
		log.Printf("Failed parsing health check timeout: %v", err)
		return false
	}
	ctx, cancel := context.WithTimeout(context.Background(), duration)
	defer cancel()
	if viper.GetString(config.Tests.SkipClusterHealthChecks) != "false" {
		log.Println("WARNING: Skipping cluster health checks is no longer supported, as they no longer introduce delay into the build. Ignoring your request to skip them.")
	}
	err = healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil)
	events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed)
	if err != nil {
		log.Printf("Cluster failed health check: %v", err)
		getLogs()
		return false
	}
	log.Println("Cluster is healthy and ready for testing")

	if len(viper.GetString(config.Addons.IDs)) > 0 {
		if viper.GetString(config.Provider) != "mock" {

Review thread attached at the "return false" after the ParseDuration error:

Reviewer: Cool. Latent, but it looks like the other two conditions above might be missing a "return false" as well.

Author: Yup, I am dumb. I had tunnel vision while making this change.
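For completeness, a sketch of the fix that thread points at, assuming the two earlier error branches in beforeSuite should bail out the same way (a fragment of the diff above, with the added returns marked; not the merged code):

	clusterConfig, _, err := clusterutil.ClusterConfig(cluster.ID())
	if err != nil {
		log.Printf("Failed looking up cluster config for healthcheck: %v", err)
		return false // previously only logged
	}
	kubeClient, err := kubernetes.NewForConfig(clusterConfig)
	if err != nil {
		log.Printf("Error generating Kube Clientset: %v", err)
		return false // previously only logged
	}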
Review thread on the initial Jobs List call:

Reviewer: I'm not super familiar with client-go, but why doesn't bv1C.Jobs(namespace).Get(ctx, name, metav1.GetOptions{}) work here?

Author: Sigh... It does. I just did it a dumb way because I too am not very familiar with client-go.

Author: Wait! On second thought, it does need to be a list. I need the resourceVersion field of the list itself in order to initiate the watch later. This allows me to detect the creation and deletion of jobs matching my criteria since I performed the list.

Reviewer: Ah, I understand. If you use a zero resourceVersion, do you even need the initial get/list?
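On that last question: for watch requests, an unset (or "0") resourceVersion asks the API server to establish initial state by sending synthetic Added events for objects that already match the selector before streaming subsequent changes, so the existing job would arrive through the watch itself and the initial List could likely be dropped. A minimal sketch of that variant, offered as an assumption about how the function could be restructured rather than as what this PR does; watchHealthcheckJob is a hypothetical name:

package healthchecks

import (
	"context"
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/kubernetes"
)

// watchHealthcheckJob waits for the osd-cluster-ready job without an initial
// List: the watch's synthetic Added event reports the job's current state.
func watchHealthcheckJob(ctx context.Context, k8sClient kubernetes.Interface) error {
	watcher, err := k8sClient.BatchV1().Jobs("openshift-monitoring").Watch(ctx, metav1.ListOptions{
		// ResourceVersion left unset: the server establishes initial state with
		// synthetic Added events for matching objects, then streams changes.
		FieldSelector: "metadata.name=osd-cluster-ready",
	})
	if err != nil {
		return fmt.Errorf("failed watching job: %w", err)
	}
	defer watcher.Stop()
	for {
		select {
		case event, ok := <-watcher.ResultChan():
			if !ok {
				return fmt.Errorf("watch channel closed before the job succeeded")
			}
			switch event.Type {
			case watch.Added, watch.Modified:
				job := event.Object.(*batchv1.Job)
				if job.Status.Succeeded > 0 {
					return nil
				}
				if job.Status.Failed > 0 {
					return fmt.Errorf("cluster readiness job failed")
				}
			case watch.Error:
				return fmt.Errorf("watch returned error event: %v", event)
			}
		case <-ctx.Done():
			return fmt.Errorf("healthcheck watch context cancelled while still waiting for success")
		}
	}
}

The trade-off is that a watch alone cannot distinguish "job not created yet" from "job exists but has not succeeded", which the initial List in the PR makes explicit.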