From 26f7164702f5759159f7e3e6439d1fb4002b0072 Mon Sep 17 00:00:00 2001 From: Chris Waldon Date: Wed, 10 Mar 2021 13:50:58 -0500 Subject: [PATCH 01/15] feat: provide a healthcheck that replies on cron job Signed-off-by: Chris Waldon --- .../cluster/healthchecks/healthcheckjob.go | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 pkg/common/cluster/healthchecks/healthcheckjob.go diff --git a/pkg/common/cluster/healthchecks/healthcheckjob.go b/pkg/common/cluster/healthchecks/healthcheckjob.go new file mode 100644 index 0000000000..4c327edc43 --- /dev/null +++ b/pkg/common/cluster/healthchecks/healthcheckjob.go @@ -0,0 +1,58 @@ +package healthchecks + +import ( + "context" + "fmt" + "log" + + "github.com/openshift/osde2e/pkg/common/logging" + batchv1 "k8s.io/api/batch/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/watch" + batchv1client "k8s.io/client-go/kubernetes/typed/batch/v1" + v1 "k8s.io/client-go/kubernetes/typed/core/v1" +) + +// CheckHealthcheckJob uses the `openshift-cluster-ready-*` healthcheck job to determine cluster readiness. +func CheckHealthcheckJob(k8sClient v1.CoreV1Interface, ctx context.Context, logger *log.Logger) (bool, error) { + logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger) + + logger.Print("Checking that all Nodes are running or completed...") + + bv1C := batchv1client.New(k8sClient.RESTClient()) + watcher, err := bv1C.Jobs("openshift-monitoring").Watch(ctx, metav1.ListOptions{ + Watch: true, + FieldSelector: "metadata.name=osd-cluster-ready", + }) + if err != nil { + if errors.IsNotFound(err) { + // Job doesn't exist yet + return false, nil + } + // Failed checking for job + return false, fmt.Errorf("failed looking up job: %w", err) + } + for { + select { + case event := <-watcher.ResultChan(): + switch event.Type { + case watch.Added: + fallthrough + case watch.Modified: + job := event.Object.(*batchv1.Job) + if job.Status.Succeeded > 0 { + return true, nil + } + case watch.Deleted: + return false, fmt.Errorf("cluster readiness job deleted before becoming ready") + case watch.Error: + return false, fmt.Errorf("watch returned error event: %v", event) + default: + logger.Printf("Unrecognized event type while watching for healthcheck job updates: %v", event.Type) + } + case <-ctx.Done(): + return false, fmt.Errorf("healtcheck watch context cancelled while still waiting for success") + } + } +} From fd7fb75620355ed349939fc89d317222dd64da3d Mon Sep 17 00:00:00 2001 From: Chris Waldon Date: Wed, 10 Mar 2021 15:48:00 -0500 Subject: [PATCH 02/15] wip: attempt to watch healthcheck job to determine cluster readiness This currently always fails, and I really have no idea why. Signed-off-by: Chris Waldon --- pkg/common/cluster/clusterutil.go | 53 ++++++++++--------- .../cluster/healthchecks/healthcheckjob.go | 10 ++-- pkg/e2e/e2e.go | 18 ++++++- 3 files changed, 49 insertions(+), 32 deletions(-) diff --git a/pkg/common/cluster/clusterutil.go b/pkg/common/cluster/clusterutil.go index e984060a14..57548d3021 100644 --- a/pkg/common/cluster/clusterutil.go +++ b/pkg/common/cluster/clusterutil.go @@ -207,41 +207,46 @@ func waitForClusterReadyWithOverrideAndExpectedNumberOfNodes(clusterID string, l return nil } -// PollClusterHealth looks at CVO data to determine if a cluster is alive/healthy or not -// param clusterID: If specified, Provider will be discovered through OCM. If the empty string, -// assume we are running in a cluster and use in-cluster REST config instead. -func PollClusterHealth(clusterID string, logger *log.Logger) (status bool, failures []string, err error) { - logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger) - - logger.Print("Polling Cluster Health...\n") - - var restConfig *rest.Config - var providerType string - +func ClusterConfig(clusterID string) (restConfig *rest.Config, providerType string, err error) { if clusterID == "" { if restConfig, err = rest.InClusterConfig(); err != nil { - logger.Printf("Error getting in-cluster REST config: %v\n", err) - return false, nil, nil + return nil, "", fmt.Errorf("error getting in-cluster rest config: %w", err) } // FIXME: Is there a way to discover this from within the cluster? // For now, ocm and rosa behave the same, so hardcode either. providerType = "ocm" + return - } else { - provider, err := providers.ClusterProvider() + } + provider, err := providers.ClusterProvider() - if err != nil { - return false, nil, fmt.Errorf("error getting cluster provisioning client: %v", err) - } + if err != nil { + return nil, "", fmt.Errorf("error getting cluster provisioning client: %w", err) + } + providerType = provider.Type() - restConfig, err = getRestConfig(provider, clusterID) - if err != nil { - logger.Printf("Error generating Rest Config: %v\n", err) - return false, nil, nil - } + restConfig, err = getRestConfig(provider, clusterID) + if err != nil { + + return nil, "", fmt.Errorf("error generating rest config: %w", err) + } + + return +} + +// PollClusterHealth looks at CVO data to determine if a cluster is alive/healthy or not +// param clusterID: If specified, Provider will be discovered through OCM. If the empty string, +// assume we are running in a cluster and use in-cluster REST config instead. +func PollClusterHealth(clusterID string, logger *log.Logger) (status bool, failures []string, err error) { + logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger) - providerType = provider.Type() + logger.Print("Polling Cluster Health...\n") + + restConfig, providerType, err := ClusterConfig(clusterID) + if err != nil { + logger.Printf("Error getting cluster config: %v\n", err) + return false, nil, nil } kubeClient, err := kubernetes.NewForConfig(restConfig) diff --git a/pkg/common/cluster/healthchecks/healthcheckjob.go b/pkg/common/cluster/healthchecks/healthcheckjob.go index 4c327edc43..4d127b4a52 100644 --- a/pkg/common/cluster/healthchecks/healthcheckjob.go +++ b/pkg/common/cluster/healthchecks/healthcheckjob.go @@ -7,7 +7,6 @@ import ( "github.com/openshift/osde2e/pkg/common/logging" batchv1 "k8s.io/api/batch/v1" - "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/watch" batchv1client "k8s.io/client-go/kubernetes/typed/batch/v1" @@ -22,14 +21,11 @@ func CheckHealthcheckJob(k8sClient v1.CoreV1Interface, ctx context.Context, logg bv1C := batchv1client.New(k8sClient.RESTClient()) watcher, err := bv1C.Jobs("openshift-monitoring").Watch(ctx, metav1.ListOptions{ - Watch: true, - FieldSelector: "metadata.name=osd-cluster-ready", + Watch: true, + FieldSelector: "metadata.name=osd-cluster-ready", + ResourceVersion: "0", }) if err != nil { - if errors.IsNotFound(err) { - // Job doesn't exist yet - return false, nil - } // Failed checking for job return false, fmt.Errorf("failed looking up job: %w", err) } diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go index 60da6a1dba..fbaa9ec80a 100644 --- a/pkg/e2e/e2e.go +++ b/pkg/e2e/e2e.go @@ -4,6 +4,7 @@ package e2e import ( "bytes" "compress/gzip" + "context" "encoding/json" "encoding/xml" "fmt" @@ -19,6 +20,7 @@ import ( "github.com/hpcloud/tail" junit "github.com/joshdk/go-junit" vegeta "github.com/tsenart/vegeta/lib" + "k8s.io/client-go/kubernetes" pd "github.com/PagerDuty/go-pagerduty" "github.com/onsi/ginkgo" @@ -32,6 +34,7 @@ import ( "github.com/openshift/osde2e/pkg/common/aws" "github.com/openshift/osde2e/pkg/common/cluster" clusterutil "github.com/openshift/osde2e/pkg/common/cluster" + "github.com/openshift/osde2e/pkg/common/cluster/healthchecks" "github.com/openshift/osde2e/pkg/common/clusterproperties" "github.com/openshift/osde2e/pkg/common/config" "github.com/openshift/osde2e/pkg/common/events" @@ -121,7 +124,20 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte { log.Printf("Error while adding upgrade version property to cluster via OCM: %v", err) } - err = clusterutil.WaitForClusterReady(cluster.ID(), nil) + clusterConfig, _, err := clusterutil.ClusterConfig(cluster.ID()) + if err != nil { + log.Printf("Failed looking up cluster config for healthcheck: %v", err) + } + kubeClient, err := kubernetes.NewForConfig(clusterConfig) + if err != nil { + log.Printf("Error generating Kube Clientset: %v\n", err) + } + + ctx, _ := context.WithTimeout(context.Background(), time.Hour*2) + ready, err := healthchecks.CheckHealthcheckJob(kubeClient.CoreV1(), ctx, nil) + if !ready && err == nil { + err = fmt.Errorf("Cluster not ready") + } events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed).ShouldNot(HaveOccurred(), "cluster failed health check") if err != nil { getLogs() From f8f10e6129d18f9a33196f7aa87ddae8f27a868f Mon Sep 17 00:00:00 2001 From: Chris Waldon Date: Thu, 11 Mar 2021 15:55:42 -0500 Subject: [PATCH 03/15] feat: fix watch usage and pass proper client Previously, everything was failing because I was passing the wrong instance of rest.Interface into the function (the one powering the CoreV1() api is not the same as the one for BatchV1()). That mistake caused the problems I was encountering in the past. I have also rewritten the usage of Watch() to properly provide a resourceversion so that we see all changes since the initial list operation. Signed-off-by: Chris Waldon --- .../cluster/healthchecks/healthcheckjob.go | 31 +++++++++++++------ pkg/e2e/e2e.go | 3 +- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/pkg/common/cluster/healthchecks/healthcheckjob.go b/pkg/common/cluster/healthchecks/healthcheckjob.go index 4d127b4a52..00e1e85dc9 100644 --- a/pkg/common/cluster/healthchecks/healthcheckjob.go +++ b/pkg/common/cluster/healthchecks/healthcheckjob.go @@ -9,25 +9,38 @@ import ( batchv1 "k8s.io/api/batch/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/watch" - batchv1client "k8s.io/client-go/kubernetes/typed/batch/v1" - v1 "k8s.io/client-go/kubernetes/typed/core/v1" + "k8s.io/client-go/kubernetes" ) // CheckHealthcheckJob uses the `openshift-cluster-ready-*` healthcheck job to determine cluster readiness. -func CheckHealthcheckJob(k8sClient v1.CoreV1Interface, ctx context.Context, logger *log.Logger) (bool, error) { +func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, logger *log.Logger) (bool, error) { logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger) logger.Print("Checking that all Nodes are running or completed...") - bv1C := batchv1client.New(k8sClient.RESTClient()) - watcher, err := bv1C.Jobs("openshift-monitoring").Watch(ctx, metav1.ListOptions{ - Watch: true, + bv1C := k8sClient.BatchV1() + namespace := "openshift-monitoring" + name := "osd-cluster-ready" + jobs, err := bv1C.Jobs(namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return false, fmt.Errorf("failed listing jobs: %w", err) + } + for _, job := range jobs.Items { + if job.Name != name { + continue + } + if job.Status.Succeeded > 0 { + log.Println("Healthcheck job has already succeeded") + return true, nil + } + log.Println("Healthcheck job has not yet succeeded, watching...") + } + watcher, err := bv1C.Jobs(namespace).Watch(ctx, metav1.ListOptions{ + ResourceVersion: jobs.ResourceVersion, FieldSelector: "metadata.name=osd-cluster-ready", - ResourceVersion: "0", }) if err != nil { - // Failed checking for job - return false, fmt.Errorf("failed looking up job: %w", err) + return false, fmt.Errorf("failed watching job: %w", err) } for { select { diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go index fbaa9ec80a..a472446c8f 100644 --- a/pkg/e2e/e2e.go +++ b/pkg/e2e/e2e.go @@ -132,9 +132,8 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte { if err != nil { log.Printf("Error generating Kube Clientset: %v\n", err) } - ctx, _ := context.WithTimeout(context.Background(), time.Hour*2) - ready, err := healthchecks.CheckHealthcheckJob(kubeClient.CoreV1(), ctx, nil) + ready, err := healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil) if !ready && err == nil { err = fmt.Errorf("Cluster not ready") } From 420e4f13f1945f1b9bd7b1414a2fa28b8c169deb Mon Sep 17 00:00:00 2001 From: Chris Waldon Date: Fri, 12 Mar 2021 09:28:42 -0500 Subject: [PATCH 04/15] feat: log when health check passes Signed-off-by: Chris Waldon --- pkg/e2e/e2e.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go index a472446c8f..d1e938ad64 100644 --- a/pkg/e2e/e2e.go +++ b/pkg/e2e/e2e.go @@ -137,6 +137,9 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte { if !ready && err == nil { err = fmt.Errorf("Cluster not ready") } + if ready { + log.Println("Cluster is healthy and ready for testing") + } events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed).ShouldNot(HaveOccurred(), "cluster failed health check") if err != nil { getLogs() From cd7b67e88191a432b75ff4c70aba34757d710f7d Mon Sep 17 00:00:00 2001 From: Chris Waldon Date: Fri, 12 Mar 2021 10:12:25 -0500 Subject: [PATCH 05/15] fix: ensure we cancel context to avoid leak Signed-off-by: Chris Waldon --- pkg/e2e/e2e.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go index d1e938ad64..82021c2068 100644 --- a/pkg/e2e/e2e.go +++ b/pkg/e2e/e2e.go @@ -132,7 +132,8 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte { if err != nil { log.Printf("Error generating Kube Clientset: %v\n", err) } - ctx, _ := context.WithTimeout(context.Background(), time.Hour*2) + ctx, cancel := context.WithTimeout(context.Background(), time.Hour*2) + defer cancel() ready, err := healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil) if !ready && err == nil { err = fmt.Errorf("Cluster not ready") From c6dc076c09b82e9bed6aad9b3b7f6a0a4179760a Mon Sep 17 00:00:00 2001 From: Chris Waldon Date: Mon, 15 Mar 2021 12:23:08 -0400 Subject: [PATCH 06/15] feat: warn that skip health checks config is a no-op Signed-off-by: Chris Waldon --- pkg/e2e/e2e.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go index 82021c2068..995afc742c 100644 --- a/pkg/e2e/e2e.go +++ b/pkg/e2e/e2e.go @@ -134,6 +134,9 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte { } ctx, cancel := context.WithTimeout(context.Background(), time.Hour*2) defer cancel() + if viper.GetString(config.Tests.SkipClusterHealthChecks) != "" { + log.Println("WARNING: Skipping cluster health checks is no longer supported, as they no longer introduce delay into the build. Ignoring your request to skip them.") + } ready, err := healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil) if !ready && err == nil { err = fmt.Errorf("Cluster not ready") From 2c1a44cadfa29902a60a5611b79ecc6188c8c330 Mon Sep 17 00:00:00 2001 From: Chris Waldon Date: Tue, 16 Mar 2021 14:39:20 -0400 Subject: [PATCH 07/15] fix: improve docs and log output as per code review Signed-off-by: Chris Waldon --- pkg/common/cluster/clusterutil.go | 4 ++++ pkg/common/cluster/healthchecks/healthcheckjob.go | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pkg/common/cluster/clusterutil.go b/pkg/common/cluster/clusterutil.go index 57548d3021..5be6245549 100644 --- a/pkg/common/cluster/clusterutil.go +++ b/pkg/common/cluster/clusterutil.go @@ -207,6 +207,10 @@ func waitForClusterReadyWithOverrideAndExpectedNumberOfNodes(clusterID string, l return nil } +// ClusterConfig returns the rest API config for a given cluster as well as the provider it +// inferred to discover the config. +// param clusterID: If specified, Provider will be discovered through OCM. If the empty string, +// assume we are running in a cluster and use in-cluster REST config instead. func ClusterConfig(clusterID string) (restConfig *rest.Config, providerType string, err error) { if clusterID == "" { if restConfig, err = rest.InClusterConfig(); err != nil { diff --git a/pkg/common/cluster/healthchecks/healthcheckjob.go b/pkg/common/cluster/healthchecks/healthcheckjob.go index 00e1e85dc9..f94d31d3b0 100644 --- a/pkg/common/cluster/healthchecks/healthcheckjob.go +++ b/pkg/common/cluster/healthchecks/healthcheckjob.go @@ -12,11 +12,11 @@ import ( "k8s.io/client-go/kubernetes" ) -// CheckHealthcheckJob uses the `openshift-cluster-ready-*` healthcheck job to determine cluster readiness. +// CheckHealthcheckJob uses the `osd-cluster-ready` healthcheck job to determine cluster readiness. func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, logger *log.Logger) (bool, error) { logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger) - logger.Print("Checking that all Nodes are running or completed...") + logger.Print("Checking whether cluster is healthy before proceeding...") bv1C := k8sClient.BatchV1() namespace := "openshift-monitoring" From 1325f9e9e3b0d4bb0a15034613beb7ab75ccb76e Mon Sep 17 00:00:00 2001 From: Chris Waldon Date: Mon, 22 Mar 2021 14:16:57 -0400 Subject: [PATCH 08/15] feat: update job monitoring logic to match new expected behavior Signed-off-by: Chris Waldon --- .../cluster/healthchecks/healthcheckjob.go | 22 +++++++++++-------- pkg/e2e/e2e.go | 9 ++------ 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/pkg/common/cluster/healthchecks/healthcheckjob.go b/pkg/common/cluster/healthchecks/healthcheckjob.go index f94d31d3b0..4ab007fd43 100644 --- a/pkg/common/cluster/healthchecks/healthcheckjob.go +++ b/pkg/common/cluster/healthchecks/healthcheckjob.go @@ -12,8 +12,9 @@ import ( "k8s.io/client-go/kubernetes" ) -// CheckHealthcheckJob uses the `osd-cluster-ready` healthcheck job to determine cluster readiness. -func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, logger *log.Logger) (bool, error) { +// CheckHealthcheckJob uses the `osd-cluster-ready` healthcheck job to determine cluster readiness. If the cluster +// is not ready, it will return an error. +func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, logger *log.Logger) error { logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger) logger.Print("Checking whether cluster is healthy before proceeding...") @@ -23,7 +24,7 @@ func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, l name := "osd-cluster-ready" jobs, err := bv1C.Jobs(namespace).List(ctx, metav1.ListOptions{}) if err != nil { - return false, fmt.Errorf("failed listing jobs: %w", err) + return fmt.Errorf("failed listing jobs: %w", err) } for _, job := range jobs.Items { if job.Name != name { @@ -31,7 +32,7 @@ func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, l } if job.Status.Succeeded > 0 { log.Println("Healthcheck job has already succeeded") - return true, nil + return nil } log.Println("Healthcheck job has not yet succeeded, watching...") } @@ -40,7 +41,7 @@ func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, l FieldSelector: "metadata.name=osd-cluster-ready", }) if err != nil { - return false, fmt.Errorf("failed watching job: %w", err) + return fmt.Errorf("failed watching job: %w", err) } for { select { @@ -51,17 +52,20 @@ func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, l case watch.Modified: job := event.Object.(*batchv1.Job) if job.Status.Succeeded > 0 { - return true, nil + return nil + } + if job.Status.Failed > 0 { + return fmt.Errorf("cluster readiness job failed") } case watch.Deleted: - return false, fmt.Errorf("cluster readiness job deleted before becoming ready") + return fmt.Errorf("cluster readiness job deleted before becoming ready (this should never happen)") case watch.Error: - return false, fmt.Errorf("watch returned error event: %v", event) + return fmt.Errorf("watch returned error event: %v", event) default: logger.Printf("Unrecognized event type while watching for healthcheck job updates: %v", event.Type) } case <-ctx.Done(): - return false, fmt.Errorf("healtcheck watch context cancelled while still waiting for success") + return fmt.Errorf("healtcheck watch context cancelled while still waiting for success") } } } diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go index 995afc742c..ae5b9d2c72 100644 --- a/pkg/e2e/e2e.go +++ b/pkg/e2e/e2e.go @@ -137,18 +137,13 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte { if viper.GetString(config.Tests.SkipClusterHealthChecks) != "" { log.Println("WARNING: Skipping cluster health checks is no longer supported, as they no longer introduce delay into the build. Ignoring your request to skip them.") } - ready, err := healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil) - if !ready && err == nil { - err = fmt.Errorf("Cluster not ready") - } - if ready { - log.Println("Cluster is healthy and ready for testing") - } + err = healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil) events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed).ShouldNot(HaveOccurred(), "cluster failed health check") if err != nil { getLogs() return []byte{} } + log.Println("Cluster is healthy and ready for testing") if len(viper.GetString(config.Addons.IDs)) > 0 { if viper.GetString(config.Provider) != "mock" { From 0d93eecfce899141c60b04869c819fbe7733db13 Mon Sep 17 00:00:00 2001 From: Chris Waldon Date: Mon, 29 Mar 2021 13:17:15 -0400 Subject: [PATCH 09/15] feat: add config knob for healthcheck timeout Signed-off-by: Chris Waldon --- pkg/common/config/config.go | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/pkg/common/config/config.go b/pkg/common/config/config.go index 1d393092c7..33e24d312f 100644 --- a/pkg/common/config/config.go +++ b/pkg/common/config/config.go @@ -175,6 +175,12 @@ var Tests = struct { // Env: SKIP_CLUSTER_HEALTH_CHECKS SkipClusterHealthChecks string + // ClusterHealthChecksTimeout defines the duration for which the harness will + // wait for the cluster to indicate it is healthy before cancelling the test + // run. This value should be formatted for use with time.ParseDuration. + // Env: CLUSTER_HEALTH_CHECKS_TIMEOUT + ClusterHealthChecksTimeout string + // MetricsBucket is the bucket that metrics data will be uploaded to. // Env: METRICS_BUCKET MetricsBucket string @@ -184,16 +190,17 @@ var Tests = struct { ServiceAccount string }{ - PollingTimeout: "tests.pollingTimeout", - GinkgoSkip: "tests.ginkgoSkip", - GinkgoFocus: "tests.focus", - TestsToRun: "tests.testsToRun", - SuppressSkipNotifications: "tests.suppressSkipNotifications", - CleanRuns: "tests.cleanRuns", - OperatorSkip: "tests.operatorSkip", - SkipClusterHealthChecks: "tests.skipClusterHealthChecks", - MetricsBucket: "tests.metricsBucket", - ServiceAccount: "tests.serviceAccount", + PollingTimeout: "tests.pollingTimeout", + GinkgoSkip: "tests.ginkgoSkip", + GinkgoFocus: "tests.focus", + TestsToRun: "tests.testsToRun", + SuppressSkipNotifications: "tests.suppressSkipNotifications", + CleanRuns: "tests.cleanRuns", + OperatorSkip: "tests.operatorSkip", + SkipClusterHealthChecks: "tests.skipClusterHealthChecks", + MetricsBucket: "tests.metricsBucket", + ServiceAccount: "tests.serviceAccount", + ClusterHealthChecksTimeout: "tests.clusterHealthChecksTimeout", } // Cluster config keys. @@ -516,6 +523,9 @@ func init() { viper.SetDefault(Tests.SkipClusterHealthChecks, false) viper.BindEnv(Tests.OperatorSkip, "SKIP_CLUSTER_HEALTH_CHECKS") + viper.SetDefault(Tests.ClusterHealthChecksTimeout, "2h") + viper.BindEnv(Tests.ClusterHealthChecksTimeout, "CLUSTER_HEALTH_CHECKS_TIMEOUT") + viper.SetDefault(Tests.MetricsBucket, "osde2e-metrics") viper.BindEnv(Tests.MetricsBucket, "METRICS_BUCKET") From 8fef0ba6d0d6cf6df322324b9f5390742f7062a2 Mon Sep 17 00:00:00 2001 From: Chris Waldon Date: Mon, 29 Mar 2021 13:17:50 -0400 Subject: [PATCH 10/15] feat: use healthcheck timeout knob in e2e suite Signed-off-by: Chris Waldon --- pkg/e2e/e2e.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go index ae5b9d2c72..2487c72bf8 100644 --- a/pkg/e2e/e2e.go +++ b/pkg/e2e/e2e.go @@ -132,7 +132,12 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte { if err != nil { log.Printf("Error generating Kube Clientset: %v\n", err) } - ctx, cancel := context.WithTimeout(context.Background(), time.Hour*2) + duration, err := time.ParseDuration(viper.GetString(config.Tests.ClusterHealthChecksTimeout)) + if err != nil { + log.Printf("Failed parsing health check timeout, using 2 hours: %v", err) + duration = time.Hour * 2 + } + ctx, cancel := context.WithTimeout(context.Background(), duration) defer cancel() if viper.GetString(config.Tests.SkipClusterHealthChecks) != "" { log.Println("WARNING: Skipping cluster health checks is no longer supported, as they no longer introduce delay into the build. Ignoring your request to skip them.") From 359eae4b56ffee2c2c3bacd795ab3eb75d8ec441 Mon Sep 17 00:00:00 2001 From: Chris Waldon Date: Mon, 29 Mar 2021 13:18:08 -0400 Subject: [PATCH 11/15] fix: only print usage warning when needed Signed-off-by: Chris Waldon --- pkg/e2e/e2e.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go index 2487c72bf8..f6dd13b534 100644 --- a/pkg/e2e/e2e.go +++ b/pkg/e2e/e2e.go @@ -139,7 +139,7 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte { } ctx, cancel := context.WithTimeout(context.Background(), duration) defer cancel() - if viper.GetString(config.Tests.SkipClusterHealthChecks) != "" { + if viper.GetString(config.Tests.SkipClusterHealthChecks) != "false" { log.Println("WARNING: Skipping cluster health checks is no longer supported, as they no longer introduce delay into the build. Ignoring your request to skip them.") } err = healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil) From 6de49708d92896a497ff34eb78d408cf325e028d Mon Sep 17 00:00:00 2001 From: Chris Waldon Date: Tue, 30 Mar 2021 13:24:50 -0400 Subject: [PATCH 12/15] feat: fail beforesuite if duration parse fails Signed-off-by: Chris Waldon --- pkg/e2e/e2e.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go index febf530326..ce13863438 100644 --- a/pkg/e2e/e2e.go +++ b/pkg/e2e/e2e.go @@ -134,8 +134,8 @@ func beforeSuite() bool { } duration, err := time.ParseDuration(viper.GetString(config.Tests.ClusterHealthChecksTimeout)) if err != nil { - log.Printf("Failed parsing health check timeout, using 2 hours: %v", err) - duration = time.Hour * 2 + log.Printf("Failed parsing health check timeout: %v", err) + return false } ctx, cancel := context.WithTimeout(context.Background(), duration) defer cancel() From 7dda5b8559cd3ce5a353195969d73d411a1a211c Mon Sep 17 00:00:00 2001 From: Chris Waldon Date: Tue, 30 Mar 2021 13:36:24 -0400 Subject: [PATCH 13/15] feat: repurpose old skip healthcheck flag to skip new ones Signed-off-by: Chris Waldon --- pkg/e2e/e2e.go | 50 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go index ce13863438..dae16b6306 100644 --- a/pkg/e2e/e2e.go +++ b/pkg/e2e/e2e.go @@ -124,33 +124,31 @@ func beforeSuite() bool { log.Printf("Error while adding upgrade version property to cluster via OCM: %v", err) } - clusterConfig, _, err := clusterutil.ClusterConfig(cluster.ID()) - if err != nil { - log.Printf("Failed looking up cluster config for healthcheck: %v", err) - } - kubeClient, err := kubernetes.NewForConfig(clusterConfig) - if err != nil { - log.Printf("Error generating Kube Clientset: %v\n", err) - } - duration, err := time.ParseDuration(viper.GetString(config.Tests.ClusterHealthChecksTimeout)) - if err != nil { - log.Printf("Failed parsing health check timeout: %v", err) - return false - } - ctx, cancel := context.WithTimeout(context.Background(), duration) - defer cancel() - if viper.GetString(config.Tests.SkipClusterHealthChecks) != "false" { - log.Println("WARNING: Skipping cluster health checks is no longer supported, as they no longer introduce delay into the build. Ignoring your request to skip them.") - } - err = healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil) - events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed) - if err != nil { - log.Printf("Cluster failed health check: %v", err) - getLogs() - return false + if viper.GetString(config.Tests.SkipClusterHealthChecks) != "true" { + clusterConfig, _, err := clusterutil.ClusterConfig(cluster.ID()) + if err != nil { + log.Printf("Failed looking up cluster config for healthcheck: %v", err) + } + kubeClient, err := kubernetes.NewForConfig(clusterConfig) + if err != nil { + log.Printf("Error generating Kube Clientset: %v\n", err) + } + duration, err := time.ParseDuration(viper.GetString(config.Tests.ClusterHealthChecksTimeout)) + if err != nil { + log.Printf("Failed parsing health check timeout: %v", err) + return false + } + ctx, cancel := context.WithTimeout(context.Background(), duration) + defer cancel() + err = healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil) + events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed) + if err != nil { + log.Printf("Cluster failed health check: %v", err) + getLogs() + return false + } + log.Println("Cluster is healthy and ready for testing") } - log.Println("Cluster is healthy and ready for testing") - if len(viper.GetString(config.Addons.IDs)) > 0 { if viper.GetString(config.Provider) != "mock" { err = installAddons() From 71de62455016f08bb17577f863f388afcdf39f98 Mon Sep 17 00:00:00 2001 From: Chris Waldon Date: Tue, 30 Mar 2021 13:40:43 -0400 Subject: [PATCH 14/15] feat: log when skipping health checks Signed-off-by: Chris Waldon --- pkg/e2e/e2e.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go index dae16b6306..72eb15cf78 100644 --- a/pkg/e2e/e2e.go +++ b/pkg/e2e/e2e.go @@ -148,6 +148,8 @@ func beforeSuite() bool { return false } log.Println("Cluster is healthy and ready for testing") + } else { + log.Println("Skipping health checks as requested") } if len(viper.GetString(config.Addons.IDs)) > 0 { if viper.GetString(config.Provider) != "mock" { From e0e01435f9ca6870fbc049f31bbd04515ba43862 Mon Sep 17 00:00:00 2001 From: Chris Waldon Date: Tue, 30 Mar 2021 13:45:00 -0400 Subject: [PATCH 15/15] fix: actually fail healthcheck if any part fails Signed-off-by: Chris Waldon --- pkg/e2e/e2e.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go index 72eb15cf78..f6dced6da2 100644 --- a/pkg/e2e/e2e.go +++ b/pkg/e2e/e2e.go @@ -128,10 +128,12 @@ func beforeSuite() bool { clusterConfig, _, err := clusterutil.ClusterConfig(cluster.ID()) if err != nil { log.Printf("Failed looking up cluster config for healthcheck: %v", err) + return false } kubeClient, err := kubernetes.NewForConfig(clusterConfig) if err != nil { log.Printf("Error generating Kube Clientset: %v\n", err) + return false } duration, err := time.ParseDuration(viper.GetString(config.Tests.ClusterHealthChecksTimeout)) if err != nil {