From 26f7164702f5759159f7e3e6439d1fb4002b0072 Mon Sep 17 00:00:00 2001
From: Chris Waldon <cwaldon@redhat.com>
Date: Wed, 10 Mar 2021 13:50:58 -0500
Subject: [PATCH 01/15] feat: provide a healthcheck that replies on cron job

Signed-off-by: Chris Waldon <cwaldon@redhat.com>
---
 .../cluster/healthchecks/healthcheckjob.go    | 58 +++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 pkg/common/cluster/healthchecks/healthcheckjob.go

diff --git a/pkg/common/cluster/healthchecks/healthcheckjob.go b/pkg/common/cluster/healthchecks/healthcheckjob.go
new file mode 100644
index 0000000000..4c327edc43
--- /dev/null
+++ b/pkg/common/cluster/healthchecks/healthcheckjob.go
@@ -0,0 +1,58 @@
+package healthchecks
+
+import (
+	"context"
+	"fmt"
+	"log"
+
+	"github.com/openshift/osde2e/pkg/common/logging"
+	batchv1 "k8s.io/api/batch/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/watch"
+	batchv1client "k8s.io/client-go/kubernetes/typed/batch/v1"
+	v1 "k8s.io/client-go/kubernetes/typed/core/v1"
+)
+
+// CheckHealthcheckJob uses the `openshift-cluster-ready-*` healthcheck job to determine cluster readiness.
+func CheckHealthcheckJob(k8sClient v1.CoreV1Interface, ctx context.Context, logger *log.Logger) (bool, error) {
+	logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
+
+	logger.Print("Checking that all Nodes are running or completed...")
+
+	bv1C := batchv1client.New(k8sClient.RESTClient())
+	watcher, err := bv1C.Jobs("openshift-monitoring").Watch(ctx, metav1.ListOptions{
+		Watch:         true,
+		FieldSelector: "metadata.name=osd-cluster-ready",
+	})
+	if err != nil {
+		if errors.IsNotFound(err) {
+			// Job doesn't exist yet
+			return false, nil
+		}
+		// Failed checking for job
+		return false, fmt.Errorf("failed looking up job: %w", err)
+	}
+	for {
+		select {
+		case event := <-watcher.ResultChan():
+			switch event.Type {
+			case watch.Added:
+				fallthrough
+			case watch.Modified:
+				job := event.Object.(*batchv1.Job)
+				if job.Status.Succeeded > 0 {
+					return true, nil
+				}
+			case watch.Deleted:
+				return false, fmt.Errorf("cluster readiness job deleted before becoming ready")
+			case watch.Error:
+				return false, fmt.Errorf("watch returned error event: %v", event)
+			default:
+				logger.Printf("Unrecognized event type while watching for healthcheck job updates: %v", event.Type)
+			}
+		case <-ctx.Done():
+			return false, fmt.Errorf("healtcheck watch context cancelled while still waiting for success")
+		}
+	}
+}

From fd7fb75620355ed349939fc89d317222dd64da3d Mon Sep 17 00:00:00 2001
From: Chris Waldon <cwaldon@redhat.com>
Date: Wed, 10 Mar 2021 15:48:00 -0500
Subject: [PATCH 02/15] wip: attempt to watch healthcheck job to determine
 cluster readiness

This currently always fails, and I really have no idea why.

Signed-off-by: Chris Waldon <cwaldon@redhat.com>
---
 pkg/common/cluster/clusterutil.go             | 53 ++++++++++---------
 .../cluster/healthchecks/healthcheckjob.go    | 10 ++--
 pkg/e2e/e2e.go                                | 18 ++++++-
 3 files changed, 49 insertions(+), 32 deletions(-)

diff --git a/pkg/common/cluster/clusterutil.go b/pkg/common/cluster/clusterutil.go
index e984060a14..57548d3021 100644
--- a/pkg/common/cluster/clusterutil.go
+++ b/pkg/common/cluster/clusterutil.go
@@ -207,41 +207,46 @@ func waitForClusterReadyWithOverrideAndExpectedNumberOfNodes(clusterID string, l
 	return nil
 }
 
-// PollClusterHealth looks at CVO data to determine if a cluster is alive/healthy or not
-// param clusterID: If specified, Provider will be discovered through OCM. If the empty string,
-// 		assume we are running in a cluster and use in-cluster REST config instead.
-func PollClusterHealth(clusterID string, logger *log.Logger) (status bool, failures []string, err error) {
-	logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
-
-	logger.Print("Polling Cluster Health...\n")
-
-	var restConfig *rest.Config
-	var providerType string
-
+func ClusterConfig(clusterID string) (restConfig *rest.Config, providerType string, err error) {
 	if clusterID == "" {
 		if restConfig, err = rest.InClusterConfig(); err != nil {
-			logger.Printf("Error getting in-cluster REST config: %v\n", err)
-			return false, nil, nil
+			return nil, "", fmt.Errorf("error getting in-cluster rest config: %w", err)
 		}
 
 		// FIXME: Is there a way to discover this from within the cluster?
 		// For now, ocm and rosa behave the same, so hardcode either.
 		providerType = "ocm"
+		return
 
-	} else {
-		provider, err := providers.ClusterProvider()
+	}
+	provider, err := providers.ClusterProvider()
 
-		if err != nil {
-			return false, nil, fmt.Errorf("error getting cluster provisioning client: %v", err)
-		}
+	if err != nil {
+		return nil, "", fmt.Errorf("error getting cluster provisioning client: %w", err)
+	}
+	providerType = provider.Type()
 
-		restConfig, err = getRestConfig(provider, clusterID)
-		if err != nil {
-			logger.Printf("Error generating Rest Config: %v\n", err)
-			return false, nil, nil
-		}
+	restConfig, err = getRestConfig(provider, clusterID)
+	if err != nil {
+
+		return nil, "", fmt.Errorf("error generating rest config: %w", err)
+	}
+
+	return
+}
+
+// PollClusterHealth looks at CVO data to determine if a cluster is alive/healthy or not
+// param clusterID: If specified, Provider will be discovered through OCM. If the empty string,
+// 		assume we are running in a cluster and use in-cluster REST config instead.
+func PollClusterHealth(clusterID string, logger *log.Logger) (status bool, failures []string, err error) {
+	logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
 
-		providerType = provider.Type()
+	logger.Print("Polling Cluster Health...\n")
+
+	restConfig, providerType, err := ClusterConfig(clusterID)
+	if err != nil {
+		logger.Printf("Error getting cluster config: %v\n", err)
+		return false, nil, nil
 	}
 
 	kubeClient, err := kubernetes.NewForConfig(restConfig)
diff --git a/pkg/common/cluster/healthchecks/healthcheckjob.go b/pkg/common/cluster/healthchecks/healthcheckjob.go
index 4c327edc43..4d127b4a52 100644
--- a/pkg/common/cluster/healthchecks/healthcheckjob.go
+++ b/pkg/common/cluster/healthchecks/healthcheckjob.go
@@ -7,7 +7,6 @@ import (
 
 	"github.com/openshift/osde2e/pkg/common/logging"
 	batchv1 "k8s.io/api/batch/v1"
-	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/watch"
 	batchv1client "k8s.io/client-go/kubernetes/typed/batch/v1"
@@ -22,14 +21,11 @@ func CheckHealthcheckJob(k8sClient v1.CoreV1Interface, ctx context.Context, logg
 
 	bv1C := batchv1client.New(k8sClient.RESTClient())
 	watcher, err := bv1C.Jobs("openshift-monitoring").Watch(ctx, metav1.ListOptions{
-		Watch:         true,
-		FieldSelector: "metadata.name=osd-cluster-ready",
+		Watch:           true,
+		FieldSelector:   "metadata.name=osd-cluster-ready",
+		ResourceVersion: "0",
 	})
 	if err != nil {
-		if errors.IsNotFound(err) {
-			// Job doesn't exist yet
-			return false, nil
-		}
 		// Failed checking for job
 		return false, fmt.Errorf("failed looking up job: %w", err)
 	}
diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go
index 60da6a1dba..fbaa9ec80a 100644
--- a/pkg/e2e/e2e.go
+++ b/pkg/e2e/e2e.go
@@ -4,6 +4,7 @@ package e2e
 import (
 	"bytes"
 	"compress/gzip"
+	"context"
 	"encoding/json"
 	"encoding/xml"
 	"fmt"
@@ -19,6 +20,7 @@ import (
 	"github.com/hpcloud/tail"
 	junit "github.com/joshdk/go-junit"
 	vegeta "github.com/tsenart/vegeta/lib"
+	"k8s.io/client-go/kubernetes"
 
 	pd "github.com/PagerDuty/go-pagerduty"
 	"github.com/onsi/ginkgo"
@@ -32,6 +34,7 @@ import (
 	"github.com/openshift/osde2e/pkg/common/aws"
 	"github.com/openshift/osde2e/pkg/common/cluster"
 	clusterutil "github.com/openshift/osde2e/pkg/common/cluster"
+	"github.com/openshift/osde2e/pkg/common/cluster/healthchecks"
 	"github.com/openshift/osde2e/pkg/common/clusterproperties"
 	"github.com/openshift/osde2e/pkg/common/config"
 	"github.com/openshift/osde2e/pkg/common/events"
@@ -121,7 +124,20 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
 			log.Printf("Error while adding upgrade version property to cluster via OCM: %v", err)
 		}
 
-		err = clusterutil.WaitForClusterReady(cluster.ID(), nil)
+		clusterConfig, _, err := clusterutil.ClusterConfig(cluster.ID())
+		if err != nil {
+			log.Printf("Failed looking up cluster config for healthcheck: %v", err)
+		}
+		kubeClient, err := kubernetes.NewForConfig(clusterConfig)
+		if err != nil {
+			log.Printf("Error generating Kube Clientset: %v\n", err)
+		}
+
+		ctx, _ := context.WithTimeout(context.Background(), time.Hour*2)
+		ready, err := healthchecks.CheckHealthcheckJob(kubeClient.CoreV1(), ctx, nil)
+		if !ready && err == nil {
+			err = fmt.Errorf("Cluster not ready")
+		}
 		events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed).ShouldNot(HaveOccurred(), "cluster failed health check")
 		if err != nil {
 			getLogs()

From f8f10e6129d18f9a33196f7aa87ddae8f27a868f Mon Sep 17 00:00:00 2001
From: Chris Waldon <cwaldon@redhat.com>
Date: Thu, 11 Mar 2021 15:55:42 -0500
Subject: [PATCH 03/15] feat: fix watch usage and pass proper client

Previously, everything was failing because I was passing the wrong
instance of rest.Interface into the function (the one powering
the CoreV1() api is not the same as the one for BatchV1()).
That mistake caused the problems I was encountering in the past.

I have also rewritten the usage of Watch() to properly provide
a resourceversion so that we see all changes since the initial
list operation.

Signed-off-by: Chris Waldon <cwaldon@redhat.com>
---
 .../cluster/healthchecks/healthcheckjob.go    | 31 +++++++++++++------
 pkg/e2e/e2e.go                                |  3 +-
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/pkg/common/cluster/healthchecks/healthcheckjob.go b/pkg/common/cluster/healthchecks/healthcheckjob.go
index 4d127b4a52..00e1e85dc9 100644
--- a/pkg/common/cluster/healthchecks/healthcheckjob.go
+++ b/pkg/common/cluster/healthchecks/healthcheckjob.go
@@ -9,25 +9,38 @@ import (
 	batchv1 "k8s.io/api/batch/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/watch"
-	batchv1client "k8s.io/client-go/kubernetes/typed/batch/v1"
-	v1 "k8s.io/client-go/kubernetes/typed/core/v1"
+	"k8s.io/client-go/kubernetes"
 )
 
 // CheckHealthcheckJob uses the `openshift-cluster-ready-*` healthcheck job to determine cluster readiness.
-func CheckHealthcheckJob(k8sClient v1.CoreV1Interface, ctx context.Context, logger *log.Logger) (bool, error) {
+func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, logger *log.Logger) (bool, error) {
 	logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
 
 	logger.Print("Checking that all Nodes are running or completed...")
 
-	bv1C := batchv1client.New(k8sClient.RESTClient())
-	watcher, err := bv1C.Jobs("openshift-monitoring").Watch(ctx, metav1.ListOptions{
-		Watch:           true,
+	bv1C := k8sClient.BatchV1()
+	namespace := "openshift-monitoring"
+	name := "osd-cluster-ready"
+	jobs, err := bv1C.Jobs(namespace).List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return false, fmt.Errorf("failed listing jobs: %w", err)
+	}
+	for _, job := range jobs.Items {
+		if job.Name != name {
+			continue
+		}
+		if job.Status.Succeeded > 0 {
+			log.Println("Healthcheck job has already succeeded")
+			return true, nil
+		}
+		log.Println("Healthcheck job has not yet succeeded, watching...")
+	}
+	watcher, err := bv1C.Jobs(namespace).Watch(ctx, metav1.ListOptions{
+		ResourceVersion: jobs.ResourceVersion,
 		FieldSelector:   "metadata.name=osd-cluster-ready",
-		ResourceVersion: "0",
 	})
 	if err != nil {
-		// Failed checking for job
-		return false, fmt.Errorf("failed looking up job: %w", err)
+		return false, fmt.Errorf("failed watching job: %w", err)
 	}
 	for {
 		select {
diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go
index fbaa9ec80a..a472446c8f 100644
--- a/pkg/e2e/e2e.go
+++ b/pkg/e2e/e2e.go
@@ -132,9 +132,8 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
 		if err != nil {
 			log.Printf("Error generating Kube Clientset: %v\n", err)
 		}
-
 		ctx, _ := context.WithTimeout(context.Background(), time.Hour*2)
-		ready, err := healthchecks.CheckHealthcheckJob(kubeClient.CoreV1(), ctx, nil)
+		ready, err := healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil)
 		if !ready && err == nil {
 			err = fmt.Errorf("Cluster not ready")
 		}

From 420e4f13f1945f1b9bd7b1414a2fa28b8c169deb Mon Sep 17 00:00:00 2001
From: Chris Waldon <cwaldon@redhat.com>
Date: Fri, 12 Mar 2021 09:28:42 -0500
Subject: [PATCH 04/15] feat: log when health check passes

Signed-off-by: Chris Waldon <cwaldon@redhat.com>
---
 pkg/e2e/e2e.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go
index a472446c8f..d1e938ad64 100644
--- a/pkg/e2e/e2e.go
+++ b/pkg/e2e/e2e.go
@@ -137,6 +137,9 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
 		if !ready && err == nil {
 			err = fmt.Errorf("Cluster not ready")
 		}
+		if ready {
+			log.Println("Cluster is healthy and ready for testing")
+		}
 		events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed).ShouldNot(HaveOccurred(), "cluster failed health check")
 		if err != nil {
 			getLogs()

From cd7b67e88191a432b75ff4c70aba34757d710f7d Mon Sep 17 00:00:00 2001
From: Chris Waldon <cwaldon@redhat.com>
Date: Fri, 12 Mar 2021 10:12:25 -0500
Subject: [PATCH 05/15] fix: ensure we cancel context to avoid leak

Signed-off-by: Chris Waldon <cwaldon@redhat.com>
---
 pkg/e2e/e2e.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go
index d1e938ad64..82021c2068 100644
--- a/pkg/e2e/e2e.go
+++ b/pkg/e2e/e2e.go
@@ -132,7 +132,8 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
 		if err != nil {
 			log.Printf("Error generating Kube Clientset: %v\n", err)
 		}
-		ctx, _ := context.WithTimeout(context.Background(), time.Hour*2)
+		ctx, cancel := context.WithTimeout(context.Background(), time.Hour*2)
+		defer cancel()
 		ready, err := healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil)
 		if !ready && err == nil {
 			err = fmt.Errorf("Cluster not ready")

From c6dc076c09b82e9bed6aad9b3b7f6a0a4179760a Mon Sep 17 00:00:00 2001
From: Chris Waldon <cwaldon@redhat.com>
Date: Mon, 15 Mar 2021 12:23:08 -0400
Subject: [PATCH 06/15] feat: warn that skip health checks config is a no-op

Signed-off-by: Chris Waldon <cwaldon@redhat.com>
---
 pkg/e2e/e2e.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go
index 82021c2068..995afc742c 100644
--- a/pkg/e2e/e2e.go
+++ b/pkg/e2e/e2e.go
@@ -134,6 +134,9 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
 		}
 		ctx, cancel := context.WithTimeout(context.Background(), time.Hour*2)
 		defer cancel()
+		if viper.GetString(config.Tests.SkipClusterHealthChecks) != "" {
+			log.Println("WARNING: Skipping cluster health checks is no longer supported, as they no longer introduce delay into the build. Ignoring your request to skip them.")
+		}
 		ready, err := healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil)
 		if !ready && err == nil {
 			err = fmt.Errorf("Cluster not ready")

From 2c1a44cadfa29902a60a5611b79ecc6188c8c330 Mon Sep 17 00:00:00 2001
From: Chris Waldon <cwaldon@redhat.com>
Date: Tue, 16 Mar 2021 14:39:20 -0400
Subject: [PATCH 07/15] fix: improve docs and log output as per code review

Signed-off-by: Chris Waldon <cwaldon@redhat.com>
---
 pkg/common/cluster/clusterutil.go                 | 4 ++++
 pkg/common/cluster/healthchecks/healthcheckjob.go | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/pkg/common/cluster/clusterutil.go b/pkg/common/cluster/clusterutil.go
index 57548d3021..5be6245549 100644
--- a/pkg/common/cluster/clusterutil.go
+++ b/pkg/common/cluster/clusterutil.go
@@ -207,6 +207,10 @@ func waitForClusterReadyWithOverrideAndExpectedNumberOfNodes(clusterID string, l
 	return nil
 }
 
+// ClusterConfig returns the rest API config for a given cluster as well as the provider it
+// inferred to discover the config.
+// param clusterID: If specified, Provider will be discovered through OCM. If the empty string,
+// 		assume we are running in a cluster and use in-cluster REST config instead.
 func ClusterConfig(clusterID string) (restConfig *rest.Config, providerType string, err error) {
 	if clusterID == "" {
 		if restConfig, err = rest.InClusterConfig(); err != nil {
diff --git a/pkg/common/cluster/healthchecks/healthcheckjob.go b/pkg/common/cluster/healthchecks/healthcheckjob.go
index 00e1e85dc9..f94d31d3b0 100644
--- a/pkg/common/cluster/healthchecks/healthcheckjob.go
+++ b/pkg/common/cluster/healthchecks/healthcheckjob.go
@@ -12,11 +12,11 @@ import (
 	"k8s.io/client-go/kubernetes"
 )
 
-// CheckHealthcheckJob uses the `openshift-cluster-ready-*` healthcheck job to determine cluster readiness.
+// CheckHealthcheckJob uses the `osd-cluster-ready` healthcheck job to determine cluster readiness.
 func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, logger *log.Logger) (bool, error) {
 	logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
 
-	logger.Print("Checking that all Nodes are running or completed...")
+	logger.Print("Checking whether cluster is healthy before proceeding...")
 
 	bv1C := k8sClient.BatchV1()
 	namespace := "openshift-monitoring"

From 1325f9e9e3b0d4bb0a15034613beb7ab75ccb76e Mon Sep 17 00:00:00 2001
From: Chris Waldon <cwaldon@redhat.com>
Date: Mon, 22 Mar 2021 14:16:57 -0400
Subject: [PATCH 08/15] feat: update job monitoring logic to match new expected
 behavior

Signed-off-by: Chris Waldon <cwaldon@redhat.com>
---
 .../cluster/healthchecks/healthcheckjob.go    | 22 +++++++++++--------
 pkg/e2e/e2e.go                                |  9 ++------
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/pkg/common/cluster/healthchecks/healthcheckjob.go b/pkg/common/cluster/healthchecks/healthcheckjob.go
index f94d31d3b0..4ab007fd43 100644
--- a/pkg/common/cluster/healthchecks/healthcheckjob.go
+++ b/pkg/common/cluster/healthchecks/healthcheckjob.go
@@ -12,8 +12,9 @@ import (
 	"k8s.io/client-go/kubernetes"
 )
 
-// CheckHealthcheckJob uses the `osd-cluster-ready` healthcheck job to determine cluster readiness.
-func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, logger *log.Logger) (bool, error) {
+// CheckHealthcheckJob uses the `osd-cluster-ready` healthcheck job to determine cluster readiness. If the cluster
+// is not ready, it will return an error.
+func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, logger *log.Logger) error {
 	logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
 
 	logger.Print("Checking whether cluster is healthy before proceeding...")
@@ -23,7 +24,7 @@ func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, l
 	name := "osd-cluster-ready"
 	jobs, err := bv1C.Jobs(namespace).List(ctx, metav1.ListOptions{})
 	if err != nil {
-		return false, fmt.Errorf("failed listing jobs: %w", err)
+		return fmt.Errorf("failed listing jobs: %w", err)
 	}
 	for _, job := range jobs.Items {
 		if job.Name != name {
@@ -31,7 +32,7 @@ func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, l
 		}
 		if job.Status.Succeeded > 0 {
 			log.Println("Healthcheck job has already succeeded")
-			return true, nil
+			return nil
 		}
 		log.Println("Healthcheck job has not yet succeeded, watching...")
 	}
@@ -40,7 +41,7 @@ func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, l
 		FieldSelector:   "metadata.name=osd-cluster-ready",
 	})
 	if err != nil {
-		return false, fmt.Errorf("failed watching job: %w", err)
+		return fmt.Errorf("failed watching job: %w", err)
 	}
 	for {
 		select {
@@ -51,17 +52,20 @@ func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, l
 			case watch.Modified:
 				job := event.Object.(*batchv1.Job)
 				if job.Status.Succeeded > 0 {
-					return true, nil
+					return nil
+				}
+				if job.Status.Failed > 0 {
+					return fmt.Errorf("cluster readiness job failed")
 				}
 			case watch.Deleted:
-				return false, fmt.Errorf("cluster readiness job deleted before becoming ready")
+				return fmt.Errorf("cluster readiness job deleted before becoming ready (this should never happen)")
 			case watch.Error:
-				return false, fmt.Errorf("watch returned error event: %v", event)
+				return fmt.Errorf("watch returned error event: %v", event)
 			default:
 				logger.Printf("Unrecognized event type while watching for healthcheck job updates: %v", event.Type)
 			}
 		case <-ctx.Done():
-			return false, fmt.Errorf("healtcheck watch context cancelled while still waiting for success")
+			return fmt.Errorf("healtcheck watch context cancelled while still waiting for success")
 		}
 	}
 }
diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go
index 995afc742c..ae5b9d2c72 100644
--- a/pkg/e2e/e2e.go
+++ b/pkg/e2e/e2e.go
@@ -137,18 +137,13 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
 		if viper.GetString(config.Tests.SkipClusterHealthChecks) != "" {
 			log.Println("WARNING: Skipping cluster health checks is no longer supported, as they no longer introduce delay into the build. Ignoring your request to skip them.")
 		}
-		ready, err := healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil)
-		if !ready && err == nil {
-			err = fmt.Errorf("Cluster not ready")
-		}
-		if ready {
-			log.Println("Cluster is healthy and ready for testing")
-		}
+		err = healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil)
 		events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed).ShouldNot(HaveOccurred(), "cluster failed health check")
 		if err != nil {
 			getLogs()
 			return []byte{}
 		}
+		log.Println("Cluster is healthy and ready for testing")
 
 		if len(viper.GetString(config.Addons.IDs)) > 0 {
 			if viper.GetString(config.Provider) != "mock" {

From 0d93eecfce899141c60b04869c819fbe7733db13 Mon Sep 17 00:00:00 2001
From: Chris Waldon <cwaldon@redhat.com>
Date: Mon, 29 Mar 2021 13:17:15 -0400
Subject: [PATCH 09/15] feat: add config knob for healthcheck timeout

Signed-off-by: Chris Waldon <cwaldon@redhat.com>
---
 pkg/common/config/config.go | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/pkg/common/config/config.go b/pkg/common/config/config.go
index 1d393092c7..33e24d312f 100644
--- a/pkg/common/config/config.go
+++ b/pkg/common/config/config.go
@@ -175,6 +175,12 @@ var Tests = struct {
 	// Env: SKIP_CLUSTER_HEALTH_CHECKS
 	SkipClusterHealthChecks string
 
+	// ClusterHealthChecksTimeout defines the duration for which the harness will
+	// wait for the cluster to indicate it is healthy before cancelling the test
+	// run. This value should be formatted for use with time.ParseDuration.
+	// Env: CLUSTER_HEALTH_CHECKS_TIMEOUT
+	ClusterHealthChecksTimeout string
+
 	// MetricsBucket is the bucket that metrics data will be uploaded to.
 	// Env: METRICS_BUCKET
 	MetricsBucket string
@@ -184,16 +190,17 @@ var Tests = struct {
 	ServiceAccount string
 }{
 
-	PollingTimeout:            "tests.pollingTimeout",
-	GinkgoSkip:                "tests.ginkgoSkip",
-	GinkgoFocus:               "tests.focus",
-	TestsToRun:                "tests.testsToRun",
-	SuppressSkipNotifications: "tests.suppressSkipNotifications",
-	CleanRuns:                 "tests.cleanRuns",
-	OperatorSkip:              "tests.operatorSkip",
-	SkipClusterHealthChecks:   "tests.skipClusterHealthChecks",
-	MetricsBucket:             "tests.metricsBucket",
-	ServiceAccount:            "tests.serviceAccount",
+	PollingTimeout:             "tests.pollingTimeout",
+	GinkgoSkip:                 "tests.ginkgoSkip",
+	GinkgoFocus:                "tests.focus",
+	TestsToRun:                 "tests.testsToRun",
+	SuppressSkipNotifications:  "tests.suppressSkipNotifications",
+	CleanRuns:                  "tests.cleanRuns",
+	OperatorSkip:               "tests.operatorSkip",
+	SkipClusterHealthChecks:    "tests.skipClusterHealthChecks",
+	MetricsBucket:              "tests.metricsBucket",
+	ServiceAccount:             "tests.serviceAccount",
+	ClusterHealthChecksTimeout: "tests.clusterHealthChecksTimeout",
 }
 
 // Cluster config keys.
@@ -516,6 +523,9 @@ func init() {
 	viper.SetDefault(Tests.SkipClusterHealthChecks, false)
 	viper.BindEnv(Tests.OperatorSkip, "SKIP_CLUSTER_HEALTH_CHECKS")
 
+	viper.SetDefault(Tests.ClusterHealthChecksTimeout, "2h")
+	viper.BindEnv(Tests.ClusterHealthChecksTimeout, "CLUSTER_HEALTH_CHECKS_TIMEOUT")
+
 	viper.SetDefault(Tests.MetricsBucket, "osde2e-metrics")
 	viper.BindEnv(Tests.MetricsBucket, "METRICS_BUCKET")
 

From 8fef0ba6d0d6cf6df322324b9f5390742f7062a2 Mon Sep 17 00:00:00 2001
From: Chris Waldon <cwaldon@redhat.com>
Date: Mon, 29 Mar 2021 13:17:50 -0400
Subject: [PATCH 10/15] feat: use healthcheck timeout knob in e2e suite

Signed-off-by: Chris Waldon <cwaldon@redhat.com>
---
 pkg/e2e/e2e.go | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go
index ae5b9d2c72..2487c72bf8 100644
--- a/pkg/e2e/e2e.go
+++ b/pkg/e2e/e2e.go
@@ -132,7 +132,12 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
 		if err != nil {
 			log.Printf("Error generating Kube Clientset: %v\n", err)
 		}
-		ctx, cancel := context.WithTimeout(context.Background(), time.Hour*2)
+		duration, err := time.ParseDuration(viper.GetString(config.Tests.ClusterHealthChecksTimeout))
+		if err != nil {
+			log.Printf("Failed parsing health check timeout, using 2 hours: %v", err)
+			duration = time.Hour * 2
+		}
+		ctx, cancel := context.WithTimeout(context.Background(), duration)
 		defer cancel()
 		if viper.GetString(config.Tests.SkipClusterHealthChecks) != "" {
 			log.Println("WARNING: Skipping cluster health checks is no longer supported, as they no longer introduce delay into the build. Ignoring your request to skip them.")

From 359eae4b56ffee2c2c3bacd795ab3eb75d8ec441 Mon Sep 17 00:00:00 2001
From: Chris Waldon <cwaldon@redhat.com>
Date: Mon, 29 Mar 2021 13:18:08 -0400
Subject: [PATCH 11/15] fix: only print usage warning when needed

Signed-off-by: Chris Waldon <cwaldon@redhat.com>
---
 pkg/e2e/e2e.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go
index 2487c72bf8..f6dd13b534 100644
--- a/pkg/e2e/e2e.go
+++ b/pkg/e2e/e2e.go
@@ -139,7 +139,7 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
 		}
 		ctx, cancel := context.WithTimeout(context.Background(), duration)
 		defer cancel()
-		if viper.GetString(config.Tests.SkipClusterHealthChecks) != "" {
+		if viper.GetString(config.Tests.SkipClusterHealthChecks) != "false" {
 			log.Println("WARNING: Skipping cluster health checks is no longer supported, as they no longer introduce delay into the build. Ignoring your request to skip them.")
 		}
 		err = healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil)

From 6de49708d92896a497ff34eb78d408cf325e028d Mon Sep 17 00:00:00 2001
From: Chris Waldon <cwaldon@redhat.com>
Date: Tue, 30 Mar 2021 13:24:50 -0400
Subject: [PATCH 12/15] feat: fail beforesuite if duration parse fails

Signed-off-by: Chris Waldon <cwaldon@redhat.com>
---
 pkg/e2e/e2e.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go
index febf530326..ce13863438 100644
--- a/pkg/e2e/e2e.go
+++ b/pkg/e2e/e2e.go
@@ -134,8 +134,8 @@ func beforeSuite() bool {
 		}
 		duration, err := time.ParseDuration(viper.GetString(config.Tests.ClusterHealthChecksTimeout))
 		if err != nil {
-			log.Printf("Failed parsing health check timeout, using 2 hours: %v", err)
-			duration = time.Hour * 2
+			log.Printf("Failed parsing health check timeout: %v", err)
+			return false
 		}
 		ctx, cancel := context.WithTimeout(context.Background(), duration)
 		defer cancel()

From 7dda5b8559cd3ce5a353195969d73d411a1a211c Mon Sep 17 00:00:00 2001
From: Chris Waldon <cwaldon@redhat.com>
Date: Tue, 30 Mar 2021 13:36:24 -0400
Subject: [PATCH 13/15] feat: repurpose old skip healthcheck flag to skip new
 ones

Signed-off-by: Chris Waldon <cwaldon@redhat.com>
---
 pkg/e2e/e2e.go | 50 ++++++++++++++++++++++++--------------------------
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go
index ce13863438..dae16b6306 100644
--- a/pkg/e2e/e2e.go
+++ b/pkg/e2e/e2e.go
@@ -124,33 +124,31 @@ func beforeSuite() bool {
 			log.Printf("Error while adding upgrade version property to cluster via OCM: %v", err)
 		}
 
-		clusterConfig, _, err := clusterutil.ClusterConfig(cluster.ID())
-		if err != nil {
-			log.Printf("Failed looking up cluster config for healthcheck: %v", err)
-		}
-		kubeClient, err := kubernetes.NewForConfig(clusterConfig)
-		if err != nil {
-			log.Printf("Error generating Kube Clientset: %v\n", err)
-		}
-		duration, err := time.ParseDuration(viper.GetString(config.Tests.ClusterHealthChecksTimeout))
-		if err != nil {
-			log.Printf("Failed parsing health check timeout: %v", err)
-			return false
-		}
-		ctx, cancel := context.WithTimeout(context.Background(), duration)
-		defer cancel()
-		if viper.GetString(config.Tests.SkipClusterHealthChecks) != "false" {
-			log.Println("WARNING: Skipping cluster health checks is no longer supported, as they no longer introduce delay into the build. Ignoring your request to skip them.")
-		}
-		err = healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil)
-		events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed)
-		if err != nil {
-			log.Printf("Cluster failed health check: %v", err)
-			getLogs()
-			return false
+		if viper.GetString(config.Tests.SkipClusterHealthChecks) != "true" {
+			clusterConfig, _, err := clusterutil.ClusterConfig(cluster.ID())
+			if err != nil {
+				log.Printf("Failed looking up cluster config for healthcheck: %v", err)
+			}
+			kubeClient, err := kubernetes.NewForConfig(clusterConfig)
+			if err != nil {
+				log.Printf("Error generating Kube Clientset: %v\n", err)
+			}
+			duration, err := time.ParseDuration(viper.GetString(config.Tests.ClusterHealthChecksTimeout))
+			if err != nil {
+				log.Printf("Failed parsing health check timeout: %v", err)
+				return false
+			}
+			ctx, cancel := context.WithTimeout(context.Background(), duration)
+			defer cancel()
+			err = healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil)
+			events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed)
+			if err != nil {
+				log.Printf("Cluster failed health check: %v", err)
+				getLogs()
+				return false
+			}
+			log.Println("Cluster is healthy and ready for testing")
 		}
-		log.Println("Cluster is healthy and ready for testing")
-
 		if len(viper.GetString(config.Addons.IDs)) > 0 {
 			if viper.GetString(config.Provider) != "mock" {
 				err = installAddons()

From 71de62455016f08bb17577f863f388afcdf39f98 Mon Sep 17 00:00:00 2001
From: Chris Waldon <cwaldon@redhat.com>
Date: Tue, 30 Mar 2021 13:40:43 -0400
Subject: [PATCH 14/15] feat: log when skipping health checks

Signed-off-by: Chris Waldon <cwaldon@redhat.com>
---
 pkg/e2e/e2e.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go
index dae16b6306..72eb15cf78 100644
--- a/pkg/e2e/e2e.go
+++ b/pkg/e2e/e2e.go
@@ -148,6 +148,8 @@ func beforeSuite() bool {
 				return false
 			}
 			log.Println("Cluster is healthy and ready for testing")
+		} else {
+			log.Println("Skipping health checks as requested")
 		}
 		if len(viper.GetString(config.Addons.IDs)) > 0 {
 			if viper.GetString(config.Provider) != "mock" {

From e0e01435f9ca6870fbc049f31bbd04515ba43862 Mon Sep 17 00:00:00 2001
From: Chris Waldon <cwaldon@redhat.com>
Date: Tue, 30 Mar 2021 13:45:00 -0400
Subject: [PATCH 15/15] fix: actually fail healthcheck if any part fails

Signed-off-by: Chris Waldon <cwaldon@redhat.com>
---
 pkg/e2e/e2e.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pkg/e2e/e2e.go b/pkg/e2e/e2e.go
index 72eb15cf78..f6dced6da2 100644
--- a/pkg/e2e/e2e.go
+++ b/pkg/e2e/e2e.go
@@ -128,10 +128,12 @@ func beforeSuite() bool {
 			clusterConfig, _, err := clusterutil.ClusterConfig(cluster.ID())
 			if err != nil {
 				log.Printf("Failed looking up cluster config for healthcheck: %v", err)
+				return false
 			}
 			kubeClient, err := kubernetes.NewForConfig(clusterConfig)
 			if err != nil {
 				log.Printf("Error generating Kube Clientset: %v\n", err)
+				return false
 			}
 			duration, err := time.ParseDuration(viper.GetString(config.Tests.ClusterHealthChecksTimeout))
 			if err != nil {