Use healthcheck cron job to determine cluster readiness #743

Status: Merged (16 commits, merged Mar 30, 2021)

Changes from 6 commits
53 changes: 29 additions & 24 deletions pkg/common/cluster/clusterutil.go
@@ -207,41 +207,46 @@ func waitForClusterReadyWithOverrideAndExpectedNumberOfNodes(clusterID string, l
 	return nil
 }
 
-// PollClusterHealth looks at CVO data to determine if a cluster is alive/healthy or not
-// param clusterID: If specified, Provider will be discovered through OCM. If the empty string,
-// assume we are running in a cluster and use in-cluster REST config instead.
-func PollClusterHealth(clusterID string, logger *log.Logger) (status bool, failures []string, err error) {
-	logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
-
-	logger.Print("Polling Cluster Health...\n")
-
-	var restConfig *rest.Config
-	var providerType string
-
+func ClusterConfig(clusterID string) (restConfig *rest.Config, providerType string, err error) {
 	if clusterID == "" {
 		if restConfig, err = rest.InClusterConfig(); err != nil {
-			logger.Printf("Error getting in-cluster REST config: %v\n", err)
-			return false, nil, nil
+			return nil, "", fmt.Errorf("error getting in-cluster rest config: %w", err)
 		}
 
 		// FIXME: Is there a way to discover this from within the cluster?
 		// For now, ocm and rosa behave the same, so hardcode either.
 		providerType = "ocm"
-	} else {
-		provider, err := providers.ClusterProvider()
+		return
+	}
 
-		if err != nil {
-			return false, nil, fmt.Errorf("error getting cluster provisioning client: %v", err)
-		}
+	provider, err := providers.ClusterProvider()
+	if err != nil {
+		return nil, "", fmt.Errorf("error getting cluster provisioning client: %w", err)
+	}
+	providerType = provider.Type()
 
-		restConfig, err = getRestConfig(provider, clusterID)
-		if err != nil {
-			logger.Printf("Error generating Rest Config: %v\n", err)
-			return false, nil, nil
-		}
+	restConfig, err = getRestConfig(provider, clusterID)
+	if err != nil {
+		return nil, "", fmt.Errorf("error generating rest config: %w", err)
+	}
 
-		providerType = provider.Type()
-	}
+	return
+}
+
+// PollClusterHealth looks at CVO data to determine if a cluster is alive/healthy or not
+// param clusterID: If specified, Provider will be discovered through OCM. If the empty string,
+// assume we are running in a cluster and use in-cluster REST config instead.
+func PollClusterHealth(clusterID string, logger *log.Logger) (status bool, failures []string, err error) {
+	logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
+
+	logger.Print("Polling Cluster Health...\n")
+
+	restConfig, providerType, err := ClusterConfig(clusterID)
+	if err != nil {
+		logger.Printf("Error getting cluster config: %v\n", err)
+		return false, nil, nil
+	}
 
 	kubeClient, err := kubernetes.NewForConfig(restConfig)
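
For reference, a minimal sketch (not part of the diff) of how the extracted ClusterConfig helper is consumed; the newKubeClient wrapper is illustrative, while the import path and alias match the ones used in pkg/e2e/e2e.go below:

package example

import (
	"fmt"

	clusterutil "github.com/openshift/osde2e/pkg/common/cluster"
	"k8s.io/client-go/kubernetes"
)

// newKubeClient builds a kube clientset from the refactored helper. An empty
// clusterID selects the in-cluster REST config (provider "ocm"); otherwise the
// provider and REST config are resolved through OCM.
func newKubeClient(clusterID string) (*kubernetes.Clientset, string, error) {
	restConfig, providerType, err := clusterutil.ClusterConfig(clusterID)
	if err != nil {
		return nil, "", fmt.Errorf("error getting cluster config: %w", err)
	}
	kubeClient, err := kubernetes.NewForConfig(restConfig)
	if err != nil {
		return nil, "", fmt.Errorf("error building kube clientset: %w", err)
	}
	return kubeClient, providerType, nil
}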
67 changes: 67 additions & 0 deletions pkg/common/cluster/healthchecks/healthcheckjob.go
@@ -0,0 +1,67 @@
package healthchecks

import (
	"context"
	"fmt"
	"log"

	"github.com/openshift/osde2e/pkg/common/logging"
	batchv1 "k8s.io/api/batch/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/kubernetes"
)

// CheckHealthcheckJob uses the `osd-cluster-ready` healthcheck Job in the openshift-monitoring namespace to determine cluster readiness.
func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, logger *log.Logger) (bool, error) {
	logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)

	logger.Print("Checking whether the osd-cluster-ready healthcheck Job has succeeded...")

	bv1C := k8sClient.BatchV1()
	namespace := "openshift-monitoring"
	name := "osd-cluster-ready"
	jobs, err := bv1C.Jobs(namespace).List(ctx, metav1.ListOptions{})

Review thread:
Member: I'm not super familiar with client-go, but why doesn't bv1C.Jobs(namespace).Get(ctx, name, metav1.GetOptions{}) work here?
Contributor (author): Sigh... It does. I just did it a dumb way because I too am not very familiar with client-go.
Contributor (author): Wait! On second thought, it does need to be a list. I need the resourceVersion field of the list itself in order to initiate the watch later. This allows me to detect the creation and deletion of jobs matching my criteria since I performed the list.
Member: Ah, I understand. If you use a zero resourceVersion, do you even need the initial get/list?
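
A minimal sketch of the Get-based lookup discussed above, using the bv1C, namespace, name, and ctx already in scope in this function (illustrative only; unlike List, Get does not return a collection resourceVersion to anchor the watch):

	job, err := bv1C.Jobs(namespace).Get(ctx, name, metav1.GetOptions{})
	if err != nil {
		return false, fmt.Errorf("failed getting job %s/%s: %w", namespace, name, err)
	}
	if job.Status.Succeeded > 0 {
		// The healthcheck job already succeeded; no need to watch.
		return true, nil
	}

The merged code keeps the List call above and passes jobs.ResourceVersion to the Watch below.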

	if err != nil {
		return false, fmt.Errorf("failed listing jobs: %w", err)
	}
	for _, job := range jobs.Items {
		if job.Name != name {
			continue
		}
		if job.Status.Succeeded > 0 {
			logger.Println("Healthcheck job has already succeeded")
			return true, nil
		}
		logger.Println("Healthcheck job has not yet succeeded, watching...")
	}
	watcher, err := bv1C.Jobs(namespace).Watch(ctx, metav1.ListOptions{
		ResourceVersion: jobs.ResourceVersion,
		FieldSelector:   "metadata.name=osd-cluster-ready",
	})
	if err != nil {
		return false, fmt.Errorf("failed watching job: %w", err)
	}
	for {
		select {
		case event := <-watcher.ResultChan():
			switch event.Type {
			case watch.Added:
				fallthrough
			case watch.Modified:
				job := event.Object.(*batchv1.Job)
				if job.Status.Succeeded > 0 {

Review thread:
Member: I think we need to handle "completed but failed" as well. ...But how we handle it is different before vs after openshift/configure-alertmanager-operator#143:
  • Before: Failed Job means you fail here.
  • After: Failed Job will be deleted and reinstated (see below).
Need to think through this some more.
Contributor (author): I'll wait to hear your updated thoughts on how this should work before altering this logic.
Member: Talked this through with @jharrington22 just now. We're going to rework the readiness side so the picture is clear. Specific to this case, I think for osde2e's purposes "completed but failed" will translate to "cluster not ready, ain't ever gonna be". But I'm also pretty sure we're going to use prometheus to carry that state... which would mean substantial changes in this PR. Stay tuned -- I hope to have something written up by tomorrow.
Member: Keep an eye on OSD-6646 starting here.
Member: "Completed but failed" should be treated as "cluster not ready, ain't ever gonna be". I'll post more info in a top-level comment.

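One possible shape for the "completed but failed" handling discussed above, sketched against the job value already in scope in this case (not what this revision merges; it uses only packages the file already imports):

	// Treat a terminal Failed condition as "cluster not ready, never will be".
	for _, cond := range job.Status.Conditions {
		if cond.Type == batchv1.JobFailed && cond.Status == "True" {
			return false, fmt.Errorf("healthcheck job failed: %s: %s", cond.Reason, cond.Message)
		}
	}

The code as shown in this revision checks only for success: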
					return true, nil
				}
			case watch.Deleted:
				return false, fmt.Errorf("cluster readiness job deleted before becoming ready")

Review thread:
Member: We should accommodate deletion of the Job and keep looping. As soon as openshift/configure-alertmanager-operator#143 lands, deletion will be a normal part of the flow (see here). A bit more explanation: We'll delete the Job if it fails. This does not mean health checks failed -- it means something went horribly wrong under the covers, like we failed to talk to k8s or something.
Contributor (author): If something went that wrong, I doubt that we'd want to proceed with testing, so I think that this behavior is probably okay.
Member: Sorry, I wasn't very clear: a failing Job is not unusual early in the cluster's life. A common thing we used to see was prometheus not being up yet. Evictions can also be a cause. The point of having the Job owned by some kind of controller was so we could mitigate this kind of thing. That said, we could keep quite a bit of that level of retry logic in the Job itself via backoffLimit/activeDeadlineSeconds rather than having the controller react to failure. That would allow both sides to consider a failed Job "fatal" in whatever sense is appropriate.
Member (@2uasimojo, Mar 19, 2021): Update on this: For now, a deletion should never happen (except manually). So please consider it an immediate failure (as if the cluster will never be ready) -- in other words, the logic is correct as written. Ideally, please ping me and let me have a poke at the cluster if you do see this case.
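
For context on the backoffLimit/activeDeadlineSeconds idea above, a sketch of how the osd-cluster-ready Job could carry its own retry budget so that a Failed Job is genuinely terminal (the values are illustrative and are not taken from this PR or from configure-alertmanager-operator):

	// Illustrative Job spec fragment: retry the pod a few times, then give up for good.
	backoffLimit := int32(4)             // pod retries before the Job is marked Failed
	activeDeadline := int64(2 * 60 * 60) // hard two-hour cap on the whole Job
	spec := batchv1.JobSpec{
		BackoffLimit:          &backoffLimit,
		ActiveDeadlineSeconds: &activeDeadline,
	}
	_ = spec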

			case watch.Error:
				return false, fmt.Errorf("watch returned error event: %v", event)
			default:
				logger.Printf("Unrecognized event type while watching for healthcheck job updates: %v", event.Type)
			}
		case <-ctx.Done():
			return false, fmt.Errorf("healthcheck watch context cancelled while still waiting for success")
		}
	}
}
24 changes: 23 additions & 1 deletion pkg/e2e/e2e.go
@@ -4,6 +4,7 @@ package e2e
 import (
 	"bytes"
 	"compress/gzip"
+	"context"
 	"encoding/json"
 	"encoding/xml"
 	"fmt"
@@ -19,6 +20,7 @@ import (
 	"github.com/hpcloud/tail"
 	junit "github.com/joshdk/go-junit"
 	vegeta "github.com/tsenart/vegeta/lib"
+	"k8s.io/client-go/kubernetes"
 
 	pd "github.com/PagerDuty/go-pagerduty"
 	"github.com/onsi/ginkgo"
@@ -32,6 +34,7 @@ import (
 	"github.com/openshift/osde2e/pkg/common/aws"
 	"github.com/openshift/osde2e/pkg/common/cluster"
 	clusterutil "github.com/openshift/osde2e/pkg/common/cluster"
+	"github.com/openshift/osde2e/pkg/common/cluster/healthchecks"
 	"github.com/openshift/osde2e/pkg/common/clusterproperties"
 	"github.com/openshift/osde2e/pkg/common/config"
 	"github.com/openshift/osde2e/pkg/common/events"
@@ -121,7 +124,26 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
 		log.Printf("Error while adding upgrade version property to cluster via OCM: %v", err)
 	}
 
-	err = clusterutil.WaitForClusterReady(cluster.ID(), nil)
+	clusterConfig, _, err := clusterutil.ClusterConfig(cluster.ID())
+	if err != nil {
+		log.Printf("Failed looking up cluster config for healthcheck: %v", err)
+	}
+	kubeClient, err := kubernetes.NewForConfig(clusterConfig)
+	if err != nil {
+		log.Printf("Error generating Kube Clientset: %v\n", err)
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), time.Hour*2)

Review thread:
Member: Did you want to make this timeout configurable? It is in c-am-o FWIW. (This could be a separate PR.)
Contributor (author): I thought about it, but I don't think we have a real use-case for configuring it. YAGNI?
Member (@2uasimojo, Mar 23, 2021): IIRC there's a zillion other config knobs around this, including the clean count, success sleep, and failure sleep. But I'm sure you're right -- are the other knobs ever used? [Later] Actually, the use case is for testing, e.g. if you want to make sure the timeout code path works properly without having to wait 2h or edit code. I made extensive use of this in osd-cluster-ready itself when it had a similar tunable.
Contributor (author): @2uasimojo I've added a config knob for this and tested it.
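
The config knob the author mentions is not visible in this revision of the diff. A hypothetical sketch of wiring it through viper, replacing the hardcoded two-hour timeout above (config.Tests.ClusterReadyTimeout is a placeholder name, not the PR's actual key):

	// Hypothetical: let configuration override the readiness timeout.
	timeout := 2 * time.Hour
	if configured := viper.GetDuration(config.Tests.ClusterReadyTimeout); configured > 0 {
		timeout = configured
	}
	ctx, cancel := context.WithTimeout(context.Background(), timeout)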

+	defer cancel()
+	if viper.GetString(config.Tests.SkipClusterHealthChecks) != "" {
+		log.Println("WARNING: Skipping cluster health checks is no longer supported, as they no longer introduce delay into the build. Ignoring your request to skip them.")
+	}
+	ready, err := healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil)
+	if !ready && err == nil {
+		err = fmt.Errorf("Cluster not ready")
+	}
+	if ready {
+		log.Println("Cluster is healthy and ready for testing")
+	}
 	events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed).ShouldNot(HaveOccurred(), "cluster failed health check")
 	if err != nil {
 		getLogs()