Merge pull request #743 from whereswaldon/SDCICD-577
Use healthcheck cron job to determine cluster readiness
openshift-merge-robot authored Mar 30, 2021
2 parents ed21e8a + e0e0143 commit 5b86061
Showing 4 changed files with 153 additions and 39 deletions.
53 changes: 31 additions & 22 deletions pkg/common/cluster/clusterutil.go
@@ -214,41 +214,50 @@ func waitForClusterReadyWithOverrideAndExpectedNumberOfNodes(clusterID string, l
return nil
}

// PollClusterHealth looks at CVO data to determine if a cluster is alive/healthy or not
// ClusterConfig returns the rest API config for a given cluster as well as the provider it
// inferred to discover the config.
// param clusterID: If specified, Provider will be discovered through OCM. If the empty string,
// assume we are running in a cluster and use in-cluster REST config instead.
func PollClusterHealth(clusterID string, logger *log.Logger) (status bool, failures []string, err error) {
logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)

logger.Print("Polling Cluster Health...\n")

var restConfig *rest.Config
var providerType string

func ClusterConfig(clusterID string) (restConfig *rest.Config, providerType string, err error) {
if clusterID == "" {
if restConfig, err = rest.InClusterConfig(); err != nil {
logger.Printf("Error getting in-cluster REST config: %v\n", err)
return false, nil, nil
return nil, "", fmt.Errorf("error getting in-cluster rest config: %w", err)
}

// FIXME: Is there a way to discover this from within the cluster?
// For now, ocm and rosa behave the same, so hardcode either.
providerType = "ocm"
return

} else {
provider, err := providers.ClusterProvider()
}
provider, err := providers.ClusterProvider()

if err != nil {
return false, nil, fmt.Errorf("error getting cluster provisioning client: %v", err)
}
if err != nil {
return nil, "", fmt.Errorf("error getting cluster provisioning client: %w", err)
}
providerType = provider.Type()

restConfig, err = getRestConfig(provider, clusterID)
if err != nil {
logger.Printf("Error generating Rest Config: %v\n", err)
return false, nil, nil
}
restConfig, err = getRestConfig(provider, clusterID)
if err != nil {

return nil, "", fmt.Errorf("error generating rest config: %w", err)
}

return
}

providerType = provider.Type()
// PollClusterHealth looks at CVO data to determine if a cluster is alive/healthy or not
// param clusterID: If specified, Provider will be discovered through OCM. If the empty string,
// assume we are running in a cluster and use in-cluster REST config instead.
func PollClusterHealth(clusterID string, logger *log.Logger) (status bool, failures []string, err error) {
logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)

logger.Print("Polling Cluster Health...\n")

restConfig, providerType, err := ClusterConfig(clusterID)
if err != nil {
logger.Printf("Error getting cluster config: %v\n", err)
return false, nil, nil
}

kubeClient, err := kubernetes.NewForConfig(restConfig)
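
The new ClusterConfig helper pulls REST-config discovery out of PollClusterHealth: an empty clusterID selects the in-cluster config and hardcodes the "ocm" provider, while a non-empty ID resolves the provider through OCM and builds the config from it. A minimal sketch of calling the helper from another package (the wrapper program and its error handling are illustrative, not part of this commit):

package main

import (
	"fmt"
	"log"

	"github.com/openshift/osde2e/pkg/common/cluster"
	"k8s.io/client-go/kubernetes"
)

func main() {
	// Empty clusterID: assume we are running inside the target cluster and
	// use the in-cluster service-account credentials.
	restConfig, providerType, err := cluster.ClusterConfig("")
	if err != nil {
		log.Fatalf("discovering cluster config: %v", err)
	}
	fmt.Printf("config discovered via provider %q\n", providerType)

	// The returned *rest.Config feeds directly into a Kubernetes clientset.
	kubeClient, err := kubernetes.NewForConfig(restConfig)
	if err != nil {
		log.Fatalf("building clientset: %v", err)
	}
	_ = kubeClient
}
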
71 changes: 71 additions & 0 deletions pkg/common/cluster/healthchecks/healthcheckjob.go
@@ -0,0 +1,71 @@
package healthchecks

import (
"context"
"fmt"
"log"

"github.com/openshift/osde2e/pkg/common/logging"
batchv1 "k8s.io/api/batch/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/kubernetes"
)

// CheckHealthcheckJob uses the `osd-cluster-ready` healthcheck job to determine cluster readiness. If the cluster
// is not ready, it will return an error.
func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, logger *log.Logger) error {
logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)

logger.Print("Checking whether cluster is healthy before proceeding...")

bv1C := k8sClient.BatchV1()
namespace := "openshift-monitoring"
name := "osd-cluster-ready"
jobs, err := bv1C.Jobs(namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return fmt.Errorf("failed listing jobs: %w", err)
}
for _, job := range jobs.Items {
if job.Name != name {
continue
}
if job.Status.Succeeded > 0 {
log.Println("Healthcheck job has already succeeded")
return nil
}
log.Println("Healthcheck job has not yet succeeded, watching...")
}
watcher, err := bv1C.Jobs(namespace).Watch(ctx, metav1.ListOptions{
ResourceVersion: jobs.ResourceVersion,
FieldSelector: "metadata.name=osd-cluster-ready",
})
if err != nil {
return fmt.Errorf("failed watching job: %w", err)
}
for {
select {
case event := <-watcher.ResultChan():
switch event.Type {
case watch.Added:
fallthrough
case watch.Modified:
job := event.Object.(*batchv1.Job)
if job.Status.Succeeded > 0 {
return nil
}
if job.Status.Failed > 0 {
return fmt.Errorf("cluster readiness job failed")
}
case watch.Deleted:
return fmt.Errorf("cluster readiness job deleted before becoming ready (this should never happen)")
case watch.Error:
return fmt.Errorf("watch returned error event: %v", event)
default:
logger.Printf("Unrecognized event type while watching for healthcheck job updates: %v", event.Type)
}
case <-ctx.Done():
return fmt.Errorf("healtcheck watch context cancelled while still waiting for success")
}
}
}
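
CheckHealthcheckJob blocks until the osd-cluster-ready job succeeds, fails, is deleted, or the supplied context is cancelled, so callers are expected to bound it with a deadline. A small sketch of a standalone invocation (the wrapper function, package name, and timeout value are illustrative; the clientset is assumed to already be configured):

package healthcheckexample // hypothetical package, for illustration only

import (
	"context"
	"fmt"
	"time"

	"github.com/openshift/osde2e/pkg/common/cluster/healthchecks"
	"k8s.io/client-go/kubernetes"
)

// waitForReady bounds the healthcheck watch with a deadline so an unhealthy
// cluster cannot stall the caller indefinitely.
func waitForReady(kubeClient *kubernetes.Clientset, timeout time.Duration) error {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	// Passing nil for the logger lets the package construct its own default logger.
	if err := healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil); err != nil {
		return fmt.Errorf("cluster never became ready: %w", err)
	}
	return nil
}
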
30 changes: 20 additions & 10 deletions pkg/common/config/config.go
@@ -175,6 +175,12 @@ var Tests = struct {
// Env: SKIP_CLUSTER_HEALTH_CHECKS
SkipClusterHealthChecks string

// ClusterHealthChecksTimeout defines the duration for which the harness will
// wait for the cluster to indicate it is healthy before cancelling the test
// run. This value should be formatted for use with time.ParseDuration.
// Env: CLUSTER_HEALTH_CHECKS_TIMEOUT
ClusterHealthChecksTimeout string

// MetricsBucket is the bucket that metrics data will be uploaded to.
// Env: METRICS_BUCKET
MetricsBucket string
@@ -184,16 +190,17 @@ var Tests = struct {
ServiceAccount string
}{

PollingTimeout: "tests.pollingTimeout",
GinkgoSkip: "tests.ginkgoSkip",
GinkgoFocus: "tests.focus",
TestsToRun: "tests.testsToRun",
SuppressSkipNotifications: "tests.suppressSkipNotifications",
CleanRuns: "tests.cleanRuns",
OperatorSkip: "tests.operatorSkip",
SkipClusterHealthChecks: "tests.skipClusterHealthChecks",
MetricsBucket: "tests.metricsBucket",
ServiceAccount: "tests.serviceAccount",
PollingTimeout: "tests.pollingTimeout",
GinkgoSkip: "tests.ginkgoSkip",
GinkgoFocus: "tests.focus",
TestsToRun: "tests.testsToRun",
SuppressSkipNotifications: "tests.suppressSkipNotifications",
CleanRuns: "tests.cleanRuns",
OperatorSkip: "tests.operatorSkip",
SkipClusterHealthChecks: "tests.skipClusterHealthChecks",
MetricsBucket: "tests.metricsBucket",
ServiceAccount: "tests.serviceAccount",
ClusterHealthChecksTimeout: "tests.clusterHealthChecksTimeout",
}

// Cluster config keys.
@@ -527,6 +534,9 @@ func init() {
viper.SetDefault(Tests.SkipClusterHealthChecks, false)
viper.BindEnv(Tests.OperatorSkip, "SKIP_CLUSTER_HEALTH_CHECKS")

viper.SetDefault(Tests.ClusterHealthChecksTimeout, "2h")
viper.BindEnv(Tests.ClusterHealthChecksTimeout, "CLUSTER_HEALTH_CHECKS_TIMEOUT")

viper.SetDefault(Tests.MetricsBucket, "osde2e-metrics")
viper.BindEnv(Tests.MetricsBucket, "METRICS_BUCKET")

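
Because the new tests.clusterHealthChecksTimeout key is stored as a string (default "2h", overridable via CLUSTER_HEALTH_CHECKS_TIMEOUT), consumers must run it through time.ParseDuration before using it as a deadline, as the e2e.go hunk below does. A hedged sketch of reading the key (the helper function and package name are illustrative, and the plain spf13/viper import is an assumption about how the repo accesses viper):

package configexample // hypothetical package, for illustration only

import (
	"fmt"
	"time"

	"github.com/spf13/viper"

	"github.com/openshift/osde2e/pkg/common/config"
)

// healthCheckTimeout resolves the configured timeout string into a time.Duration.
func healthCheckTimeout() (time.Duration, error) {
	raw := viper.GetString(config.Tests.ClusterHealthChecksTimeout) // e.g. "2h" or "45m"
	d, err := time.ParseDuration(raw)
	if err != nil {
		return 0, fmt.Errorf("invalid cluster health check timeout %q: %w", raw, err)
	}
	return d, nil
}
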
38 changes: 31 additions & 7 deletions pkg/e2e/e2e.go
@@ -4,6 +4,7 @@ package e2e
import (
"bytes"
"compress/gzip"
"context"
"encoding/json"
"encoding/xml"
"fmt"
@@ -19,6 +20,7 @@ import (
"github.com/hpcloud/tail"
junit "github.com/joshdk/go-junit"
vegeta "github.com/tsenart/vegeta/lib"
"k8s.io/client-go/kubernetes"

pd "github.com/PagerDuty/go-pagerduty"
"github.com/onsi/ginkgo"
@@ -31,6 +33,7 @@ import (
"github.com/openshift/osde2e/pkg/common/aws"
"github.com/openshift/osde2e/pkg/common/cluster"
clusterutil "github.com/openshift/osde2e/pkg/common/cluster"
"github.com/openshift/osde2e/pkg/common/cluster/healthchecks"
"github.com/openshift/osde2e/pkg/common/clusterproperties"
"github.com/openshift/osde2e/pkg/common/config"
"github.com/openshift/osde2e/pkg/common/events"
@@ -121,14 +124,35 @@ func beforeSuite() bool {
log.Printf("Error while adding upgrade version property to cluster via OCM: %v", err)
}

err = clusterutil.WaitForClusterReady(cluster.ID(), nil)
events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed)
if err != nil {
log.Printf("Cluster failed health check: %v", err)
getLogs()
return false
if viper.GetString(config.Tests.SkipClusterHealthChecks) != "true" {
clusterConfig, _, err := clusterutil.ClusterConfig(cluster.ID())
if err != nil {
log.Printf("Failed looking up cluster config for healthcheck: %v", err)
return false
}
kubeClient, err := kubernetes.NewForConfig(clusterConfig)
if err != nil {
log.Printf("Error generating Kube Clientset: %v\n", err)
return false
}
duration, err := time.ParseDuration(viper.GetString(config.Tests.ClusterHealthChecksTimeout))
if err != nil {
log.Printf("Failed parsing health check timeout: %v", err)
return false
}
ctx, cancel := context.WithTimeout(context.Background(), duration)
defer cancel()
err = healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil)
events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed)
if err != nil {
log.Printf("Cluster failed health check: %v", err)
getLogs()
return false
}
log.Println("Cluster is healthy and ready for testing")
} else {
log.Println("Skipping health checks as requested")
}

if len(viper.GetString(config.Addons.IDs)) > 0 {
if viper.GetString(config.Provider) != "mock" {
err = installAddons()
