Use healthcheck cron job to determine cluster readiness #743

Status: Merged (16 commits, merged Mar 30, 2021)
Changes from 13 commits
53 changes: 31 additions & 22 deletions pkg/common/cluster/clusterutil.go
@@ -214,41 +214,50 @@ func waitForClusterReadyWithOverrideAndExpectedNumberOfNodes(clusterID string, l
 	return nil
 }

-// PollClusterHealth looks at CVO data to determine if a cluster is alive/healthy or not
+// ClusterConfig returns the rest API config for a given cluster as well as the provider it
+// inferred to discover the config.
+// param clusterID: If specified, Provider will be discovered through OCM. If the empty string,
+// assume we are running in a cluster and use in-cluster REST config instead.
-func PollClusterHealth(clusterID string, logger *log.Logger) (status bool, failures []string, err error) {
-	logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
-
-	logger.Print("Polling Cluster Health...\n")
-
-	var restConfig *rest.Config
-	var providerType string
-
+func ClusterConfig(clusterID string) (restConfig *rest.Config, providerType string, err error) {
 	if clusterID == "" {
 		if restConfig, err = rest.InClusterConfig(); err != nil {
-			logger.Printf("Error getting in-cluster REST config: %v\n", err)
-			return false, nil, nil
+			return nil, "", fmt.Errorf("error getting in-cluster rest config: %w", err)
 		}

 		// FIXME: Is there a way to discover this from within the cluster?
 		// For now, ocm and rosa behave the same, so hardcode either.
 		providerType = "ocm"
+		return

-	} else {
-		provider, err := providers.ClusterProvider()
+	}
+	provider, err := providers.ClusterProvider()

-		if err != nil {
-			return false, nil, fmt.Errorf("error getting cluster provisioning client: %v", err)
-		}
+	if err != nil {
+		return nil, "", fmt.Errorf("error getting cluster provisioning client: %w", err)
+	}
+	providerType = provider.Type()

-		restConfig, err = getRestConfig(provider, clusterID)
-		if err != nil {
-			logger.Printf("Error generating Rest Config: %v\n", err)
-			return false, nil, nil
-		}
+	restConfig, err = getRestConfig(provider, clusterID)
+	if err != nil {
+		return nil, "", fmt.Errorf("error generating rest config: %w", err)
+	}

+	return
+}

-		providerType = provider.Type()
+// PollClusterHealth looks at CVO data to determine if a cluster is alive/healthy or not
+// param clusterID: If specified, Provider will be discovered through OCM. If the empty string,
+// assume we are running in a cluster and use in-cluster REST config instead.
+func PollClusterHealth(clusterID string, logger *log.Logger) (status bool, failures []string, err error) {
+	logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
+
+	logger.Print("Polling Cluster Health...\n")
+
+	restConfig, providerType, err := ClusterConfig(clusterID)
+	if err != nil {
+		logger.Printf("Error getting cluster config: %v\n", err)
+		return false, nil, nil
+	}

 	kubeClient, err := kubernetes.NewForConfig(restConfig)
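For orientation, a minimal caller of the new ClusterConfig helper might look like the sketch below. This is illustrative only and not part of the PR; it assumes the osde2e import path shown in the diff and that an empty cluster ID means "use the in-cluster REST config".

```go
package main

import (
	"log"
	"os"

	"github.com/openshift/osde2e/pkg/common/cluster"
	"k8s.io/client-go/kubernetes"
)

// Sketch only: demonstrates the intended ClusterConfig -> clientset flow.
func main() {
	clusterID := os.Getenv("CLUSTER_ID") // may be empty when running inside the cluster

	restConfig, providerType, err := cluster.ClusterConfig(clusterID)
	if err != nil {
		log.Fatalf("could not resolve cluster config: %v", err)
	}
	log.Printf("resolved config via provider %q", providerType)

	kubeClient, err := kubernetes.NewForConfig(restConfig)
	if err != nil {
		log.Fatalf("could not build clientset: %v", err)
	}
	_ = kubeClient // hand this to the health checks, e.g. CheckHealthcheckJob below
}
```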
71 changes: 71 additions & 0 deletions pkg/common/cluster/healthchecks/healthcheckjob.go
@@ -0,0 +1,71 @@
+package healthchecks
+
+import (
+	"context"
+	"fmt"
+	"log"
+
+	"github.com/openshift/osde2e/pkg/common/logging"
+	batchv1 "k8s.io/api/batch/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/watch"
+	"k8s.io/client-go/kubernetes"
+)
+
+// CheckHealthcheckJob uses the `osd-cluster-ready` healthcheck job to determine cluster readiness. If the cluster
+// is not ready, it will return an error.
+func CheckHealthcheckJob(k8sClient *kubernetes.Clientset, ctx context.Context, logger *log.Logger) error {
+	logger = logging.CreateNewStdLoggerOrUseExistingLogger(logger)
+
+	logger.Print("Checking whether cluster is healthy before proceeding...")
+
+	bv1C := k8sClient.BatchV1()
+	namespace := "openshift-monitoring"
+	name := "osd-cluster-ready"
+	jobs, err := bv1C.Jobs(namespace).List(ctx, metav1.ListOptions{})
Review thread on the List call:

Member: I'm not super familiar with client-go, but why doesn't bv1C.Jobs(namespace).Get(ctx, name, metav1.GetOptions{}) work here?

Contributor Author: Sigh... It does. I just did it a dumb way because I too am not very familiar with client-go.

Contributor Author: Wait! On second thought, it does need to be a list. I need the resourceVersion field of the list itself in order to initiate the watch later. This allows me to detect the creation and deletion of jobs matching my criteria since I performed the list.

Member: Ah, I understand. If you use a zero resourceVersion, do you even need the initial get/list?

+	if err != nil {
+		return fmt.Errorf("failed listing jobs: %w", err)
+	}
+	for _, job := range jobs.Items {
+		if job.Name != name {
+			continue
+		}
+		if job.Status.Succeeded > 0 {
+			log.Println("Healthcheck job has already succeeded")
+			return nil
+		}
+		log.Println("Healthcheck job has not yet succeeded, watching...")
+	}
+	watcher, err := bv1C.Jobs(namespace).Watch(ctx, metav1.ListOptions{
+		ResourceVersion: jobs.ResourceVersion,
+		FieldSelector:   "metadata.name=osd-cluster-ready",
+	})
+	if err != nil {
+		return fmt.Errorf("failed watching job: %w", err)
+	}
+	for {
+		select {
+		case event := <-watcher.ResultChan():
+			switch event.Type {
+			case watch.Added:
+				fallthrough
+			case watch.Modified:
+				job := event.Object.(*batchv1.Job)
+				if job.Status.Succeeded > 0 {
Review thread on the success check:

Member: I think we need to handle "completed but failed" as well.

...But how we handle it is different before vs after openshift/configure-alertmanager-operator#143

  • Before: Failed Job means you fail here.
  • After: Failed Job will be deleted and reinstated (see below).

Need to think through this some more.

Contributor Author: I'll wait to hear your updated thoughts on how this should work before altering this logic.

Member: Talked this through with @jharrington22 just now. We're going to rework the readiness side so the picture is clear. Specific to this case, I think for osde2e's purposes "completed but failed" will translate to "cluster not ready, ain't ever gonna be".

But I'm also pretty sure we're going to use prometheus to carry that state... which would mean substantial changes in this PR. Stay tuned -- I hope to have something written up by tomorrow.

Member: Keep an eye on OSD-6646 starting here.

Member: "Completed but failed" should be treated as "cluster not ready, ain't ever gonna be". I'll post more info in a top-level comment.

+					return nil
+				}
+				if job.Status.Failed > 0 {
+					return fmt.Errorf("cluster readiness job failed")
+				}
+			case watch.Deleted:
+				return fmt.Errorf("cluster readiness job deleted before becoming ready (this should never happen)")
+			case watch.Error:
+				return fmt.Errorf("watch returned error event: %v", event)
+			default:
+				logger.Printf("Unrecognized event type while watching for healthcheck job updates: %v", event.Type)
+			}
+		case <-ctx.Done():
+			return fmt.Errorf("healtcheck watch context cancelled while still waiting for success")
+		}
+	}
+}
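As context for the Get-vs-List thread above: the initial List is there for the list-then-watch pattern, where the resourceVersion returned by the List seeds the Watch so that nothing happening between the two calls is missed. The following is a standalone sketch of that pattern, not part of the PR; it assumes a pre-built clientset and mirrors the namespace and job name used above.

```go
package healthcheckexample

import (
	"context"
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/kubernetes"
)

// waitForJobSuccess lists once to learn the collection's resourceVersion, then
// watches from that version so no create/update/delete between the List and
// the Watch is missed.
func waitForJobSuccess(ctx context.Context, client kubernetes.Interface, namespace, name string) error {
	jobs := client.BatchV1().Jobs(namespace)
	selector := fmt.Sprintf("metadata.name=%s", name)

	list, err := jobs.List(ctx, metav1.ListOptions{FieldSelector: selector})
	if err != nil {
		return fmt.Errorf("listing jobs: %w", err)
	}
	for _, j := range list.Items {
		if j.Status.Succeeded > 0 {
			return nil // already done before we started watching
		}
	}

	w, err := jobs.Watch(ctx, metav1.ListOptions{
		FieldSelector:   selector,
		ResourceVersion: list.ResourceVersion, // resume exactly where the List left off
	})
	if err != nil {
		return fmt.Errorf("starting watch: %w", err)
	}
	defer w.Stop()

	for {
		select {
		case event, ok := <-w.ResultChan():
			if !ok {
				return fmt.Errorf("watch channel closed before job %q succeeded", name)
			}
			if job, isJob := event.Object.(*batchv1.Job); isJob && event.Type != watch.Deleted {
				if job.Status.Succeeded > 0 {
					return nil
				}
				if job.Status.Failed > 0 {
					return fmt.Errorf("job %q failed", name)
				}
			}
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
```

Whether the List could be dropped in favor of a watch with a zero resourceVersion, as the reviewer asks, depends on how much history the API server's watch cache can replay; the List also covers the case where the job finished before the watch even starts.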
30 changes: 20 additions & 10 deletions pkg/common/config/config.go
@@ -175,6 +175,12 @@ var Tests = struct {
 	// Env: SKIP_CLUSTER_HEALTH_CHECKS
 	SkipClusterHealthChecks string

+	// ClusterHealthChecksTimeout defines the duration for which the harness will
+	// wait for the cluster to indicate it is healthy before cancelling the test
+	// run. This value should be formatted for use with time.ParseDuration.
+	// Env: CLUSTER_HEALTH_CHECKS_TIMEOUT
+	ClusterHealthChecksTimeout string
+
 	// MetricsBucket is the bucket that metrics data will be uploaded to.
 	// Env: METRICS_BUCKET
 	MetricsBucket string
@@ -184,16 +190,17 @@ var Tests = struct {
 	ServiceAccount string
 }{

-	PollingTimeout:            "tests.pollingTimeout",
-	GinkgoSkip:                "tests.ginkgoSkip",
-	GinkgoFocus:               "tests.focus",
-	TestsToRun:                "tests.testsToRun",
-	SuppressSkipNotifications: "tests.suppressSkipNotifications",
-	CleanRuns:                 "tests.cleanRuns",
-	OperatorSkip:              "tests.operatorSkip",
-	SkipClusterHealthChecks:   "tests.skipClusterHealthChecks",
-	MetricsBucket:             "tests.metricsBucket",
-	ServiceAccount:            "tests.serviceAccount",
+	PollingTimeout:             "tests.pollingTimeout",
+	GinkgoSkip:                 "tests.ginkgoSkip",
+	GinkgoFocus:                "tests.focus",
+	TestsToRun:                 "tests.testsToRun",
+	SuppressSkipNotifications:  "tests.suppressSkipNotifications",
+	CleanRuns:                  "tests.cleanRuns",
+	OperatorSkip:               "tests.operatorSkip",
+	SkipClusterHealthChecks:    "tests.skipClusterHealthChecks",
+	MetricsBucket:              "tests.metricsBucket",
+	ServiceAccount:             "tests.serviceAccount",
+	ClusterHealthChecksTimeout: "tests.clusterHealthChecksTimeout",
 }

 // Cluster config keys.
@@ -527,6 +534,9 @@ func init() {
 	viper.SetDefault(Tests.SkipClusterHealthChecks, false)
 	viper.BindEnv(Tests.OperatorSkip, "SKIP_CLUSTER_HEALTH_CHECKS")

+	viper.SetDefault(Tests.ClusterHealthChecksTimeout, "2h")
+	viper.BindEnv(Tests.ClusterHealthChecksTimeout, "CLUSTER_HEALTH_CHECKS_TIMEOUT")
+
 	viper.SetDefault(Tests.MetricsBucket, "osde2e-metrics")
 	viper.BindEnv(Tests.MetricsBucket, "METRICS_BUCKET")
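For reference, the new ClusterHealthChecksTimeout option is an ordinary time.ParseDuration string that ultimately bounds the health-check watch. A small standard-library sketch, not part of the PR, of how such a value becomes a context deadline:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	// CLUSTER_HEALTH_CHECKS_TIMEOUT accepts anything time.ParseDuration understands,
	// e.g. "2h", "90m", or "1h30m". "2h" mirrors the viper default above.
	raw := "2h"

	timeout, err := time.ParseDuration(raw)
	if err != nil {
		fmt.Printf("invalid timeout %q: %v\n", raw, err)
		return
	}

	// The parsed duration becomes the deadline for the health-check context.
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	deadline, _ := ctx.Deadline()
	fmt.Printf("health checks must finish by %s\n", deadline.Format(time.RFC3339))
}
```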
24 changes: 23 additions & 1 deletion pkg/e2e/e2e.go
@@ -4,6 +4,7 @@ package e2e
 import (
 	"bytes"
 	"compress/gzip"
+	"context"
 	"encoding/json"
 	"encoding/xml"
 	"fmt"
@@ -19,6 +20,7 @@ import (
 	"github.com/hpcloud/tail"
 	junit "github.com/joshdk/go-junit"
 	vegeta "github.com/tsenart/vegeta/lib"
+	"k8s.io/client-go/kubernetes"

 	pd "github.com/PagerDuty/go-pagerduty"
 	"github.com/onsi/ginkgo"
@@ -31,6 +33,7 @@ import (
 	"github.com/openshift/osde2e/pkg/common/aws"
 	"github.com/openshift/osde2e/pkg/common/cluster"
 	clusterutil "github.com/openshift/osde2e/pkg/common/cluster"
+	"github.com/openshift/osde2e/pkg/common/cluster/healthchecks"
 	"github.com/openshift/osde2e/pkg/common/clusterproperties"
 	"github.com/openshift/osde2e/pkg/common/config"
 	"github.com/openshift/osde2e/pkg/common/events"
@@ -121,13 +124,32 @@ func beforeSuite() bool {
 		log.Printf("Error while adding upgrade version property to cluster via OCM: %v", err)
 	}

-	err = clusterutil.WaitForClusterReady(cluster.ID(), nil)
+	clusterConfig, _, err := clusterutil.ClusterConfig(cluster.ID())
+	if err != nil {
+		log.Printf("Failed looking up cluster config for healthcheck: %v", err)
+	}
+	kubeClient, err := kubernetes.NewForConfig(clusterConfig)
+	if err != nil {
+		log.Printf("Error generating Kube Clientset: %v\n", err)
+	}
+	duration, err := time.ParseDuration(viper.GetString(config.Tests.ClusterHealthChecksTimeout))
+	if err != nil {
+		log.Printf("Failed parsing health check timeout: %v", err)
+		return false
Review thread on the timeout error handling:

Member: Cool.

Latent, but it looks like the other two conditions above might be missing return false?

Contributor Author: Yup, I am dumb. I had tunnel vision while making this change.

+	}
+	ctx, cancel := context.WithTimeout(context.Background(), duration)
+	defer cancel()
+	if viper.GetString(config.Tests.SkipClusterHealthChecks) != "false" {
+		log.Println("WARNING: Skipping cluster health checks is no longer supported, as they no longer introduce delay into the build. Ignoring your request to skip them.")
+	}
+	err = healthchecks.CheckHealthcheckJob(kubeClient, ctx, nil)
 	events.HandleErrorWithEvents(err, events.HealthCheckSuccessful, events.HealthCheckFailed)
 	if err != nil {
 		log.Printf("Cluster failed health check: %v", err)
 		getLogs()
 		return false
 	}
+	log.Println("Cluster is healthy and ready for testing")

 	if len(viper.GetString(config.Addons.IDs)) > 0 {
 		if viper.GetString(config.Provider) != "mock" {
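Following up on the review thread about the missing return false branches: below is a hypothetical helper, not the code that was merged, showing the shape of beforeSuite's health-check section once every failure path short-circuits instead of continuing with a nil config or client. The names ensureClusterReady and check are illustrative only.

```go
package e2esketch

import (
	"context"
	"log"
	"time"

	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

// ensureClusterReady sketches the error handling the reviewer asked for:
// every failure returns false immediately rather than falling through.
func ensureClusterReady(
	clusterConfig *rest.Config,
	rawTimeout string,
	check func(context.Context, *kubernetes.Clientset) error, // e.g. a thin wrapper over healthchecks.CheckHealthcheckJob
) bool {
	kubeClient, err := kubernetes.NewForConfig(clusterConfig)
	if err != nil {
		log.Printf("Error generating Kube Clientset: %v", err)
		return false // without this, the health check would be handed a nil client
	}

	duration, err := time.ParseDuration(rawTimeout)
	if err != nil {
		log.Printf("Failed parsing health check timeout: %v", err)
		return false
	}

	ctx, cancel := context.WithTimeout(context.Background(), duration)
	defer cancel()

	if err := check(ctx, kubeClient); err != nil {
		log.Printf("Cluster failed health check: %v", err)
		return false
	}
	log.Println("Cluster is healthy and ready for testing")
	return true
}
```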