
OSD-6646: Only do health checks #7

Merged 1 commit on Mar 18, 2021
README.md — 27 changes: 8 additions & 19 deletions
@@ -1,27 +1,22 @@
# OSD Cluster Readiness Job
# OSD Cluster Readiness

- [OSD Cluster Readiness Job](#osd-cluster-readiness-job)
- [OSD Cluster Readiness](#osd-cluster-readiness)
- [Deploying](#deploying)
- [Build the Image](#build-the-image)
- [Deploy the Job](#deploy-the-job)
- [Deploy](#deploy)
- [Example](#example)
- [Tunables](#tunables)
- [`MAX_CLUSTER_AGE_MINUTES`](#max_cluster_age_minutes)
- [`CLEAN_CHECK_RUNS`](#clean_check_runs)
- [`CLEAN_CHECK_INTERVAL_SECONDS`](#clean_check_interval_seconds)
- [`FAILED_CHECK_INTERVAL_SECONDS`](#failed_check_interval_seconds)
- [Keeping up with osde2e](#keeping-up-with-osde2e)
- [TO DO](#to-do)

This job silences alerts while Day2 configuration is loaded onto a cluster at initial provisioning, allowing it to not page on-call SREs for normal operations within the cluster.

We poll cluster health using [osde2e health checks](https://github.com/openshift/osde2e/blob/041355675304a7aa371b7fbeea313001036feb75/pkg/common/cluster/clusterutil.go#L211)
This program polls cluster health using [osde2e health checks](https://github.com/openshift/osde2e/blob/041355675304a7aa371b7fbeea313001036feb75/pkg/common/cluster/clusterutil.go#L211)
once a minute (this is [configurable](#failed_check_interval_seconds)),
until they all report healthy 20 times in a row ([configurable](#clean_check_runs))
on 30s intervals ([configurable](#clean_check_interval_seconds)).
By default, we will clear any active silence and exit successfully if the cluster is (or becomes) more than two hours old ([configurable](#max_cluster_age_minutes)).

We base the duration of the silence on the estimated maximum time we think it should take for the full health check sequence to run, extending it as necessary.
By default, we will exit successfully if the cluster is (or becomes) more than two hours old ([configurable](#max_cluster_age_minutes)).
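The polling behavior described above has roughly the following shape. This is a sketch only, not the program's actual code: `waitForHealthy` and its signature are invented for illustration, and the simulated check stands in for the osde2e health checks.

```go
package main

import (
	"fmt"
	"time"
)

// waitForHealthy polls check() until it reports healthy cleanRuns times in
// a row. A failure resets the counter and sleeps failedInterval before the
// next attempt; consecutive successes are spaced cleanInterval apart.
func waitForHealthy(check func() bool, cleanRuns int, cleanInterval, failedInterval time.Duration) {
	consecutive := 0
	for consecutive < cleanRuns {
		if check() {
			consecutive++
			if consecutive < cleanRuns {
				time.Sleep(cleanInterval)
			}
		} else {
			consecutive = 0
			time.Sleep(failedInterval)
		}
	}
}

func main() {
	calls := 0
	// Simulated health check: fails twice, then succeeds from the third call on.
	check := func() bool {
		calls++
		return calls > 2
	}
	waitForHealthy(check, 3, time.Millisecond, time.Millisecond)
	fmt.Printf("health check called %d times\n", calls)
}
```

With the defaults above (20 clean runs on 30-second intervals), a cluster that is healthy from the start exits the loop after roughly ten minutes.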

## Deploying

@@ -38,7 +33,9 @@ If you wish to push to a specific registry, repository, or image name, you may o
For example, for development purposes, you may wish to `export IMAGE_USER=my_quay_user`.
See the [Makefile](Makefile) for the default values.

### Deploy the Job
### Deploy
**NOTE:** In OSD, this program is managed by [configure-alertmanager-operator](https://github.com/openshift/configure-alertmanager-operator) via a Job it [defines internally](https://github.com/openshift/configure-alertmanager-operator/blob/master/pkg/readiness/defs/osd-cluster-ready.Job.yaml).
In order to test locally, that needs to be disabled. (**TODO: How?**)

```
make deploy
```
@@ -80,11 +77,6 @@ The following environment variables can be set in the container, e.g. by editing

Remember that the values must be strings; so numeric values must be quoted.
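Since the tunables are plain environment variables, overriding one means editing the Job's container spec. A hypothetical fragment, using the documented default values quoted as strings:

```yaml
# Illustrative only: container env overrides in the Job manifest.
# Numeric values must be quoted, since env var values are strings.
env:
  - name: CLEAN_CHECK_RUNS
    value: "20"
  - name: CLEAN_CHECK_INTERVAL_SECONDS
    value: "30"
  - name: FAILED_CHECK_INTERVAL_SECONDS
    value: "60"
```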

### `MAX_CLUSTER_AGE_MINUTES`
The maximum age of the cluster, in minutes, after which we will clear any silences and exit "successfully".

**Default:** `"120"` (two hours)

### `CLEAN_CHECK_RUNS`
The number of consecutive health checks that must succeed before we declare the cluster truly healthy.

@@ -117,8 +109,5 @@ Don't forget to [build](#deploying-the-image) and [test](#deploying-the-job) wit

# TO DO

- [x] Look for existing active silences before creating a new one
- [x] Implement _actual_ healthchecks (steal them from osde2e) to determine cluster stability
- [x] Make the default silence expiry shorter; and extend it when health checks fail ([OSD-6384](https://issues.redhat.com/browse/OSD-6384)).
- [ ] Find out if there's a better and more secure way to talk to the alertmanager API using oauth and serviceaccount tokens.
- [ ] Make [tunables](#tunables) configurable via `make deploy`.
deploy/60-osd-ready.Job.yaml — 8 changes: 1 addition & 7 deletions
@@ -9,13 +9,7 @@ spec:
name: osd-cluster-ready
spec:
containers:
# TODO: Remove this override once
# https://bugzilla.redhat.com/show_bug.cgi?id=1921413
# is resolved
- env:
- name: MAX_CLUSTER_AGE_MINUTES
value: "240"
name: osd-cluster-ready
- name: osd-cluster-ready
image: quay.io/openshift-sre/osd-cluster-ready
imagePullPolicy: Always
command: ["/root/main"]
main.go — 125 changes: 0 additions & 125 deletions
@@ -4,24 +4,13 @@ import (
"fmt"
"log"
"os"
"os/exec"
"strconv"
"strings"
"time"

"github.com/openshift/osde2e/pkg/common/cluster"

"github.com/openshift/osd-cluster-ready/silence"
)

const (
// Maximum cluster age, in minutes, before we'll start ignoring it.
// This is in case the Job gets deployed on an already-initialized but unhealthy cluster:
// we don't want to silence alerts in that case.
maxClusterAgeKey = "MAX_CLUSTER_AGE_MINUTES"
// By default, ignore clusters older than two hours
maxClusterAgeDefault = 2 * 60

// The number of consecutive health checks that must succeed before we declare the cluster
// truly healthy.
cleanCheckRunsKey = "CLEAN_CHECK_RUNS"
@@ -37,20 +26,6 @@
)

func main() {
// New Alert Manager Silence Request
// TODO: Handle multiple silences being active
silenceReq := silence.New()

clusterBirth, err := getClusterCreationTime()
if err != nil {
log.Fatal(err)
}

maxClusterAge, err := getEnvInt(maxClusterAgeKey, maxClusterAgeDefault)
if err != nil {
log.Fatal(err)
}

cleanCheckRuns, err := getEnvInt(cleanCheckRunsKey, cleanCheckRunsDefault)
if err != nil {
log.Fatal(err)
@@ -66,82 +41,12 @@
log.Fatal(err)
}

// maxHealthCheckDuration is a heuristic for how long the full health check
// loop should take. This is used to push out the expiry of active silences to make
// sure they don't expire while we're in that loop; so err on the high side:
// - One run of health checks is pretty fast -- anecdotally sub-second -- but give it
// ten seconds.
// - Add failedCheckInterval since we do that sleep *after* a failed health check.
// - Add another two minutes of fudge factor.
maxHealthCheckDuration := time.Duration(cleanCheckRuns * (cleanCheckInterval + 10) + failedCheckInterval + 120) * time.Second
log.Printf("Using a silence expiry window of %s.", maxHealthCheckDuration)

for {

// TODO: If silenceReq has an ID here, no need to look for another active silence
silenceReq, err := silenceReq.FindExisting()
if err != nil {
log.Fatal(err)
}

if clusterTooOld(clusterBirth, maxClusterAge) {
log.Printf("Cluster is older than %d minutes. Exiting Cleanly.", maxClusterAge)
if silenceReq.Active() {
// TODO: There might be more than one silence to remove
err = silenceReq.Remove()
if err != nil {
log.Fatal(err)
}
}
os.Exit(0)
}

if !silenceReq.Active() {
// If there are no existing active silences, create one.
// We only get here in two scenarios:
// - This is the first iteration of the loop. The vast majority of the time
// this is the first ever run of the job, and it's happening shortly after
// the birth of the cluster.
// - The previous iteration took longer than maxHealthCheckDuration
// and the previously-created silence expired.
_, err = silenceReq.Build(maxHealthCheckDuration).Send()
if err != nil {
log.Fatal(err)
}

} else {
// If there is an existing silence, ensure it's not going to expire before
// our healthchecks are complete.
willExpire, err := silenceReq.WillExpireBy(maxHealthCheckDuration)
if err != nil {
log.Fatal(err)
}
if willExpire {
log.Printf("Silence expires in less than %s. Extending.", maxHealthCheckDuration)
// Creating a new silence automatically expires the existing one(s).
_, err = silenceReq.Build(maxHealthCheckDuration).Send()
if err != nil {
log.Fatal(err)
}
}
}

healthy, err := isClusterHealthy(cleanCheckRuns, cleanCheckInterval)
if err != nil {
log.Fatal(err)
}

if healthy {
// Use this to log how much time was left in the active silence. This can
// help us fine-tune maxHealthCheckDuration in the future.
// TODO: If this is negative, it has already expired and we don't have to
// remove it. But WillExpireBy would have to tell us that somehow.
_, err = silenceReq.WillExpireBy(maxHealthCheckDuration)
// TODO: There might be more than one silence to remove
err = silenceReq.Remove()
if err != nil {
log.Fatal(err)
}
os.Exit(0)
}

@@ -152,31 +57,6 @@ func main() {
// UNREACHED
}

func getClusterCreationTime() (time.Time, error) {
for i := 1; i <= 300; i++ { // try once a second or so for 5 minutes-ish
ex := "oc exec -n openshift-monitoring prometheus-k8s-0 -c prometheus -- curl localhost:9090/api/v1/query --silent --data-urlencode 'query=cluster_version' | jq -r '.data.result[] | select(.metric.type==\"initial\") | .value[1]'"
promCmd := exec.Command("bash", "-c", ex)
promCmd.Stderr = os.Stderr
resp, err := promCmd.Output()
if err != nil {
log.Printf("Attempt %d to query for cluster age failed. %v", i, err)
time.Sleep(1 * time.Second)
continue
}
respTrimmed := strings.TrimSuffix(string(resp), "\n")
initTime, err := strconv.ParseInt(respTrimmed, 10, 64)
if err != nil {
log.Printf("Error casting Epoch time to int. %s\nErr: %v", resp, err)
time.Sleep(1 * time.Second)
continue
}
clusterCreation := time.Unix(initTime, 0)
log.Printf("Cluster Created %v", clusterCreation.UTC())
return clusterCreation, nil
}
return time.Unix(0, 0), fmt.Errorf("there was an error getting cluster creation time")
}

// getEnvInt returns the integer value of the environment variable with the specified `key`.
// If the env var is unspecified/empty, the `def` value is returned.
// The error is non-nil if the env var is nonempty but cannot be parsed as an int.
@@ -198,11 +78,6 @@ func getEnvInt(key string, def int) (int, error) {
return intVal, nil
}

func clusterTooOld(clusterBirth time.Time, maxAgeMinutes int) bool {
maxAge := time.Now().Add(time.Duration(-maxAgeMinutes) * time.Minute)
return clusterBirth.Before(maxAge)
}

// doHealthCheck performs one instance of the health check.
// Logs what happens.
// Returns (true, err) if all health checks succeeded.