From c9f1f45e659a085fa0f83b6c17fcddd03e06ea1f Mon Sep 17 00:00:00 2001 From: Eric Fried Date: Tue, 9 Mar 2021 11:46:43 -0600 Subject: [PATCH] OSD-6646: Only do health checks This commit removes all "max cluster age" and silencing logic, which will [henceforth be done in configure-alertmanager-operator](https://github.com/openshift/configure-alertmanager-operator/pull/147). OSD-6646 --- README.md | 27 ++--- deploy/60-osd-ready.Job.yaml | 8 +- main.go | 125 ----------------------- silence/silence.go | 187 ----------------------------------- 4 files changed, 9 insertions(+), 338 deletions(-) delete mode 100644 silence/silence.go diff --git a/README.md b/README.md index 9a15d76..589d8f2 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,22 @@ -# OSD Cluster Readiness Job +# OSD Cluster Readiness -- [OSD Cluster Readiness Job](#osd-cluster-readiness-job) +- [OSD Cluster Readiness](#osd-cluster-readiness) - [Deploying](#deploying) - [Build the Image](#build-the-image) - - [Deploy the Job](#deploy-the-job) + - [Deploy](#deploy) - [Example](#example) - [Tunables](#tunables) - - [`MAX_CLUSTER_AGE_MINUTES`](#max_cluster_age_minutes) - [`CLEAN_CHECK_RUNS`](#clean_check_runs) - [`CLEAN_CHECK_INTERVAL_SECONDS`](#clean_check_interval_seconds) - [`FAILED_CHECK_INTERVAL_SECONDS`](#failed_check_interval_seconds) - [Keeping up with osde2e](#keeping-up-with-osde2e) - [TO DO](#to-do) -This job silences alerts while Day2 configuration is loaded onto a cluster at initial provisioning, allowing it to not page on-call SREs for normal operations within the cluster. 
- -We poll cluster health using [osde2e health checks](https://github.com/openshift/osde2e/blob/041355675304a7aa371b7fbeea313001036feb75/pkg/common/cluster/clusterutil.go#L211) +This program polls cluster health using [osde2e health checks](https://github.com/openshift/osde2e/blob/041355675304a7aa371b7fbeea313001036feb75/pkg/common/cluster/clusterutil.go#L211) once a minute (this is [configurable](#failed_check_interval_seconds)), until they all report healthy 20 times in a row ([configurable](#clean_check_runs)) on 30s intervals ([configurable](#clean_check_interval_seconds)). -By default, we will clear any active silence and exit successfully if the cluster is (or becomes) more than two hours old ([configurable](#max_cluster_age_minutes)). - -We base the duration of the silence on the estimated maximum time we think it should take for the full health check sequence to run, extending it as necessary. +Once the health checks have passed the required number of times, we exit successfully; the previous "max cluster age" bail-out and all alert-silencing logic are now handled by [configure-alertmanager-operator](https://github.com/openshift/configure-alertmanager-operator). ## Deploying @@ -38,7 +33,9 @@ If you wish to push to a specific registry, repository, or image name, you may o For example, for development purposes, you may wish to `export IMAGE_USER=my_quay_user`. See the [Makefile](Makefile) for the default values. -### Deploy the Job +### Deploy +**NOTE:** In OSD, this program is managed by [configure-alertmanager-operator](https://github.com/openshift/configure-alertmanager-operator) via a Job it [defines internally](https://github.com/openshift/configure-alertmanager-operator/blob/master/pkg/readiness/defs/osd-cluster-ready.Job.yaml). +In order to test locally, that needs to be disabled. (**TODO: How?**) ``` make deploy @@ -80,11 +77,6 @@ The following environment variables can be set in the container, e.g. by editing Remember that the values must be strings; so numeric values must be quoted. 
-### `MAX_CLUSTER_AGE_MINUTES` -The maximum age of the cluster, in minutes, after which we will clear any silences and exit "successfully". - -**Default:** `"120"` (two hours) - ### `CLEAN_CHECK_RUNS` The number of consecutive health checks that must succeed before we declare the cluster truly healthy. @@ -117,8 +109,5 @@ Don't forget to [build](#deploying-the-image) and [test](#deploying-the-job) wit # TO DO -- [x] Look for existing active silences before creating a new one - [x] Implement _actual_ healthchecks (steal them from osde2e) to determine cluster stability -- [x] Make the default silence expiry shorter; and extend it when health checks fail ([OSD-6384](https://issues.redhat.com/browse/OSD-6384)). -- [ ] Find out if there's a better and more secure way to talk to the alertmanager API using oauth and serviceaccount tokens. - [ ] Make [tunables](#tunables) configurable via `make deploy`. diff --git a/deploy/60-osd-ready.Job.yaml b/deploy/60-osd-ready.Job.yaml index 9bc415d..3e5bf22 100644 --- a/deploy/60-osd-ready.Job.yaml +++ b/deploy/60-osd-ready.Job.yaml @@ -9,13 +9,7 @@ spec: name: osd-cluster-ready spec: containers: - # TODO: Remove this override once - # https://bugzilla.redhat.com/show_bug.cgi?id=1921413 - # is resolved - - env: - - name: MAX_CLUSTER_AGE_MINUTES - value: "240" - name: osd-cluster-ready + - name: osd-cluster-ready image: quay.io/openshift-sre/osd-cluster-ready imagePullPolicy: Always command: ["/root/main"] diff --git a/main.go b/main.go index 16cfe5c..5afab14 100644 --- a/main.go +++ b/main.go @@ -4,24 +4,13 @@ import ( "fmt" "log" "os" - "os/exec" "strconv" - "strings" "time" "github.com/openshift/osde2e/pkg/common/cluster" - - "github.com/openshift/osd-cluster-ready/silence" ) const ( - // Maximum cluster age, in minutes, before we'll start ignoring it. - // This is in case the Job gets deployed on an already-initialized but unhealthy cluster: - // we don't want to silence alerts in that case. 
- maxClusterAgeKey = "MAX_CLUSTER_AGE_MINUTES" - // By default, ignore clusters older than two hours - maxClusterAgeDefault = 2 * 60 - // The number of consecutive health checks that must succeed before we declare the cluster // truly healthy. cleanCheckRunsKey = "CLEAN_CHECK_RUNS" @@ -37,20 +26,6 @@ const ( ) func main() { - // New Alert Manager Silence Request - // TODO: Handle multiple silences being active - silenceReq := silence.New() - - clusterBirth, err := getClusterCreationTime() - if err != nil { - log.Fatal(err) - } - - maxClusterAge, err := getEnvInt(maxClusterAgeKey, maxClusterAgeDefault) - if err != nil { - log.Fatal(err) - } - cleanCheckRuns, err := getEnvInt(cleanCheckRunsKey, cleanCheckRunsDefault) if err != nil { log.Fatal(err) @@ -66,82 +41,12 @@ func main() { log.Fatal(err) } - // maxHealthCheckDuration is a heuristic for how long the full health check - // loop should take. This is used to push out the expiry of active silences to make - // sure they don't expire while we're in that loop; so err on the high side: - // - One run of health checks is pretty fast -- anecdotally sub-second -- but give it - // ten seconds. - // - Add failedCheckInterval since we do that sleep *after* a failed health check. - // - Add another two minutes of fudge factor. - maxHealthCheckDuration := time.Duration(cleanCheckRuns * (cleanCheckInterval + 10) + failedCheckInterval + 120) * time.Second - log.Printf("Using a silence expiry window of %s.", maxHealthCheckDuration) - for { - - // TODO: If silenceReq has an ID here, no need to look for another active silence - silenceReq, err := silenceReq.FindExisting() - if err != nil { - log.Fatal(err) - } - - if clusterTooOld(clusterBirth, maxClusterAge) { - log.Printf("Cluster is older than %d minutes. 
Exiting Cleanly.", maxClusterAge) - if silenceReq.Active() { - // TODO: There might be more than one silence to remove - err = silenceReq.Remove() - if err != nil { - log.Fatal(err) - } - } - os.Exit(0) - } - - if !silenceReq.Active() { - // If there are no existing active silences, create one. - // We only get here in two scenarios: - // - This is the first iteration of the loop. The vast majority of the time - // this is the first ever run of the job, and it's happening shortly after - // the birth of the cluster. - // - The previous iteration took longer than maxHealthCheckDuration - // and the previously-created silence expired. - _, err = silenceReq.Build(maxHealthCheckDuration).Send() - if err != nil { - log.Fatal(err) - } - - } else { - // If there is an existing silence, ensure it's not going to expire before - // our healthchecks are complete. - willExpire, err := silenceReq.WillExpireBy(maxHealthCheckDuration) - if err != nil { - log.Fatal(err) - } - if willExpire { - log.Printf("Silence expires in less than %s. Extending.", maxHealthCheckDuration) - // Creating a new silence automatically expires the existing one(s). - _, err = silenceReq.Build(maxHealthCheckDuration).Send() - if err != nil { - log.Fatal(err) - } - } - } - healthy, err := isClusterHealthy(cleanCheckRuns, cleanCheckInterval) if err != nil { log.Fatal(err) } - if healthy { - // Use this to log how much time was left in the active silence. This can - // help us fine-tune maxHealthCheckDuration in the future. - // TODO: If this is negative, it has already expired and we don't have to - // remove it. But WillExpireBy would have to tell us that somehow. 
- _, err = silenceReq.WillExpireBy(maxHealthCheckDuration) - // TODO: There might be more than one silence to remove - err = silenceReq.Remove() - if err != nil { - log.Fatal(err) - } os.Exit(0) } @@ -152,31 +57,6 @@ func main() { // UNREACHED } -func getClusterCreationTime() (time.Time, error) { - for i := 1; i <= 300; i++ { // try once a second or so for 5 minutes-ish - ex := "oc exec -n openshift-monitoring prometheus-k8s-0 -c prometheus -- curl localhost:9090/api/v1/query --silent --data-urlencode 'query=cluster_version' | jq -r '.data.result[] | select(.metric.type==\"initial\") | .value[1]'" - promCmd := exec.Command("bash", "-c", ex) - promCmd.Stderr = os.Stderr - resp, err := promCmd.Output() - if err != nil { - log.Printf("Attempt %d to query for cluster age failed. %v", i, err) - time.Sleep(1 * time.Second) - continue - } - respTrimmed := strings.TrimSuffix(string(resp), "\n") - initTime, err := strconv.ParseInt(respTrimmed, 10, 64) - if err != nil { - log.Printf("Error casting Epoch time to int. %s\nErr: %v", resp, err) - time.Sleep(1 * time.Second) - continue - } - clusterCreation := time.Unix(initTime, 0) - log.Printf("Cluster Created %v", clusterCreation.UTC()) - return clusterCreation, nil - } - return time.Unix(0, 0), fmt.Errorf("there was an error getting cluster creation time") -} - // getEnvInt returns the integer value of the environment variable with the specified `key`. // If the env var is unspecified/empty, the `def` value is returned. // The error is non-nil if the env var is nonempty but cannot be parsed as an int. @@ -198,11 +78,6 @@ func getEnvInt(key string, def int) (int, error) { return intVal, nil } -func clusterTooOld(clusterBirth time.Time, maxAgeMinutes int) bool { - maxAge := time.Now().Add(time.Duration(-maxAgeMinutes) * time.Minute) - return clusterBirth.Before(maxAge) -} - // doHealthCheck performs one instance of the health check. // Logs what happens. // Returns (true, err) if all health checks succeeded. 
diff --git a/silence/silence.go b/silence/silence.go deleted file mode 100644 index 11b271a..0000000 --- a/silence/silence.go +++ /dev/null @@ -1,187 +0,0 @@ -package silence - -import ( - "encoding/json" - "fmt" - "log" - "os" - "os/exec" - "time" -) - -const createdBy = "OSD Cluster Readiness Job" - -// Request represents an Alertmanager silence request object -type Request struct { - ID string `json:"id"` - Status silenceState `json:"status"` - Matchers []matcher `json:"matchers"` - StartsAt string `json:"startsAt"` - EndsAt string `json:"endsAt"` - CreatedBy string `json:"createdBy"` - Comment string `json:"comment"` -} - -type silenceState struct { - State string `json:"state"` -} - -type matcher struct { - Name string `json:"name"` - Value string `json:"value"` - IsRegex bool `json:"isRegex"` -} - -type getSilenceResponse []*Request - -type silenceResponse struct { - ID string `json:"silenceID"` -} - -// New returns a new silence request object -func New() *Request { - return &Request{} -} - -// FindExisting looks for an existing, active silence that was created by us. If found, -// its ID is returned; otherwise the empty string is returned. The latter is not an -// error condition. -// TODO: Handle muliple silences being active, currently we'll return the first one we fine -// that is active and created by `createdBy` -func (sr *Request) FindExisting() (*Request, error) { - for i := 1; i <= 300; i++ { // try once a second or so for 5-ish minutes - log.Printf("Checking for silences") - cmdstr := "oc exec -n openshift-monitoring alertmanager-main-0 -c alertmanager -- curl --silent localhost:9093/api/v2/silences -X GET" - silenceGetCmd := exec.Command("bash", "-c", cmdstr) - silenceGetCmd.Stderr = os.Stderr - resp, err := silenceGetCmd.Output() - if err != nil { - log.Printf("Attempt %d to query for existing silences failed. 
%v", i, err) - time.Sleep(1 * time.Second) - continue - } - var silences getSilenceResponse - err = json.Unmarshal(resp, &silences) - if err != nil { - log.Printf("There was an error unmarshalling get silence response") - return sr, err - } - - if len(silences) == 0 { - log.Printf("No Silences Present") - return sr, nil - } - - for _, silence := range silences { - if silence.CreatedBy != createdBy { - continue - } - - if !silence.Active() { - log.Printf("Silence %s is not active.", silence.ID) - continue - } - - sr = silence - log.Printf("Found silence created by job: %s", sr.ID) - - return sr, nil - } - - log.Printf("No silences created by job found.") - return sr, nil - } - - return sr, fmt.Errorf("unable to get a list of existing silences") -} - -func (sr *Request) Build(expiryPeriod time.Duration) *Request { - // Create the Silence - now := time.Now().UTC() - end := now.Add(expiryPeriod) - - allMatcher := matcher{} - allMatcher.Name = "severity" - allMatcher.Value = "info|warning|critical" - allMatcher.IsRegex = true - - sr.Matchers = []matcher{allMatcher} - sr.StartsAt = now.Format(time.RFC3339) - sr.EndsAt = end.Format(time.RFC3339) - sr.CreatedBy = createdBy - sr.Comment = "Created By the Cluster Readiness Job to silence any alerts during normal provisioning" - - return sr -} - -func (sr *Request) Send() (*silenceResponse, error) { - - silenceJSON, err := json.Marshal(sr) - if err != nil { - return nil, fmt.Errorf("There was an error marshalling JSON: %v", silenceJSON) - } - - for { - // Attempt to run once every 30 seconds until this succeeds - // to account for if the alertmanager is not ready before - // we start trying to silence it. 
- silenceCmd := exec.Command("oc", "exec", "-n", "openshift-monitoring", "alertmanager-main-0", "-c", "alertmanager", "--", "curl", "localhost:9093/api/v2/silences", "--silent", "-X", "POST", "-H", "Content-Type: application/json", "--data", string(silenceJSON)) - silenceCmd.Stderr = os.Stderr - resp, err := silenceCmd.Output() - if err != nil { - log.Printf("Silence Failed. %v", err) - time.Sleep(30 * time.Second) - continue - } - var silenceResp silenceResponse - e := json.Unmarshal(resp, &silenceResp) - if e != nil { - return nil, fmt.Errorf("There was an error Unmarshalling response: %v", e) - } - log.Printf("Silence Created with ID %s.", silenceResp.ID) - return &silenceResp, nil - } -} - -// Remove deletes the silence with the given sr.ID -func (sr *Request) Remove() error { - log.Printf("Removing Silence %s\n", sr.ID) - for i := 0; i < 5; i++ { - // Attempt up to 5 times to unsilence the cluster - unsilenceCommand := exec.Command("oc", "exec", "-n", "openshift-monitoring", "alertmanager-main-0", "-c", "alertmanager", "--", "curl", fmt.Sprintf("localhost:9093/api/v2/silence/%s", sr.ID), "--silent", "-X", "DELETE") - unsilenceCommand.Stderr = os.Stderr - err := unsilenceCommand.Run() - if err != nil { - log.Printf("Attempt %d to unsilence failed. %v", i, err) - time.Sleep(1 * time.Second) - continue - } - log.Println("Silence Successfully Removed.") - return nil - } - return fmt.Errorf("there was an error unsilencing the cluster") -} - -// WillExpireBy returns: -// - bool: true if the remaining time on the AlertManager Silence is less than the expiryPeriod. -// - error: nil unless the silence request's end time can't be parsed. 
-func (sr *Request) WillExpireBy(expiryPeriod time.Duration) (bool, error) { - // Parse end time of Alertmanager Silence - end, err := time.Parse(time.RFC3339, sr.EndsAt) - if err != nil { - return false, err - } - - // Find the remaining time left on the silence - remaining := end.Sub(time.Now()) - - log.Printf("Time remaining on active silence: %s\n", remaining) - - // Return bool if the remaining time is less than the expiryPeriod - return remaining < expiryPeriod, nil -} - -// Active returns a bool if the silence is Active -func (sr *Request) Active() bool { - return sr.ID != "" && sr.Status.State == "active" -}