From b3ecbf9040ad0a8e77a51b976b8e6fbfa040bfe0 Mon Sep 17 00:00:00 2001 From: Eric Fried Date: Tue, 9 Mar 2021 11:46:43 -0600 Subject: [PATCH] OSD-6646: Only do health checks This commit removes all "max cluster age" and silencing logic, which will [henceforth be done in configure-alertmanager-operator](https://github.com/openshift/configure-alertmanager-operator/pull/143). OSD-6646 --- README.md | 17 +---- main.go | 125 ------------------------------ silence/silence.go | 187 --------------------------------------------- 3 files changed, 2 insertions(+), 327 deletions(-) delete mode 100644 silence/silence.go diff --git a/README.md b/README.md index 9a15d76..183391b 100644 --- a/README.md +++ b/README.md @@ -6,22 +6,17 @@ - [Deploy the Job](#deploy-the-job) - [Example](#example) - [Tunables](#tunables) - - [`MAX_CLUSTER_AGE_MINUTES`](#max_cluster_age_minutes) - [`CLEAN_CHECK_RUNS`](#clean_check_runs) - [`CLEAN_CHECK_INTERVAL_SECONDS`](#clean_check_interval_seconds) - [`FAILED_CHECK_INTERVAL_SECONDS`](#failed_check_interval_seconds) - [Keeping up with osde2e](#keeping-up-with-osde2e) - [TO DO](#to-do) -This job silences alerts while Day2 configuration is loaded onto a cluster at initial provisioning, allowing it to not page on-call SREs for normal operations within the cluster. - -We poll cluster health using [osde2e health checks](https://github.com/openshift/osde2e/blob/041355675304a7aa371b7fbeea313001036feb75/pkg/common/cluster/clusterutil.go#L211) +This job polls cluster health using [osde2e health checks](https://github.com/openshift/osde2e/blob/041355675304a7aa371b7fbeea313001036feb75/pkg/common/cluster/clusterutil.go#L211) once a minute (this is [configurable](#failed_check_interval_seconds)), until they all report healthy 20 times in a row ([configurable](#clean_check_runs)) on 30s intervals ([configurable](#clean_check_interval_seconds)). -By default, we will clear any active silence and exit successfully if the cluster is (or becomes) more than two hours old ([configurable](#max_cluster_age_minutes)). - -We base the duration of the silence on the estimated maximum time we think it should take for the full health check sequence to run, extending it as necessary. +By default, we will exit successfully if the cluster is (or becomes) more than two hours old ([configurable](#max_cluster_age_minutes)). ## Deploying @@ -80,11 +75,6 @@ The following environment variables can be set in the container, e.g. by editing Remember that the values must be strings; so numeric values must be quoted. -### `MAX_CLUSTER_AGE_MINUTES` -The maximum age of the cluster, in minutes, after which we will clear any silences and exit "successfully". - -**Default:** `"120"` (two hours) - ### `CLEAN_CHECK_RUNS` The number of consecutive health checks that must succeed before we declare the cluster truly healthy. @@ -117,8 +107,5 @@ Don't forget to [build](#deploying-the-image) and [test](#deploying-the-job) wit # TO DO -- [x] Look for existing active silences before creating a new one - [x] Implement _actual_ healthchecks (steal them from osde2e) to determine cluster stability -- [x] Make the default silence expiry shorter; and extend it when health checks fail ([OSD-6384](https://issues.redhat.com/browse/OSD-6384)). -- [ ] Find out if there's a better and more secure way to talk to the alertmanager API using oauth and serviceaccount tokens. - [ ] Make [tunables](#tunables) configurable via `make deploy`. diff --git a/main.go b/main.go index 16cfe5c..5afab14 100644 --- a/main.go +++ b/main.go @@ -4,24 +4,13 @@ import ( "fmt" "log" "os" - "os/exec" "strconv" - "strings" "time" "github.com/openshift/osde2e/pkg/common/cluster" - - "github.com/openshift/osd-cluster-ready/silence" ) const ( - // Maximum cluster age, in minutes, before we'll start ignoring it. - // This is in case the Job gets deployed on an already-initialized but unhealthy cluster: - // we don't want to silence alerts in that case. - maxClusterAgeKey = "MAX_CLUSTER_AGE_MINUTES" - // By default, ignore clusters older than two hours - maxClusterAgeDefault = 2 * 60 - // The number of consecutive health checks that must succeed before we declare the cluster // truly healthy. cleanCheckRunsKey = "CLEAN_CHECK_RUNS" @@ -37,20 +26,6 @@ const ( ) func main() { - // New Alert Manager Silence Request - // TODO: Handle multiple silences being active - silenceReq := silence.New() - - clusterBirth, err := getClusterCreationTime() - if err != nil { - log.Fatal(err) - } - - maxClusterAge, err := getEnvInt(maxClusterAgeKey, maxClusterAgeDefault) - if err != nil { - log.Fatal(err) - } - cleanCheckRuns, err := getEnvInt(cleanCheckRunsKey, cleanCheckRunsDefault) if err != nil { log.Fatal(err) @@ -66,82 +41,12 @@ func main() { log.Fatal(err) } - // maxHealthCheckDuration is a heuristic for how long the full health check - // loop should take. This is used to push out the expiry of active silences to make - // sure they don't expire while we're in that loop; so err on the high side: - // - One run of health checks is pretty fast -- anecdotally sub-second -- but give it - // ten seconds. - // - Add failedCheckInterval since we do that sleep *after* a failed health check. - // - Add another two minutes of fudge factor. - maxHealthCheckDuration := time.Duration(cleanCheckRuns * (cleanCheckInterval + 10) + failedCheckInterval + 120) * time.Second - log.Printf("Using a silence expiry window of %s.", maxHealthCheckDuration) - for { - - // TODO: If silenceReq has an ID here, no need to look for another active silence - silenceReq, err := silenceReq.FindExisting() - if err != nil { - log.Fatal(err) - } - - if clusterTooOld(clusterBirth, maxClusterAge) { - log.Printf("Cluster is older than %d minutes. Exiting Cleanly.", maxClusterAge) - if silenceReq.Active() { - // TODO: There might be more than one silence to remove - err = silenceReq.Remove() - if err != nil { - log.Fatal(err) - } - } - os.Exit(0) - } - - if !silenceReq.Active() { - // If there are no existing active silences, create one. - // We only get here in two scenarios: - // - This is the first iteration of the loop. The vast majority of the time - // this is the first ever run of the job, and it's happening shortly after - // the birth of the cluster. - // - The previous iteration took longer than maxHealthCheckDuration - // and the previously-created silence expired. - _, err = silenceReq.Build(maxHealthCheckDuration).Send() - if err != nil { - log.Fatal(err) - } - - } else { - // If there is an existing silence, ensure it's not going to expire before - // our healthchecks are complete. - willExpire, err := silenceReq.WillExpireBy(maxHealthCheckDuration) - if err != nil { - log.Fatal(err) - } - if willExpire { - log.Printf("Silence expires in less than %s. Extending.", maxHealthCheckDuration) - // Creating a new silence automatically expires the existing one(s). - _, err = silenceReq.Build(maxHealthCheckDuration).Send() - if err != nil { - log.Fatal(err) - } - } - } - healthy, err := isClusterHealthy(cleanCheckRuns, cleanCheckInterval) if err != nil { log.Fatal(err) } - if healthy { - // Use this to log how much time was left in the active silence. This can - // help us fine-tune maxHealthCheckDuration in the future. - // TODO: If this is negative, it has already expired and we don't have to - // remove it. But WillExpireBy would have to tell us that somehow. - _, err = silenceReq.WillExpireBy(maxHealthCheckDuration) - // TODO: There might be more than one silence to remove - err = silenceReq.Remove() - if err != nil { - log.Fatal(err) - } os.Exit(0) } @@ -152,31 +57,6 @@ func main() { // UNREACHED } -func getClusterCreationTime() (time.Time, error) { - for i := 1; i <= 300; i++ { // try once a second or so for 5 minutes-ish - ex := "oc exec -n openshift-monitoring prometheus-k8s-0 -c prometheus -- curl localhost:9090/api/v1/query --silent --data-urlencode 'query=cluster_version' | jq -r '.data.result[] | select(.metric.type==\"initial\") | .value[1]'" - promCmd := exec.Command("bash", "-c", ex) - promCmd.Stderr = os.Stderr - resp, err := promCmd.Output() - if err != nil { - log.Printf("Attempt %d to query for cluster age failed. %v", i, err) - time.Sleep(1 * time.Second) - continue - } - respTrimmed := strings.TrimSuffix(string(resp), "\n") - initTime, err := strconv.ParseInt(respTrimmed, 10, 64) - if err != nil { - log.Printf("Error casting Epoch time to int. %s\nErr: %v", resp, err) - time.Sleep(1 * time.Second) - continue - } - clusterCreation := time.Unix(initTime, 0) - log.Printf("Cluster Created %v", clusterCreation.UTC()) - return clusterCreation, nil - } - return time.Unix(0, 0), fmt.Errorf("there was an error getting cluster creation time") -} - // getEnvInt returns the integer value of the environment variable with the specified `key`. // If the env var is unspecified/empty, the `def` value is returned. // The error is non-nil if the env var is nonempty but cannot be parsed as an int. @@ -198,11 +78,6 @@ func getEnvInt(key string, def int) (int, error) { return intVal, nil } -func clusterTooOld(clusterBirth time.Time, maxAgeMinutes int) bool { - maxAge := time.Now().Add(time.Duration(-maxAgeMinutes) * time.Minute) - return clusterBirth.Before(maxAge) -} - // doHealthCheck performs one instance of the health check. // Logs what happens. // Returns (true, err) if all health checks succeeded. diff --git a/silence/silence.go b/silence/silence.go deleted file mode 100644 index 11b271a..0000000 --- a/silence/silence.go +++ /dev/null @@ -1,187 +0,0 @@ -package silence - -import ( - "encoding/json" - "fmt" - "log" - "os" - "os/exec" - "time" -) - -const createdBy = "OSD Cluster Readiness Job" - -// Request represents an Alertmanager silence request object -type Request struct { - ID string `json:"id"` - Status silenceState `json:"status"` - Matchers []matcher `json:"matchers"` - StartsAt string `json:"startsAt"` - EndsAt string `json:"endsAt"` - CreatedBy string `json:"createdBy"` - Comment string `json:"comment"` -} - -type silenceState struct { - State string `json:"state"` -} - -type matcher struct { - Name string `json:"name"` - Value string `json:"value"` - IsRegex bool `json:"isRegex"` -} - -type getSilenceResponse []*Request - -type silenceResponse struct { - ID string `json:"silenceID"` -} - -// New returns a new silence request object -func New() *Request { - return &Request{} -} - -// FindExisting looks for an existing, active silence that was created by us. If found, -// its ID is returned; otherwise the empty string is returned. The latter is not an -// error condition. -// TODO: Handle muliple silences being active, currently we'll return the first one we fine -// that is active and created by `createdBy` -func (sr *Request) FindExisting() (*Request, error) { - for i := 1; i <= 300; i++ { // try once a second or so for 5-ish minutes - log.Printf("Checking for silences") - cmdstr := "oc exec -n openshift-monitoring alertmanager-main-0 -c alertmanager -- curl --silent localhost:9093/api/v2/silences -X GET" - silenceGetCmd := exec.Command("bash", "-c", cmdstr) - silenceGetCmd.Stderr = os.Stderr - resp, err := silenceGetCmd.Output() - if err != nil { - log.Printf("Attempt %d to query for existing silences failed. %v", i, err) - time.Sleep(1 * time.Second) - continue - } - var silences getSilenceResponse - err = json.Unmarshal(resp, &silences) - if err != nil { - log.Printf("There was an error unmarshalling get silence response") - return sr, err - } - - if len(silences) == 0 { - log.Printf("No Silences Present") - return sr, nil - } - - for _, silence := range silences { - if silence.CreatedBy != createdBy { - continue - } - - if !silence.Active() { - log.Printf("Silence %s is not active.", silence.ID) - continue - } - - sr = silence - log.Printf("Found silence created by job: %s", sr.ID) - - return sr, nil - } - - log.Printf("No silences created by job found.") - return sr, nil - } - - return sr, fmt.Errorf("unable to get a list of existing silences") -} - -func (sr *Request) Build(expiryPeriod time.Duration) *Request { - // Create the Silence - now := time.Now().UTC() - end := now.Add(expiryPeriod) - - allMatcher := matcher{} - allMatcher.Name = "severity" - allMatcher.Value = "info|warning|critical" - allMatcher.IsRegex = true - - sr.Matchers = []matcher{allMatcher} - sr.StartsAt = now.Format(time.RFC3339) - sr.EndsAt = end.Format(time.RFC3339) - sr.CreatedBy = createdBy - sr.Comment = "Created By the Cluster Readiness Job to silence any alerts during normal provisioning" - - return sr -} - -func (sr *Request) Send() (*silenceResponse, error) { - - silenceJSON, err := json.Marshal(sr) - if err != nil { - return nil, fmt.Errorf("There was an error marshalling JSON: %v", silenceJSON) - } - - for { - // Attempt to run once every 30 seconds until this succeeds - // to account for if the alertmanager is not ready before - // we start trying to silence it. - silenceCmd := exec.Command("oc", "exec", "-n", "openshift-monitoring", "alertmanager-main-0", "-c", "alertmanager", "--", "curl", "localhost:9093/api/v2/silences", "--silent", "-X", "POST", "-H", "Content-Type: application/json", "--data", string(silenceJSON)) - silenceCmd.Stderr = os.Stderr - resp, err := silenceCmd.Output() - if err != nil { - log.Printf("Silence Failed. %v", err) - time.Sleep(30 * time.Second) - continue - } - var silenceResp silenceResponse - e := json.Unmarshal(resp, &silenceResp) - if e != nil { - return nil, fmt.Errorf("There was an error Unmarshalling response: %v", e) - } - log.Printf("Silence Created with ID %s.", silenceResp.ID) - return &silenceResp, nil - } -} - -// Remove deletes the silence with the given sr.ID -func (sr *Request) Remove() error { - log.Printf("Removing Silence %s\n", sr.ID) - for i := 0; i < 5; i++ { - // Attempt up to 5 times to unsilence the cluster - unsilenceCommand := exec.Command("oc", "exec", "-n", "openshift-monitoring", "alertmanager-main-0", "-c", "alertmanager", "--", "curl", fmt.Sprintf("localhost:9093/api/v2/silence/%s", sr.ID), "--silent", "-X", "DELETE") - unsilenceCommand.Stderr = os.Stderr - err := unsilenceCommand.Run() - if err != nil { - log.Printf("Attempt %d to unsilence failed. %v", i, err) - time.Sleep(1 * time.Second) - continue - } - log.Println("Silence Successfully Removed.") - return nil - } - return fmt.Errorf("there was an error unsilencing the cluster") -} - -// WillExpireBy returns: -// - bool: true if the remaining time on the AlertManager Silence is less than the expiryPeriod. -// - error: nil unless the silence request's end time can't be parsed. -func (sr *Request) WillExpireBy(expiryPeriod time.Duration) (bool, error) { - // Parse end time of Alertmanager Silence - end, err := time.Parse(time.RFC3339, sr.EndsAt) - if err != nil { - return false, err - } - - // Find the remaining time left on the silence - remaining := end.Sub(time.Now()) - - log.Printf("Time remaining on active silence: %s\n", remaining) - - // Return bool if the remaining time is less than the expiryPeriod - return remaining < expiryPeriod, nil -} - -// Active returns a bool if the silence is Active -func (sr *Request) Active() bool { - return sr.ID != "" && sr.Status.State == "active" -}