From a2d26bfdbf0dfb567ebf995d3233ae96c92d6a1c Mon Sep 17 00:00:00 2001
From: Eric Fried
Date: Fri, 19 Feb 2021 16:33:08 -0600
Subject: [PATCH] Calculate maxHealthCheckDuration

Calculate the maximum expected time it should take to run the full health
check loop, and use it both to initialize the silence and to extend it on
any failure.
---
 README.md          |  6 ++----
 hack/deploy.sh     |  6 +++++-
 main.go            | 47 +++++++++++++++++++++++++++++++--------------------
 silence/silence.go |  6 ++++--
 4 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index bd154ea..9a15d76 100644
--- a/README.md
+++ b/README.md
@@ -15,15 +15,13 @@ This job silences alerts while Day2 configuration is loaded onto a cluster at
 initial provisioning, allowing it to not page on-call SREs for normal
 operations within the cluster.
 
-The silence initially takes effect for 1 hour.
-
 We poll cluster health using [osde2e health checks](https://github.com/openshift/osde2e/blob/041355675304a7aa371b7fbeea313001036feb75/pkg/common/cluster/clusterutil.go#L211)
 once a minute (this is [configurable](#failed_check_interval_seconds)), until
 they all report healthy 20 times in a row ([configurable](#clean_check_runs)) on 30s intervals ([configurable](#clean_check_interval_seconds)).
 
 By default, we will clear any active silence and exit successfully if the
 cluster is (or becomes) more than two hours old ([configurable](#max_cluster_age_minutes)).
 
-If we think the silence might expire while health checks are running, we extend it.
+We base the duration of the silence on the estimated maximum time we think it should take for the full health check sequence to run, extending it as necessary.
 
 ## Deploying
@@ -122,5 +120,5 @@ Don't forget to [build](#deploying-the-image) and [test](#deploying-the-job) wit
 - [x] Look for existing active silences before creating a new one
 - [x] Implement _actual_ healthchecks (steal them from osde2e) to determine cluster stability
 - [x] Make the default silence expiry shorter; and extend it when health checks fail ([OSD-6384](https://issues.redhat.com/browse/OSD-6384)).
-- [ ] Find if there's a better and more secure way to talk to the alertmanager API using oauth and serviceaccount tokens.
+- [ ] Find out if there's a better and more secure way to talk to the alertmanager API using oauth and serviceaccount tokens.
 - [ ] Make [tunables](#tunables) configurable via `make deploy`.
diff --git a/hack/deploy.sh b/hack/deploy.sh
index c3b9232..95f2e84 100755
--- a/hack/deploy.sh
+++ b/hack/deploy.sh
@@ -48,7 +48,11 @@ fi
 
 # Before deploying the new job, make sure the pod from the old one is gone
 if [[ $WAIT_FOR_POD == "yes" ]]; then
-    maybe oc wait --for=delete pod -l job-name=osd-cluster-ready --timeout=30s
+    # FIXME: This can fail for two reasons:
+    # - Timeout, in which case we want to blow up.
+    # - The pod already disappeared, in which case we want to proceed.
+    # Scraping the output is icky. For now, just ignore errors.
+    maybe oc wait --for=delete pod -l job-name=osd-cluster-ready --timeout=30s || true
 fi
 
 maybe oc create -f $TMP_MANIFEST
diff --git a/main.go b/main.go
index 8c1b001..16cfe5c 100644
--- a/main.go
+++ b/main.go
@@ -66,11 +66,15 @@ func main() {
 		log.Fatal(err)
 	}
 
-	// maxHealthCheckDurationMinutes is a heuristic for how long the full health check
+	// maxHealthCheckDuration is a heuristic for how long the full health check
 	// loop should take. This is used to push out the expiry of active silences to make
-	// sure they don't expire while we're in that loop; so err on the high side.
-	// TODO: Use above vars to calculate this
-	maxHealthCheckDurationMinutes := 15
+	// sure they don't expire while we're in that loop; so err on the high side:
+	// - One run of health checks is pretty fast -- anecdotally sub-second -- but give it
+	//   ten seconds.
+	// - Add failedCheckInterval since we do that sleep *after* a failed health check.
+	// - Add another two minutes of fudge factor.
+	maxHealthCheckDuration := time.Duration(cleanCheckRuns*(cleanCheckInterval+10)+failedCheckInterval+120) * time.Second
+	log.Printf("Using a silence expiry window of %s.", maxHealthCheckDuration)
 
 	for {
 
@@ -98,10 +102,9 @@
 			// - This is the first iteration of the loop. The vast majority of the time
 			//   this is the first ever run of the job, and it's happening shortly after
 			//   the birth of the cluster.
-			// - The previous iteration took longer than maxHealthCheckDurationMinutes
+			// - The previous iteration took longer than maxHealthCheckDuration
 			//   and the previously-created silence expired.
-			// TODO: Use maxHealthCheckDurationMinutes instead of 1h.
-			_, err = silenceReq.Build(60 * time.Minute).Send()
+			_, err = silenceReq.Build(maxHealthCheckDuration).Send()
 			if err != nil {
 				log.Fatal(err)
 			}
@@ -109,14 +112,17 @@
 		} else {
 			// If there is an existing silence, ensure it's not going to expire before
 			// our healthchecks are complete.
-			if ok, _ := silenceReq.WillExpireBy(time.Duration(maxHealthCheckDurationMinutes) * time.Minute); ok {
-				log.Printf("Silence will expire within %d minutes", maxHealthCheckDurationMinutes)
-				log.Printf("Creating new silence")
-				_, err = silenceReq.Build(time.Duration(maxHealthCheckDurationMinutes) * time.Minute).Send()
+			willExpire, err := silenceReq.WillExpireBy(maxHealthCheckDuration)
+			if err != nil {
+				log.Fatal(err)
+			}
+			if willExpire {
+				log.Printf("Silence expires in less than %s. Extending.", maxHealthCheckDuration)
+				// Creating a new silence automatically expires the existing one(s).
+				_, err = silenceReq.Build(maxHealthCheckDuration).Send()
 				if err != nil {
 					log.Fatal(err)
 				}
-				// TODO: Delete the old silence
 			}
 		}
 
@@ -126,14 +132,15 @@
 		}
 
 		if healthy {
-			if silenceReq.Active() {
-				// TODO: There might be more than one silence to remove
-				err = silenceReq.Remove()
-				if err != nil {
-					log.Fatal(err)
-				}
-			} else {
-				log.Println("Health checks passed and cluster was not silenced. Nothing to do here.")
+			// Use this to log how much time was left in the active silence. This can
+			// help us fine-tune maxHealthCheckDuration in the future.
+			// TODO: If this is negative, it has already expired and we don't have to
+			//   remove it. But WillExpireBy would have to tell us that somehow.
+			_, err = silenceReq.WillExpireBy(maxHealthCheckDuration)
+			// TODO: There might be more than one silence to remove
+			err = silenceReq.Remove()
+			if err != nil {
+				log.Fatal(err)
 			}
 			os.Exit(0)
 		}
diff --git a/silence/silence.go b/silence/silence.go
index d34f687..11b271a 100644
--- a/silence/silence.go
+++ b/silence/silence.go
@@ -78,7 +78,7 @@ func (sr *Request) FindExisting() (*Request, error) {
 		}
 
 		if !silence.Active() {
-			log.Printf("Silence is not active.")
+			log.Printf("Silence %s is not active.", silence.ID)
 			continue
 		}
 
@@ -162,7 +162,9 @@ func (sr *Request) Remove() error {
 	return fmt.Errorf("there was an error unsilencing the cluster")
 }
 
-// WillExpireBy returns bool if the remaining time on the AlertManager Silence is less than the expiryPeriod
+// WillExpireBy returns:
+// - bool: true if the remaining time on the AlertManager Silence is less than the expiryPeriod.
+// - error: nil unless the silence request's end time can't be parsed.
 func (sr *Request) WillExpireBy(expiryPeriod time.Duration) (bool, error) {
 	// Parse end time of Alertmanager Silence
 	end, err := time.Parse(time.RFC3339, sr.EndsAt)
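
Note: as a sanity check on the new heuristic, plugging in the default tunables documented in the README (20 clean check runs on 30s intervals, with failed checks retried once a minute) gives 20*(30+10) + 60 + 120 = 980 seconds, i.e. 16m20s, comfortably above the 15 minutes that was previously hard-coded. A standalone sketch of the arithmetic, reusing the variable names from the main.go hunk; the literal defaults below are assumptions for illustration, not values read from the environment:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Assumed defaults, taken from the README's tunables:
	// CLEAN_CHECK_RUNS=20, CLEAN_CHECK_INTERVAL_SECONDS=30,
	// FAILED_CHECK_INTERVAL_SECONDS=60.
	cleanCheckRuns := 20
	cleanCheckInterval := 30  // seconds
	failedCheckInterval := 60 // seconds

	// Same heuristic as the patch: ten seconds per clean run on top of the
	// clean-check interval, plus one failed-check sleep, plus two minutes
	// of fudge factor.
	maxHealthCheckDuration := time.Duration(cleanCheckRuns*(cleanCheckInterval+10)+failedCheckInterval+120) * time.Second

	fmt.Println(maxHealthCheckDuration) // 16m20s
}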
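
Note: the reworked branch in main.go now checks the error from WillExpireBy instead of discarding it with a blank identifier. For reference, a minimal sketch of the comparison WillExpireBy presumably performs, assuming only the EndsAt field and the RFC3339 parse visible in the final hunk; the real Request type in silence/silence.go carries more fields and is not reproduced here:

package silence

import "time"

// Request is a trimmed-down stand-in used only for this sketch; the real
// Request type in silence/silence.go has more fields.
type Request struct {
	EndsAt string // RFC3339 end time of the Alertmanager silence
}

// WillExpireBy returns:
// - bool: true if the remaining time on the AlertManager Silence is less than the expiryPeriod.
// - error: nil unless the silence request's end time can't be parsed.
func (sr *Request) WillExpireBy(expiryPeriod time.Duration) (bool, error) {
	// Parse end time of Alertmanager Silence
	end, err := time.Parse(time.RFC3339, sr.EndsAt)
	if err != nil {
		return false, err
	}
	// Compare the silence's remaining lifetime to the expiry period.
	return time.Until(end) < expiryPeriod, nil
}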