Merge pull request openshift#6 from 2uasimojo/maxHealthCheckDuration
Calculate maxHealthCheckDuration
2uasimojo authored Mar 8, 2021
2 parents 47c9469 + a2d26bf commit 6712aec
Showing 4 changed files with 38 additions and 27 deletions.
6 changes: 2 additions & 4 deletions README.md
@@ -15,15 +15,13 @@

This job silences alerts while Day 2 configuration is loaded onto a cluster at initial provisioning, so that on-call SREs are not paged for normal operations within the cluster.

The silence initially takes effect for 1 hour.

We poll cluster health using [osde2e health checks](https://github.com/openshift/osde2e/blob/041355675304a7aa371b7fbeea313001036feb75/pkg/common/cluster/clusterutil.go#L211)
once a minute (this is [configurable](#failed_check_interval_seconds)),
until they all report healthy 20 times in a row ([configurable](#clean_check_runs))
on 30s intervals ([configurable](#clean_check_interval_seconds)).
By default, we will clear any active silence and exit successfully if the cluster is (or becomes) more than two hours old ([configurable](#max_cluster_age_minutes)).

If we think the silence might expire while health checks are running, we extend it.
We base the duration of the silence on the estimated maximum time we think it should take for the full health check sequence to run, extending it as necessary.
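
For a rough sense of the resulting window, here is a minimal sketch of that estimate using the defaults above (the variable names and the ten-seconds-per-run and two-minute fudge figures mirror the comment in main.go; treat the snippet as illustrative, not as the job's actual code):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Defaults described above: 20 clean runs on 30s intervals, with a
	// 60s sleep after each failed check.
	cleanCheckRuns := 20
	cleanCheckInterval := 30  // seconds
	failedCheckInterval := 60 // seconds

	// Allow ~10s per health check run, add one failed-check sleep, and
	// pad with two minutes of fudge factor.
	window := time.Duration(cleanCheckRuns*(cleanCheckInterval+10)+failedCheckInterval+120) * time.Second
	fmt.Println(window) // 16m20s
}
```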

## Deploying

@@ -122,5 +120,5 @@ Don't forget to [build](#deploying-the-image) and [test](#deploying-the-job) wit
- [x] Look for existing active silences before creating a new one
- [x] Implement _actual_ healthchecks (steal them from osde2e) to determine cluster stability
- [x] Make the default silence expiry shorter; and extend it when health checks fail ([OSD-6384](https://issues.redhat.com/browse/OSD-6384)).
- [ ] Find if there's a better and more secure way to talk to the alertmanager API using oauth and serviceaccount tokens.
- [ ] Find out if there's a better and more secure way to talk to the alertmanager API using oauth and serviceaccount tokens.
- [ ] Make [tunables](#tunables) configurable via `make deploy`.
6 changes: 5 additions & 1 deletion hack/deploy.sh
@@ -48,7 +48,11 @@ fi

# Before deploying the new job, make sure the pod from the old one is gone
if [[ $WAIT_FOR_POD == "yes" ]]; then
maybe oc wait --for=delete pod -l job-name=osd-cluster-ready --timeout=30s
# FIXME: This can fail for two reasons:
# - Timeout, in which case we want to blow up.
# - The pod already disappeared, in which case we want to proceed.
# Scraping the output is icky. For now, just ignore errors.
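# (One possible alternative, untested: first check whether a matching pod exists,
#  e.g. `oc get pod -l job-name=osd-cluster-ready --no-headers 2>/dev/null | grep -q .`,
#  and only run `oc wait` when it does, so that a genuine timeout still fails the script.)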
maybe oc wait --for=delete pod -l job-name=osd-cluster-ready --timeout=30s || true
fi

maybe oc create -f $TMP_MANIFEST
47 changes: 27 additions & 20 deletions main.go
@@ -66,11 +66,15 @@ func main() {
log.Fatal(err)
}

// maxHealthCheckDurationMinutes is a heuristic for how long the full health check
// maxHealthCheckDuration is a heuristic for how long the full health check
// loop should take. This is used to push out the expiry of active silences to make
// sure they don't expire while we're in that loop; so err on the high side.
// TODO: Use above vars to calculate this
maxHealthCheckDurationMinutes := 15
// sure they don't expire while we're in that loop; so err on the high side:
// - One run of health checks is pretty fast -- anecdotally sub-second -- but give it
// ten seconds.
// - Add failedCheckInterval since we do that sleep *after* a failed health check.
// - Add another two minutes of fudge factor.
maxHealthCheckDuration := time.Duration(cleanCheckRuns * (cleanCheckInterval + 10) + failedCheckInterval + 120) * time.Second
log.Printf("Using a silence expiry window of %s.", maxHealthCheckDuration)

for {

@@ -98,25 +102,27 @@ func main() {
// - This is the first iteration of the loop. The vast majority of the time
// this is the first ever run of the job, and it's happening shortly after
// the birth of the cluster.
// - The previous iteration took longer than maxHealthCheckDurationMinutes
// - The previous iteration took longer than maxHealthCheckDuration
// and the previously-created silence expired.
// TODO: Use maxHealthCheckDurationMinutes instead of 1h.
_, err = silenceReq.Build(60 * time.Minute).Send()
_, err = silenceReq.Build(maxHealthCheckDuration).Send()
if err != nil {
log.Fatal(err)
}

} else {
// If there is an existing silence, ensure it's not going to expire before
// our healthchecks are complete.
if ok, _ := silenceReq.WillExpireBy(time.Duration(maxHealthCheckDurationMinutes) * time.Minute); ok {
log.Printf("Silence will expire within %d minutes", maxHealthCheckDurationMinutes)
log.Printf("Creating new silence")
_, err = silenceReq.Build(time.Duration(maxHealthCheckDurationMinutes) * time.Minute).Send()
willExpire, err := silenceReq.WillExpireBy(maxHealthCheckDuration)
if err != nil {
log.Fatal(err)
}
if willExpire {
log.Printf("Silence expires in less than %s. Extending.", maxHealthCheckDuration)
// Creating a new silence automatically expires the existing one(s).
_, err = silenceReq.Build(maxHealthCheckDuration).Send()
if err != nil {
log.Fatal(err)
}
// TODO: Delete the old silence
}
}

@@ -126,14 +132,15 @@ func main() {
}

if healthy {
if silenceReq.Active() {
// TODO: There might be more than one silence to remove
err = silenceReq.Remove()
if err != nil {
log.Fatal(err)
}
} else {
log.Println("Health checks passed and cluster was not silenced. Nothing to do here.")
// Use this to log how much time was left in the active silence. This can
// help us fine-tune maxHealthCheckDuration in the future.
// TODO: If this is negative, it has already expired and we don't have to
// remove it. But WillExpireBy would have to tell us that somehow.
_, err = silenceReq.WillExpireBy(maxHealthCheckDuration)
// TODO: There might be more than one silence to remove
err = silenceReq.Remove()
if err != nil {
log.Fatal(err)
}
os.Exit(0)
}
6 changes: 4 additions & 2 deletions silence/silence.go
@@ -78,7 +78,7 @@ func (sr *Request) FindExisting() (*Request, error) {
}

if !silence.Active() {
log.Printf("Silence is not active.")
log.Printf("Silence %s is not active.", silence.ID)
continue
}

@@ -162,7 +162,9 @@ func (sr *Request) Remove() error {
return fmt.Errorf("there was an error unsilencing the cluster")
}

// WillExpireBy returns bool if the remaining time on the AlertManager Silence is less than the expiryPeriod
// WillExpireBy returns:
// - bool: true if the remaining time on the AlertManager Silence is less than the expiryPeriod.
// - error: nil unless the silence request's end time can't be parsed.
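// For example (illustrative only; this mirrors how main.go calls it), a caller can
// check whether a silence needs extending before starting a long health-check loop:
//
//	willExpire, err := silenceReq.WillExpireBy(15 * time.Minute)
//	if err == nil && willExpire {
//		// extend the silence before it lapses
//	}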
func (sr *Request) WillExpireBy(expiryPeriod time.Duration) (bool, error) {
// Parse end time of Alertmanager Silence
end, err := time.Parse(time.RFC3339, sr.EndsAt)
