diff --git a/README.md b/README.md index da8c3aa..bd154ea 100644 --- a/README.md +++ b/README.md @@ -23,8 +23,7 @@ until they all report healthy 20 times in a row ([configurable](#clean_check_run on 30s intervals ([configurable](#clean_check_interval_seconds)). By default, we will clear any active silence and exit successfully if the cluster is (or becomes) more than two hours old ([configurable](#max_cluster_age_minutes)). -If the silence expires while health checks are failing, we reinstate it. -(This means it is theoretically possible for alerts to fire for up to one minute if the silence expires right after a health check fails. [FIXME](#to-do).) +If we think the silence might expire while health checks are running, we extend it. ## Deploying @@ -122,6 +121,6 @@ Don't forget to [build](#deploying-the-image) and [test](#deploying-the-job) wit - [x] Look for existing active silences before creating a new one - [x] Implement _actual_ healthchecks (steal them from osde2e) to determine cluster stability +- [x] Make the default silence expiry shorter; and extend it when health checks fail ([OSD-6384](https://issues.redhat.com/browse/OSD-6384)). - [ ] Find if there's a better and more secure way to talk to the alertmanager API using oauth and serviceaccount tokens. -- [ ] Make the default silence expiry shorter; and extend it when health checks fail ([OSD-6384](https://issues.redhat.com/browse/OSD-6384)). -- [ ] Make [tunables](#tunables) configurable via `make deploy`. \ No newline at end of file +- [ ] Make [tunables](#tunables) configurable via `make deploy`. diff --git a/go.mod b/go.mod index c98cb85..bb38243 100644 --- a/go.mod +++ b/go.mod @@ -1,4 +1,4 @@ -module github.com/openshift/osd-cluster-ready-job +module github.com/openshift/osd-cluster-ready go 1.15 diff --git a/main.go b/main.go index 0fa2303..8c1b001 100644 --- a/main.go +++ b/main.go @@ -11,7 +11,7 @@ import ( "github.com/openshift/osde2e/pkg/common/cluster" - "github.com/openshift/osd-cluster-ready-job/silence" + "github.com/openshift/osd-cluster-ready/silence" ) const ( @@ -39,7 +39,7 @@ const ( func main() { // New Alert Manager Silence Request // TODO: Handle multiple silences being active - silenceReq := silence.NewSilenceRequest() + silenceReq := silence.New() clusterBirth, err := getClusterCreationTime() if err != nil { @@ -66,32 +66,23 @@ func main() { log.Fatal(err) } + // maxHealthCheckDurationMinutes is a heuristic for how long the full health check + // loop should take. This is used to push out the expiry of active silences to make + // sure they don't expire while we're in that loop; so err on the high side. + // TODO: Use above vars to calculate this + maxHealthCheckDurationMinutes := 15 + for { - // Technically we should check if silenceReq has an ID here, no need to look for another - // active silence + // TODO: If silenceReq has an ID here, no need to look for another active silence silenceReq, err := silenceReq.FindExisting() if err != nil { log.Fatal(err) } - // If there is an existing silence ensure its not going to expiry before - // our healthchecks are complete - // TODO: Use above vars to calculate healthcheck MAX duration - if silenceReq.ID != "" { - if ok, _ := silenceReq.WillExpireBy(15 * time.Minute); ok { - log.Printf("Silence will expire within 15 minutes") - log.Printf("Creating new silence") - _, err = silenceReq.Build(15 * time.Minute).Send() - if err != nil { - log.Fatal(err) - } - } - } - if clusterTooOld(clusterBirth, maxClusterAge) { log.Printf("Cluster is older than %d minutes. 
Exiting Cleanly.", maxClusterAge) - if silenceReq.ID != "" { + if silenceReq.Active() { // TODO: There might be more than one silence to remove err = silenceReq.Remove() if err != nil { @@ -101,14 +92,32 @@ func main() { os.Exit(0) } - // If there are no existing silences create one for 1 hour - // TODO: Do we need to create a silence for an hour or can we just create - // a silence that should out live the healthchecks? - if silenceReq.ID == "" { + if !silenceReq.Active() { + // If there are no existing active silences, create one. + // We only get here in two scenarios: + // - This is the first iteration of the loop. The vast majority of the time + // this is the first ever run of the job, and it's happening shortly after + // the birth of the cluster. + // - The previous iteration took longer than maxHealthCheckDurationMinutes + // and the previously-created silence expired. + // TODO: Use maxHealthCheckDurationMinutes instead of 1h. _, err = silenceReq.Build(60 * time.Minute).Send() if err != nil { log.Fatal(err) } + + } else { + // If there is an existing silence, ensure it's not going to expire before + // our healthchecks are complete. + if ok, _ := silenceReq.WillExpireBy(time.Duration(maxHealthCheckDurationMinutes) * time.Minute); ok { + log.Printf("Silence will expire within %d minutes", maxHealthCheckDurationMinutes) + log.Printf("Creating new silence") + _, err = silenceReq.Build(time.Duration(maxHealthCheckDurationMinutes) * time.Minute).Send() + if err != nil { + log.Fatal(err) + } + // TODO: Delete the old silence + } } healthy, err := isClusterHealthy(cleanCheckRuns, cleanCheckInterval) diff --git a/silence/silence.go b/silence/silence.go index 73fe394..d34f687 100644 --- a/silence/silence.go +++ b/silence/silence.go @@ -11,8 +11,8 @@ import ( const createdBy = "OSD Cluster Readiness Job" -// SilenceRequest represents a Alertmanager silence request object -type SilenceRequest struct { +// Request represents an Alertmanager silence request object +type Request struct { ID string `json:"id"` Status silenceState `json:"status"` Matchers []matcher `json:"matchers"` @@ -32,14 +32,15 @@ type matcher struct { IsRegex bool `json:"isRegex"` } -type getSilenceResponse []*SilenceRequest +type getSilenceResponse []*Request type silenceResponse struct { ID string `json:"silenceID"` } -func NewSilenceRequest() *SilenceRequest { - return &SilenceRequest{} +// New returns a new silence request object +func New() *Request { + return &Request{} } // FindExisting looks for an existing, active silence that was created by us. If found, @@ -47,7 +48,7 @@ func NewSilenceRequest() *SilenceRequest { // error condition. 
// TODO: Handle muliple silences being active, currently we'll return the first one we fine // that is active and created by `createdBy` -func (sr *SilenceRequest) FindExisting() (*SilenceRequest, error) { +func (sr *Request) FindExisting() (*Request, error) { for i := 1; i <= 300; i++ { // try once a second or so for 5-ish minutes log.Printf("Checking for silences") cmdstr := "oc exec -n openshift-monitoring alertmanager-main-0 -c alertmanager -- curl --silent localhost:9093/api/v2/silences -X GET" @@ -94,7 +95,7 @@ func (sr *SilenceRequest) FindExisting() (*SilenceRequest, error) { return sr, fmt.Errorf("unable to get a list of existing silences") } -func (sr *SilenceRequest) Build(expiryPeriod time.Duration) *SilenceRequest { +func (sr *Request) Build(expiryPeriod time.Duration) *Request { // Create the Silence now := time.Now().UTC() end := now.Add(expiryPeriod) @@ -113,7 +114,7 @@ func (sr *SilenceRequest) Build(expiryPeriod time.Duration) *SilenceRequest { return sr } -func (sr *SilenceRequest) Send() (*silenceResponse, error) { +func (sr *Request) Send() (*silenceResponse, error) { silenceJSON, err := json.Marshal(sr) if err != nil { @@ -143,7 +144,7 @@ func (sr *SilenceRequest) Send() (*silenceResponse, error) { } // Remove deletes the silence with the given sr.ID -func (sr *SilenceRequest) Remove() error { +func (sr *Request) Remove() error { log.Printf("Removing Silence %s\n", sr.ID) for i := 0; i < 5; i++ { // Attempt up to 5 times to unsilence the cluster @@ -162,7 +163,7 @@ func (sr *SilenceRequest) Remove() error { } // WillExpireBy returns bool if the remaining time on the AlertManager Silence is less than the expiryPeriod -func (sr *SilenceRequest) WillExpireBy(expiryPeriod time.Duration) (bool, error) { +func (sr *Request) WillExpireBy(expiryPeriod time.Duration) (bool, error) { // Parse end time of Alertmanager Silence end, err := time.Parse(time.RFC3339, sr.EndsAt) if err != nil { @@ -179,6 +180,6 @@ func (sr *SilenceRequest) WillExpireBy(expiryPeriod time.Duration) (bool, error) } // Active returns a bool if the silence is Active -func (sr *SilenceRequest) Active() bool { - return sr.Status.State == "active" +func (sr *Request) Active() bool { + return sr.ID != "" && sr.Status.State == "active" }
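
---

Reviewer note: below is a minimal, runnable sketch of the expiry check the reworked loop leans on. The `willExpireBy` helper and the hard-coded 15-minute budget are illustrative stand-ins for the package's `WillExpireBy` method (whose body is elided from this diff) and for `maxHealthCheckDurationMinutes`; they are assumptions for the example, not code lifted from the repository.

```go
package main

import (
	"fmt"
	"log"
	"time"
)

// willExpireBy mirrors the check the job performs before each health-check pass:
// it reports whether a silence ending at endsAt (RFC3339) has less than
// expiryPeriod of lifetime remaining. Illustrative only; not the repo's method.
func willExpireBy(endsAt string, expiryPeriod time.Duration) (bool, error) {
	end, err := time.Parse(time.RFC3339, endsAt)
	if err != nil {
		return false, err
	}
	return time.Until(end) < expiryPeriod, nil
}

func main() {
	// A silence due to lapse in 5 minutes, checked against a 15-minute
	// health-check budget, should be replaced before the checks start.
	endsAt := time.Now().UTC().Add(5 * time.Minute).Format(time.RFC3339)

	maxHealthCheckDuration := 15 * time.Minute // stand-in for maxHealthCheckDurationMinutes
	expiring, err := willExpireBy(endsAt, maxHealthCheckDuration)
	if err != nil {
		log.Fatal(err)
	}
	if expiring {
		// In the job, this is the branch where Build(...).Send() issues a
		// fresh silence covering at least the health-check budget.
		fmt.Printf("silence expires within %v; extending\n", maxHealthCheckDuration)
	}
}
```

As the new comment in `main.go` notes, erring on the high side for the budget is cheap: the worst case is a silence that outlives the health checks by a few minutes, whereas undershooting lets alerts fire mid-check.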