Skip to content

Commit

Permalink
Minor refactoring
Browse files Browse the repository at this point in the history
- Do maxClusterAge check first.
- Pull maxHealthCheckDurationMinutes into a variable for easier changing
later when we want to calculate it.
- Start cleaning up silence module to allow crisper public/private
boundary. (More work to do here.)
- Change module name from osd-cluster-ready-job to osd-cluster-ready
(left over from iamkirbater origins).
- Rename some things in the 'silence' module to be less stuttery.
- Correct the README wrt extending silences (missed in openshift#2).
- Fix some comment typos. Add & flesh out other comments.
  • Loading branch information
2uasimojo committed Feb 19, 2021
1 parent e083867 commit 9fe9510
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 40 deletions.
7 changes: 3 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ until they all report healthy 20 times in a row ([configurable](#clean_check_run
on 30s intervals ([configurable](#clean_check_interval_seconds)).
By default, we will clear any active silence and exit successfully if the cluster is (or becomes) more than two hours old ([configurable](#max_cluster_age_minutes)).

If the silence expires while health checks are failing, we reinstate it.
(This means it is theoretically possible for alerts to fire for up to one minute if the silence expires right after a health check fails. [FIXME](#to-do).)
If we think the silence might expire while health checks are running, we extend it.

## Deploying

Expand Down Expand Up @@ -122,6 +121,6 @@ Don't forget to [build](#deploying-the-image) and [test](#deploying-the-job) wit

- [x] Look for existing active silences before creating a new one
- [x] Implement _actual_ healthchecks (steal them from osde2e) to determine cluster stability
- [x] Make the default silence expiry shorter; and extend it when health checks fail ([OSD-6384](https://issues.redhat.com/browse/OSD-6384)).
- [ ] Find if there's a better and more secure way to talk to the alertmanager API using oauth and serviceaccount tokens.
- [ ] Make the default silence expiry shorter; and extend it when health checks fail ([OSD-6384](https://issues.redhat.com/browse/OSD-6384)).
- [ ] Make [tunables](#tunables) configurable via `make deploy`.
- [ ] Make [tunables](#tunables) configurable via `make deploy`.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module github.com/openshift/osd-cluster-ready-job
module github.com/openshift/osd-cluster-ready

go 1.15

Expand Down
55 changes: 32 additions & 23 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (

"github.com/openshift/osde2e/pkg/common/cluster"

"github.com/openshift/osd-cluster-ready-job/silence"
"github.com/openshift/osd-cluster-ready/silence"
)

const (
Expand Down Expand Up @@ -39,7 +39,7 @@ const (
func main() {
// New Alert Manager Silence Request
// TODO: Handle multiple silences being active
silenceReq := silence.NewSilenceRequest()
silenceReq := silence.New()

clusterBirth, err := getClusterCreationTime()
if err != nil {
Expand All @@ -66,32 +66,23 @@ func main() {
log.Fatal(err)
}

// maxHealthCheckDurationMinutes is a heuristic for how long the full health check
// loop should take. This is used to push out the expiry of active silences to make
// sure they don't expire while we're in that loop; so err on the high side.
// TODO: Use above vars to calculate this
maxHealthCheckDurationMinutes := 15

for {

// Technically we should check if silenceReq has an ID here, no need to look for another
// active silence
// TODO: If silenceReq has an ID here, no need to look for another active silence
silenceReq, err := silenceReq.FindExisting()
if err != nil {
log.Fatal(err)
}

// If there is an existing silence ensure its not going to expiry before
// our healthchecks are complete
// TODO: Use above vars to calculate healthcheck MAX duration
if silenceReq.ID != "" {
if ok, _ := silenceReq.WillExpireBy(15 * time.Minute); ok {
log.Printf("Silence will expire within 15 minutes")
log.Printf("Creating new silence")
_, err = silenceReq.Build(15 * time.Minute).Send()
if err != nil {
log.Fatal(err)
}
}
}

if clusterTooOld(clusterBirth, maxClusterAge) {
log.Printf("Cluster is older than %d minutes. Exiting Cleanly.", maxClusterAge)
if silenceReq.ID != "" {
if silenceReq.Active() {
// TODO: There might be more than one silence to remove
err = silenceReq.Remove()
if err != nil {
Expand All @@ -101,14 +92,32 @@ func main() {
os.Exit(0)
}

// If there are no existing silences create one for 1 hour
// TODO: Do we need to create a silence for an hour or can we just create
// a silence that should out live the healthchecks?
if silenceReq.ID == "" {
if !silenceReq.Active() {
// If there are no existing active silences, create one.
// We only get here in two scenarios:
// - This is the first iteration of the loop. The vast majority of the time
// this is the first ever run of the job, and it's happening shortly after
// the birth of the cluster.
// - The previous iteration took longer than maxHealthCheckDurationMinutes
// and the previously-created silence expired.
// TODO: Use maxHealthCheckDurationMinutes instead of 1h.
_, err = silenceReq.Build(60 * time.Minute).Send()
if err != nil {
log.Fatal(err)
}

} else {
// If there is an existing silence, ensure it's not going to expire before
// our healthchecks are complete.
if ok, _ := silenceReq.WillExpireBy(time.Duration(maxHealthCheckDurationMinutes) * time.Minute); ok {
log.Printf("Silence will expire within %d minutes", maxHealthCheckDurationMinutes)
log.Printf("Creating new silence")
_, err = silenceReq.Build(time.Duration(maxHealthCheckDurationMinutes) * time.Minute).Send()
if err != nil {
log.Fatal(err)
}
// TODO: Delete the old silence
}
}

healthy, err := isClusterHealthy(cleanCheckRuns, cleanCheckInterval)
Expand Down
25 changes: 13 additions & 12 deletions silence/silence.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ import (

const createdBy = "OSD Cluster Readiness Job"

// SilenceRequest represents a Alertmanager silence request object
type SilenceRequest struct {
// Request represents an Alertmanager silence request object
type Request struct {
ID string `json:"id"`
Status silenceState `json:"status"`
Matchers []matcher `json:"matchers"`
Expand All @@ -32,22 +32,23 @@ type matcher struct {
IsRegex bool `json:"isRegex"`
}

type getSilenceResponse []*SilenceRequest
type getSilenceResponse []*Request

type silenceResponse struct {
ID string `json:"silenceID"`
}

func NewSilenceRequest() *SilenceRequest {
return &SilenceRequest{}
// New returns a new silence request object
func New() *Request {
return &Request{}
}

// FindExisting looks for an existing, active silence that was created by us. If found,
// its ID is returned; otherwise the empty string is returned. The latter is not an
// error condition.
// TODO: Handle muliple silences being active, currently we'll return the first one we fine
// that is active and created by `createdBy`
func (sr *SilenceRequest) FindExisting() (*SilenceRequest, error) {
func (sr *Request) FindExisting() (*Request, error) {
for i := 1; i <= 300; i++ { // try once a second or so for 5-ish minutes
log.Printf("Checking for silences")
cmdstr := "oc exec -n openshift-monitoring alertmanager-main-0 -c alertmanager -- curl --silent localhost:9093/api/v2/silences -X GET"
Expand Down Expand Up @@ -94,7 +95,7 @@ func (sr *SilenceRequest) FindExisting() (*SilenceRequest, error) {
return sr, fmt.Errorf("unable to get a list of existing silences")
}

func (sr *SilenceRequest) Build(expiryPeriod time.Duration) *SilenceRequest {
func (sr *Request) Build(expiryPeriod time.Duration) *Request {
// Create the Silence
now := time.Now().UTC()
end := now.Add(expiryPeriod)
Expand All @@ -113,7 +114,7 @@ func (sr *SilenceRequest) Build(expiryPeriod time.Duration) *SilenceRequest {
return sr
}

func (sr *SilenceRequest) Send() (*silenceResponse, error) {
func (sr *Request) Send() (*silenceResponse, error) {

silenceJSON, err := json.Marshal(sr)
if err != nil {
Expand Down Expand Up @@ -143,7 +144,7 @@ func (sr *SilenceRequest) Send() (*silenceResponse, error) {
}

// Remove deletes the silence with the given sr.ID
func (sr *SilenceRequest) Remove() error {
func (sr *Request) Remove() error {
log.Printf("Removing Silence %s\n", sr.ID)
for i := 0; i < 5; i++ {
// Attempt up to 5 times to unsilence the cluster
Expand All @@ -162,7 +163,7 @@ func (sr *SilenceRequest) Remove() error {
}

// WillExpireBy returns bool if the remaining time on the AlertManager Silence is less than the expiryPeriod
func (sr *SilenceRequest) WillExpireBy(expiryPeriod time.Duration) (bool, error) {
func (sr *Request) WillExpireBy(expiryPeriod time.Duration) (bool, error) {
// Parse end time of Alertmanager Silence
end, err := time.Parse(time.RFC3339, sr.EndsAt)
if err != nil {
Expand All @@ -179,6 +180,6 @@ func (sr *SilenceRequest) WillExpireBy(expiryPeriod time.Duration) (bool, error)
}

// Active returns a bool if the silence is Active
func (sr *SilenceRequest) Active() bool {
return sr.Status.State == "active"
func (sr *Request) Active() bool {
return sr.ID != "" && sr.Status.State == "active"
}

0 comments on commit 9fe9510

Please sign in to comment.