diff --git a/cmd/kured/main.go b/cmd/kured/main.go
index 3ad1cbee2..ca8cfa42e 100644
--- a/cmd/kured/main.go
+++ b/cmd/kured/main.go
@@ -83,12 +83,13 @@ var (
 	postRebootNodeLabels []string
 	nodeID               string
 	concurrency          int
-
-	rebootDays    []string
-	rebootStart   string
-	rebootEnd     string
-	timezone      string
-	annotateNodes bool
+	alertManagerURL      string
+	alertManagerToken    string
+	rebootDays           []string
+	rebootStart          string
+	rebootEnd            string
+	timezone             string
+	annotateNodes        bool
 
 	// Metrics
 	rebootRequiredGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
@@ -207,6 +208,11 @@ func NewRootCommand() *cobra.Command {
 	rootCmd.PersistentFlags().StringVar(&messageTemplateReboot, "message-template-reboot", "Rebooting node %s",
 		"message template used to notify about a node being rebooted")
 
+	rootCmd.PersistentFlags().StringVar(&alertManagerURL, "alert-manager-url", "",
+		"alertmanager URL for getting silencers")
+	rootCmd.PersistentFlags().StringVar(&alertManagerToken, "alert-manager-token", "",
+		"alertmanager token for authenticating")
+
 	rootCmd.PersistentFlags().StringArrayVar(&podSelectors, "blocking-pod-selector", nil,
 		"label selector identifying pods whose presence should prevent reboots")
 
@@ -387,6 +393,7 @@ func (pb PrometheusBlockingChecker) isBlocked() bool {
 	if count > 10 {
 		alertNames = append(alertNames[:10], "...")
 	}
+
 	if count > 0 {
 		log.Warnf("Reboot blocked: %d active alerts: %v", count, alertNames)
 		return true
diff --git a/kured-ds-signal.yaml b/kured-ds-signal.yaml
index 54568b670..834437660 100644
--- a/kured-ds-signal.yaml
+++ b/kured-ds-signal.yaml
@@ -98,3 +98,5 @@ spec:
 #            - --annotate-nodes=false
 #            - --lock-release-delay=30m
 #            - --log-format=text
+#            - --alert-manager-url=""
+#            - --alert-manager-token=""
diff --git a/pkg/alerts/alertmanager/client.go b/pkg/alerts/alertmanager/client.go
new file mode 100644
--- /dev/null
+++ b/pkg/alerts/alertmanager/client.go
@@ -0,0 +1,134 @@
+package alertmanager
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"net/url"
+	"path"
+	"time"
+)
+
+const (
+	alertManagerPathPrefix = "/api/v2"
+	// the default context timeout for alert manager client
+	// feel free to change this value/set a corresponding env var if needed
+	defaultTimeOut = 10 * time.Second
+)
+
+// New is a constructor of AlertManagerClient
+//
+// if no url flag is given => error
+func New(alertManagerURL, alertManagerToken string) (*Client, error) {
+	if alertManagerURL == "" {
+		return nil, fmt.Errorf("no alert manager url found")
+	}
+	return &Client{
+		Token:   alertManagerToken,
+		HostURL: alertManagerURL,
+		Client:  new(http.Client),
+	}, nil
+}
+
+// Status builds the Status endpoint
+func (c *Client) Status() *StatusEndpoint {
+	return &StatusEndpoint{
+		Client: *c,
+	}
+}
+
+// Silences builds the Silences endpoint
+func (c *Client) Silences() *SilencesEndpoint {
+	return &SilencesEndpoint{
+		Client: *c,
+	}
+}
+
+// getJSON sends an authenticated GET request to fullURL and decodes the JSON
+// response body into out, which must be a non-nil pointer.
+//
+// The request is bounded by defaultTimeOut. The bearer token is only sent when
+// one is configured, and any non-200 response is reported as an error instead
+// of being handed to the JSON decoder.
+func (c *Client) getJSON(fullURL string, out interface{}) error {
+	httpClient := c.Client
+	if httpClient == nil {
+		httpClient = http.DefaultClient
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), defaultTimeOut)
+	defer cancel()
+	request, err := http.NewRequestWithContext(ctx, http.MethodGet, fullURL, nil)
+	if err != nil {
+		return err
+	}
+	if c.Token != "" {
+		// "Authorization" is the standard HTTP header for bearer credentials.
+		request.Header.Set("Authorization", "Bearer "+c.Token)
+	}
+	response, err := httpClient.Do(request)
+	if err != nil {
+		return err
+	}
+	// Always release the connection, whatever the outcome of the decode.
+	defer response.Body.Close()
+	if response.StatusCode != http.StatusOK {
+		return fmt.Errorf("alert manager: GET %s: unexpected status %s", fullURL, response.Status)
+	}
+	return json.NewDecoder(response.Body).Decode(out)
+}
+
+// BuildURL builds the full URL for Status Endpoint
+//
+// Any path already present in HostURL is replaced by the API path.
+func (s *StatusEndpoint) BuildURL() error {
+	u, err := url.Parse(s.HostURL)
+	if err != nil {
+		return err
+	}
+	u.Path = path.Join(alertManagerPathPrefix, "status")
+	s.FullURL = u.String()
+	return nil
+}
+
+// Get receives information about alert manager overall status
+func (s *StatusEndpoint) Get() (*StatusResponse, error) {
+	if err := s.BuildURL(); err != nil {
+		return nil, err
+	}
+	responseObject := new(StatusResponse)
+	if err := s.getJSON(s.FullURL, responseObject); err != nil {
+		return nil, err
+	}
+	return responseObject, nil
+}
+
+// BuildURL builds the full URL for silences Endpoint
+//
+// Any path already present in HostURL is replaced by the API path.
+func (s *SilencesEndpoint) BuildURL() error {
+	u, err := url.Parse(s.HostURL)
+	if err != nil {
+		return err
+	}
+	u.Path = path.Join(alertManagerPathPrefix, "silences")
+	s.FullURL = u.String()
+	return nil
+}
+
+// Get lists the silences
+//
+// Every returned silence is checked with ValidateStatus.
+func (s *SilencesEndpoint) Get() ([]GettableSilence, error) {
+	if err := s.BuildURL(); err != nil {
+		return nil, err
+	}
+	responseObject := make([]GettableSilence, 0)
+	if err := s.getJSON(s.FullURL, &responseObject); err != nil {
+		return nil, err
+	}
+	if err := ValidateStatus(responseObject); err != nil {
+		return nil, err
+	}
+	return responseObject, nil
+}
diff --git a/pkg/alerts/alertmanager/types.go b/pkg/alerts/alertmanager/types.go
new file mode 100644
--- /dev/null
+++ b/pkg/alerts/alertmanager/types.go
@@ -0,0 +1,99 @@
+package alertmanager
+
+import (
+	"fmt"
+	"net/http"
+)
+
+var (
+	silenceStates = map[string]bool{"expired": true, "active": true, "pending": true}
+)
+
+// Client is the object of the alert manager client
+type Client struct {
+	Token   string       `json:"token" yaml:"token"`
+	HostURL string       `json:"hostUrl" yaml:"hostUrl"`
+	Client  *http.Client `json:"client" yaml:"client"`
+}
+
+// StatusEndpoint is the status endpoint of the alert manager client
+type StatusEndpoint struct {
+	Client  `json:"alertmanagerClient" yaml:"alertmanagerClient"`
+	FullURL string `json:"fullUrl" yaml:"fullUrl"`
+}
+
+// SilencesEndpoint is the silences endpoint of the alert manager client
+type SilencesEndpoint struct {
+	Client  `json:"alertmanagerClient" yaml:"alertmanagerClient"`
+	FullURL string `json:"fullUrl" yaml:"fullUrl"`
+}
+
+// StatusResponse is the object returned when sending GET $(host_url)$(path_prefix)/status request
+//
+// Config is decoded from the "config" key of the response.
+type StatusResponse struct {
+	Cluster     ClusterStatus `json:"cluster" yaml:"cluster"`
+	VersionInfo VersionInfo   `json:"versionInfo" yaml:"versionInfo"`
+	Config      Config        `json:"config" yaml:"alertmanagerConfig"`
+	Uptime      string        `json:"uptime" yaml:"uptime"`
+}
+
+// ClusterStatus is the status of the cluster
+type ClusterStatus struct {
+	Name   string       `json:"name" yaml:"name"`
+	Status string       `json:"status" yaml:"status"`
+	Peers  []PeerStatus `json:"peers" yaml:"peers"`
+}
+
+// PeerStatus is part of get status response
+type PeerStatus struct {
+	Name    string `json:"name" yaml:"name"`
+	Address string `json:"address" yaml:"address"`
+}
+
+// VersionInfo contains various go and alert manager version info
+//
+// BuildData is decoded from the "buildDate" key of the response.
+type VersionInfo struct {
+	Version   string `json:"version" yaml:"version"`
+	Revision  string `json:"revision" yaml:"revision"`
+	Branch    string `json:"branch" yaml:"branch"`
+	BuildUser string `json:"buildUser" yaml:"buildUser"`
+	BuildData string `json:"buildDate" yaml:"buildData"`
+	GoVersion string `json:"goVersion" yaml:"goVersion"`
+}
+
+// Config contains a string
+type Config struct {
+	Original string `json:"original" yaml:"original"`
+}
+
+// GettableSilence is the response when sending GET $(host_url)$(path_prefix)/silences request
+type GettableSilence struct {
+	ID        string        `json:"id" yaml:"id"`
+	Status    SilenceStatus `json:"status" yaml:"status"`
+	UpdatedAt string        `json:"updatedAt" yaml:"updatedAt"`
+}
+
+// SilenceStatus shows the state of the silence
+type SilenceStatus struct {
+	State string `json:"state" yaml:"state"`
+}
+
+// Validate is validating if the status string corresponds to any of the pre-defined dict elements
+func (s SilenceStatus) Validate() error {
+	if !silenceStates[s.State] {
+		return fmt.Errorf("such silence state does not exist: %s", s.State)
+	}
+	return nil
+}
+
+// ValidateStatus is checking the whole slice of GettableSilences if silence.status has the right values
+func ValidateStatus(g []GettableSilence) error {
+	for _, silence := range g {
+		if err := silence.Status.Validate(); err != nil {
+			return err
+		}
+	}
+	return nil
+}