Skip to content

Commit

Permalink
feature: added alert manager client and flags
Browse files Browse the repository at this point in the history
	Signed-off-by: Alexei Tighineanu <[email protected]>
  • Loading branch information
atighineanu committed Mar 4, 2024
1 parent d0bdc11 commit 7779ec1
Show file tree
Hide file tree
Showing 4 changed files with 229 additions and 6 deletions.
19 changes: 13 additions & 6 deletions cmd/kured/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,13 @@ var (
postRebootNodeLabels []string
nodeID string
concurrency int

rebootDays []string
rebootStart string
rebootEnd string
timezone string
annotateNodes bool
alertManagerURL string
alertManagerToken string
rebootDays []string
rebootStart string
rebootEnd string
timezone string
annotateNodes bool

// Metrics
rebootRequiredGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Expand Down Expand Up @@ -207,6 +208,11 @@ func NewRootCommand() *cobra.Command {
rootCmd.PersistentFlags().StringVar(&messageTemplateReboot, "message-template-reboot", "Rebooting node %s",
"message template used to notify about a node being rebooted")

rootCmd.PersistentFlags().StringVar(&alertManagerURL, "alert-manager-url", "",
"alertmanager URL for getting silencers")
rootCmd.PersistentFlags().StringVar(&alertManagerToken, "alert-manager-token", "",
"alertmanager token for authenticating")

rootCmd.PersistentFlags().StringArrayVar(&podSelectors, "blocking-pod-selector", nil,
"label selector identifying pods whose presence should prevent reboots")

Expand Down Expand Up @@ -387,6 +393,7 @@ func (pb PrometheusBlockingChecker) isBlocked() bool {
if count > 10 {
alertNames = append(alertNames[:10], "...")
}

if count > 0 {
log.Warnf("Reboot blocked: %d active alerts: %v", count, alertNames)
return true
Expand Down
2 changes: 2 additions & 0 deletions kured-ds-signal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,5 @@ spec:
# - --annotate-nodes=false
# - --lock-release-delay=30m
# - --log-format=text
# - --alert-manager-url=""
# - --alert-manager-token=""
119 changes: 119 additions & 0 deletions pkg/alerts/alertmanager/client.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
package alertmanager

import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/url"
"path"
"time"
)

const (
// alertManagerPathPrefix is the API path under which the BuildURL methods
// in this file place the "status" and "silences" endpoints.
alertManagerPathPrefix = "/api/v2"
// defaultTimeOut is the deadline applied, through a per-request context,
// to every HTTP call issued by the Get methods in this file.
defaultTimeOut = 10 * time.Second
)

// New returns a Client for the Alertmanager instance at alertManagerURL.
//
// alertManagerToken is stored on the client unchanged. An empty
// alertManagerURL is rejected: the returned client is nil and the error is
// non-nil.
func New(alertManagerURL, alertManagerToken string) (*Client, error) {
	if len(alertManagerURL) == 0 {
		return nil, fmt.Errorf("no alert manager url found")
	}
	client := &Client{
		HostURL: alertManagerURL,
		Token:   alertManagerToken,
		Client:  &http.Client{},
	}
	return client, nil
}

// Status returns a StatusEndpoint that carries a copy of the client.
// The endpoint's FullURL is left empty.
func (c *Client) Status() *StatusEndpoint {
	endpoint := new(StatusEndpoint)
	endpoint.Client = *c
	return endpoint
}

// Silences returns a SilencesEndpoint that carries a copy of the client.
// The endpoint's FullURL is left empty.
func (c *Client) Silences() *SilencesEndpoint {
	endpoint := new(SilencesEndpoint)
	endpoint.Client = *c
	return endpoint
}

// BuildURL computes the full URL of the status endpoint from HostURL and
// stores it in FullURL.
//
// Any path already present in HostURL (for example a route prefix or a
// reverse-proxy sub-path) is kept, and alertManagerPathPrefix plus "status"
// is appended to it. HostURL is therefore expected to point at the
// Alertmanager base URL, not at the API path itself.
//
// It returns an error if HostURL cannot be parsed.
func (s *StatusEndpoint) BuildURL() error {
	// Named "parsed" rather than "url" so the net/url package is not shadowed.
	parsed, err := url.Parse(s.HostURL)
	if err != nil {
		return err
	}
	// The leading "/" keeps the result rooted even when HostURL has no path.
	parsed.Path = path.Join("/", parsed.Path, alertManagerPathPrefix, "status")
	s.FullURL = parsed.String()
	return nil
}

// Get queries the status endpoint and decodes the JSON reply.
//
// The request is bounded by defaultTimeOut. An error is returned when the
// URL cannot be built, the request cannot be created or sent, the server
// answers with a status other than 200 OK, or the body cannot be decoded
// into a StatusResponse.
func (s *StatusEndpoint) Get() (*StatusResponse, error) {
	if err := s.BuildURL(); err != nil {
		return nil, err
	}
	ctx, cancel := context.WithTimeout(context.Background(), defaultTimeOut)
	defer cancel()
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, s.FullURL, nil)
	if err != nil {
		return nil, err
	}
	// NOTE(review): s.Token is not attached to the request, so the configured
	// token currently has no effect. The expected auth scheme is not visible
	// here - confirm it before adding an Authorization header.
	response, err := s.Client.Client.Do(request)
	if err != nil {
		return nil, err
	}
	// Close the body on every return path so the connection is released.
	defer response.Body.Close()

	// Reject error replies instead of decoding them into an empty result.
	if response.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("alert manager status request to %s: unexpected response status %q", s.FullURL, response.Status)
	}

	responseObject := new(StatusResponse)
	if err := json.NewDecoder(response.Body).Decode(responseObject); err != nil {
		return nil, fmt.Errorf("decoding alert manager status response: %w", err)
	}
	return responseObject, nil
}

// BuildURL computes the full URL of the silences endpoint from HostURL and
// stores it in FullURL.
//
// Any path already present in HostURL (for example a route prefix or a
// reverse-proxy sub-path) is kept, and alertManagerPathPrefix plus
// "silences" is appended to it. HostURL is therefore expected to point at
// the Alertmanager base URL, not at the API path itself.
//
// It returns an error if HostURL cannot be parsed.
func (s *SilencesEndpoint) BuildURL() error {
	// Named "parsed" rather than "url" so the net/url package is not shadowed.
	parsed, err := url.Parse(s.HostURL)
	if err != nil {
		return err
	}
	// The leading "/" keeps the result rooted even when HostURL has no path.
	parsed.Path = path.Join("/", parsed.Path, alertManagerPathPrefix, "silences")
	s.FullURL = parsed.String()
	return nil
}

// Get lists the silences known to the alert manager.
//
// The request is bounded by defaultTimeOut. An error is returned when the
// URL cannot be built, the request cannot be created or sent, the server
// answers with a status other than 200 OK, the body cannot be decoded into
// a slice of GettableSilence, or ValidateStatus rejects one of the decoded
// silences.
func (s *SilencesEndpoint) Get() ([]GettableSilence, error) {
	if err := s.BuildURL(); err != nil {
		return nil, err
	}
	ctx, cancel := context.WithTimeout(context.Background(), defaultTimeOut)
	defer cancel()
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, s.FullURL, nil)
	if err != nil {
		return nil, err
	}
	// NOTE(review): s.Token is not attached to the request, so the configured
	// token currently has no effect. The expected auth scheme is not visible
	// here - confirm it before adding an Authorization header.
	response, err := s.Client.Client.Do(request)
	if err != nil {
		return nil, err
	}
	// Close the body on every return path so the connection is released.
	defer response.Body.Close()

	// Reject error replies instead of trying to decode them as a silence list.
	if response.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("alert manager silences request to %s: unexpected response status %q", s.FullURL, response.Status)
	}

	responseObject := make([]GettableSilence, 0)
	if err := json.NewDecoder(response.Body).Decode(&responseObject); err != nil {
		return nil, fmt.Errorf("decoding alert manager silences response: %w", err)
	}
	if err := ValidateStatus(responseObject); err != nil {
		return nil, err
	}
	return responseObject, nil
}
95 changes: 95 additions & 0 deletions pkg/alerts/alertmanager/types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package alertmanager

import (
"fmt"
"net/http"
)

// silenceStates is the set of silence states accepted by
// SilenceStatus.Validate; a state that is not a key here is rejected.
var silenceStates = map[string]bool{
	"active":  true,
	"expired": true,
	"pending": true,
}

// Client holds the connection settings used to talk to an alert manager.
type Client struct {
// Token is the authentication token handed to New.
// NOTE(review): nothing visible in this package sends it with a request - confirm.
Token string `json:"token" yaml:"token"`
// HostURL is the alert manager URL from which the endpoint URLs are built.
HostURL string `json:"hostUrl" yaml:"hostUrl"`
// Client is the HTTP client used to execute the requests.
Client *http.Client `json:"client" yaml:"client"`
}

// StatusEndpoint is the status endpoint of the alert manager client.
// It embeds Client, so the client's fields are reachable on the endpoint.
type StatusEndpoint struct {
Client `json:"alertmanagerClient" yaml:"alertmanagerClient"`
// FullURL is the complete request URL; it is written by BuildURL.
FullURL string `json:"fullUrl" yaml:"fullUrl"`
}

// SilencesEndpoint is the silences endpoint of the alert manager client.
// It embeds Client, so the client's fields are reachable on the endpoint.
type SilencesEndpoint struct {
Client `json:"alertmanagerClient" yaml:"alertmanagerClient"`
// FullURL is the complete request URL; it is written by BuildURL.
FullURL string `json:"fullUrl" yaml:"fullUrl"`
}

// StatusResponse is the object returned when sending a
// GET $(host_url)$(path_prefix)/status request.
type StatusResponse struct {
	// Cluster describes the alert manager cluster and its peers.
	Cluster ClusterStatus `json:"cluster" yaml:"cluster"`
	// VersionInfo carries the build and version details of the server.
	VersionInfo VersionInfo `json:"versionInfo" yaml:"versionInfo"`
	// Config is the configuration reported by the server. The Alertmanager
	// v2 status payload names this field "config" ("alertmanagerConfig" is
	// only the name of its schema), so the JSON tag must be "config" for the
	// field to be populated. The YAML key is left as it was.
	Config Config `json:"config" yaml:"alertmanagerConfig"`
	// Uptime is kept as the raw string sent by the server.
	Uptime string `json:"uptime" yaml:"uptime"`
}

// ClusterStatus is the status of the cluster, as carried in the "cluster"
// field of StatusResponse.
type ClusterStatus struct {
Name string `json:"name" yaml:"name"`
Status string `json:"status" yaml:"status"`
// Peers lists the peers reported for the cluster.
Peers []PeerStatus `json:"peers" yaml:"peers"`
}

// PeerStatus is part of the get status response; it is the element type of
// ClusterStatus.Peers.
type PeerStatus struct {
Name string `json:"name" yaml:"name"`
Address string `json:"address" yaml:"address"`
}

// VersionInfo contains various go and alert manager version info.
type VersionInfo struct {
	Version   string `json:"version" yaml:"version"`
	Revision  string `json:"revision" yaml:"revision"`
	Branch    string `json:"branch" yaml:"branch"`
	BuildUser string `json:"buildUser" yaml:"buildUser"`
	// BuildData holds the build date. The Alertmanager v2 payload uses the
	// key "buildDate", so the JSON tag must be "buildDate" for the field to
	// be populated. The Go field name and the YAML key are kept unchanged so
	// existing callers keep working.
	BuildData string `json:"buildDate" yaml:"buildData"`
	GoVersion string `json:"goVersion" yaml:"goVersion"`
}

// Config wraps the configuration text carried in StatusResponse.
type Config struct {
// Original is read from the "original" key of the payload.
Original string `json:"original" yaml:"original"`
}

// GettableSilence is one element of the response when sending a
// GET $(host_url)$(path_prefix)/silences request.
type GettableSilence struct {
ID string `json:"id" yaml:"id"`
// Status carries the silence state that ValidateStatus checks.
Status SilenceStatus `json:"status" yaml:"status"`
// UpdatedAt is kept as the raw string from the response; it is not parsed here.
UpdatedAt string `json:"updatedAt" yaml:"updatedAt"`
}

// SilenceStatus shows the state of the silence.
type SilenceStatus struct {
// State is checked against silenceStates by Validate.
State string `json:"state" yaml:"state"`
}

// Validate checks State against the silenceStates set. It returns nil for a
// known state and an error naming the offending state otherwise.
func (s SilenceStatus) Validate() error {
	if known := silenceStates[s.State]; known {
		return nil
	}
	return fmt.Errorf("such silence state does not exist: %s", s.State)
}

// ValidateStatus validates the status of every silence in g, in order, and
// returns the first validation error it meets. A nil or empty slice is valid.
func ValidateStatus(g []GettableSilence) error {
	for i := range g {
		if err := g[i].Status.Validate(); err != nil {
			return err
		}
	}
	return nil
}

0 comments on commit 7779ec1

Please sign in to comment.