Commit
add argument to invert the behavior of alert-filter-regexp (#786)
* add argument to invert the behavior of alert-filter-regexp

Signed-off-by: Jim Liming <[email protected]>

* feat: small code-improvements

Signed-off-by: Christian Kotzbauer <[email protected]>

---------

Signed-off-by: Jim Liming <[email protected]>
Signed-off-by: Christian Kotzbauer <[email protected]>
Co-authored-by: Christian Kotzbauer <[email protected]>
jimliming and ckotzbauer authored Aug 14, 2023
1 parent 3b9b190 commit 9a4b8fd
Showing 4 changed files with 86 additions and 48 deletions.
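
As a reading aid (not part of the commit), here is a minimal, self-contained Go sketch of the new flag's semantics. The helper mirrors the matchesRegex function added in pkg/alerts/prometheus.go below; the alert names are purely illustrative.

package main

import (
	"fmt"
	"regexp"
)

// matchesFilter mirrors the matchesRegex helper added in pkg/alerts/prometheus.go.
// With filterMatchOnly=false (the default) an alert blocks a reboot unless its name
// matches the filter; with filterMatchOnly=true an alert blocks a reboot only when
// its name matches the filter.
func matchesFilter(filter *regexp.Regexp, alertName string, filterMatchOnly bool) bool {
	if filter == nil {
		return true
	}
	return filter.MatchString(alertName) == filterMatchOnly
}

func main() {
	filter := regexp.MustCompile(`^RebootRequired$`)

	// Default (ignore) mode: RebootRequired is filtered out, any other alert blocks.
	fmt.Println(matchesFilter(filter, "RebootRequired", false))      // false
	fmt.Println(matchesFilter(filter, "KubePodCrashLooping", false)) // true

	// --alert-filter-match-only=true: only RebootRequired blocks.
	fmt.Println(matchesFilter(filter, "RebootRequired", true))      // true
	fmt.Println(matchesFilter(filter, "KubePodCrashLooping", true)) // false
}
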
10 changes: 7 additions & 3 deletions cmd/kured/main.go
@@ -60,6 +60,7 @@ var (
prometheusURL string
preferNoScheduleTaintName string
alertFilter *regexp.Regexp
alertFilterMatchOnly bool
alertFiringOnly bool
rebootSentinelFile string
rebootSentinelCommand string
@@ -158,6 +159,8 @@ func NewRootCommand() *cobra.Command {
"Prometheus instance to probe for active alerts")
rootCmd.PersistentFlags().Var(&regexpValue{&alertFilter}, "alert-filter-regexp",
"alert names to ignore when checking for active alerts")
rootCmd.PersistentFlags().BoolVar(&alertFilterMatchOnly, "alert-filter-match-only", false,
"Only block if the alert-filter-regexp matches active alerts")
rootCmd.PersistentFlags().BoolVar(&alertFiringOnly, "alert-firing-only", false,
"only consider firing alerts when checking for active alerts")
rootCmd.PersistentFlags().StringVar(&rebootSentinelFile, "reboot-sentinel", "/var/run/reboot-required",
@@ -358,6 +361,8 @@ type PrometheusBlockingChecker struct {
filter *regexp.Regexp
// bool to indicate if only firing alerts should be considered
firingOnly bool
// bool to indicate that we're only blocking on alerts which match the filter
filterMatchOnly bool
}

// KubernetesBlockingChecker contains info for connecting
@@ -371,8 +376,7 @@ type KubernetesBlockingChecker struct {
}

func (pb PrometheusBlockingChecker) isBlocked() bool {

alertNames, err := pb.promClient.ActiveAlerts(pb.filter, pb.firingOnly)
alertNames, err := pb.promClient.ActiveAlerts(pb.filter, pb.firingOnly, pb.filterMatchOnly)
if err != nil {
log.Warnf("Reboot blocked: prometheus query error: %v", err)
return true
@@ -765,7 +769,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s

var blockCheckers []RebootBlocker
if prometheusURL != "" {
blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promClient: promClient, filter: alertFilter, firingOnly: alertFiringOnly})
blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promClient: promClient, filter: alertFilter, firingOnly: alertFiringOnly, filterMatchOnly: alertFilterMatchOnly})
}
if podSelectors != nil {
blockCheckers = append(blockCheckers, KubernetesBlockingChecker{client: client, nodename: nodeID, filter: podSelectors})
1 change: 1 addition & 0 deletions kured-ds.yaml
@@ -61,6 +61,7 @@ spec:
# - --lock-ttl=0
# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local
# - --alert-filter-regexp=^RebootRequired$
# - --alert-filter-match-only=false
# - --alert-firing-only=false
# - --reboot-sentinel=/var/run/reboot-required
# - --prefer-no-schedule-taint=""
12 changes: 10 additions & 2 deletions pkg/alerts/prometheus.go
@@ -36,7 +36,7 @@ func NewPromClient(conf papi.Config) (*PromClient, error) {
// filter by regexp means when the regex finds the alert-name; the alert is excluded from the
// block-list and will NOT block rebooting. query by includeLabel means,
// if the query finds an alert, it will include it to the block-list and it WILL block rebooting.
func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]string, error) {
func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly, filterMatchOnly bool) ([]string, error) {

// get all alerts from prometheus
value, _, err := p.api.Query(context.Background(), "ALERTS", time.Now())
@@ -49,7 +49,7 @@ func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]str
activeAlertSet := make(map[string]bool)
for _, sample := range vector {
if alertName, isAlert := sample.Metric[model.AlertNameLabel]; isAlert && sample.Value != 0 {
if (filter == nil || !filter.MatchString(string(alertName))) && (!firingOnly || sample.Metric["alertstate"] == "firing") {
if matchesRegex(filter, string(alertName), filterMatchOnly) && (!firingOnly || sample.Metric["alertstate"] == "firing") {
activeAlertSet[string(alertName)] = true
}
}
@@ -67,3 +67,11 @@ func (p *PromClient) ActiveAlerts(filter *regexp.Regexp, firingOnly bool) ([]str

return nil, fmt.Errorf("Unexpected value type: %v", value)
}

func matchesRegex(filter *regexp.Regexp, alertName string, filterMatchOnly bool) bool {
if filter == nil {
return true
}

return filter.MatchString(string(alertName)) == filterMatchOnly
}
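
For context (again, not part of the diff), a hedged sketch of how a caller passes the extra argument to ActiveAlerts after this change. The import paths and the Prometheus address are assumptions for illustration; only the three-argument call shape comes from the diff above.

package main

import (
	"fmt"
	"log"
	"regexp"

	papi "github.com/prometheus/client_golang/api"

	"github.com/kubereboot/kured/pkg/alerts"
)

func main() {
	// Assumed Prometheus endpoint; replace with your own instance.
	client, err := alerts.NewPromClient(papi.Config{Address: "http://prometheus.monitoring.svc.cluster.local"})
	if err != nil {
		log.Fatal(err)
	}

	filter := regexp.MustCompile("^RebootRequired$")

	// firingOnly=false, filterMatchOnly=true: only active alerts whose names match
	// the filter are returned, so only RebootRequired would block a reboot.
	blocking, err := client.ActiveAlerts(filter, false, true)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(blocking)
}
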
111 changes: 68 additions & 43 deletions pkg/alerts/prometheus_test.go
@@ -45,62 +45,87 @@ func TestActiveAlerts(t *testing.T) {
addr := "http://localhost:10001"

for _, tc := range []struct {
it string
rFilter string
respBody string
aName string
wantN int
firingOnly bool
it string
rFilter string
respBody string
aName string
wantN int
firingOnly bool
filterMatchOnly bool
}{
{
it: "should return no active alerts",
respBody: responsebody,
rFilter: "",
wantN: 0,
firingOnly: false,
it: "should return no active alerts",
respBody: responsebody,
rFilter: "",
wantN: 0,
firingOnly: false,
filterMatchOnly: false,
},
{
it: "should return a subset of all alerts",
respBody: responsebody,
rFilter: "Pod",
wantN: 3,
firingOnly: false,
it: "should return a subset of all alerts",
respBody: responsebody,
rFilter: "Pod",
wantN: 3,
firingOnly: false,
filterMatchOnly: false,
},
{
it: "should return all active alerts by regex",
respBody: responsebody,
rFilter: "*",
wantN: 5,
firingOnly: false,
it: "should return a subset of all alerts",
respBody: responsebody,
rFilter: "Gatekeeper",
wantN: 1,
firingOnly: false,
filterMatchOnly: true,
},
{
it: "should return all active alerts by regex filter",
respBody: responsebody,
rFilter: "*",
wantN: 5,
firingOnly: false,
it: "should return all active alerts by regex",
respBody: responsebody,
rFilter: "*",
wantN: 5,
firingOnly: false,
filterMatchOnly: false,
},
{
it: "should return only firing alerts if firingOnly is true",
respBody: responsebody,
rFilter: "*",
wantN: 4,
firingOnly: true,
it: "should return all active alerts by regex filter",
respBody: responsebody,
rFilter: "*",
wantN: 5,
firingOnly: false,
filterMatchOnly: false,
},
{
it: "should return ScheduledRebootFailing active alerts",
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
aName: "ScheduledRebootFailing",
rFilter: "*",
wantN: 1,
firingOnly: false,
it: "should return only firing alerts if firingOnly is true",
respBody: responsebody,
rFilter: "*",
wantN: 4,
firingOnly: true,
filterMatchOnly: false,
},

{
it: "should return ScheduledRebootFailing active alerts",
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"ScheduledRebootFailing","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
aName: "ScheduledRebootFailing",
rFilter: "*",
wantN: 1,
firingOnly: false,
filterMatchOnly: false,
},
{
it: "should not return an active alert if RebootRequired is firing (regex filter)",
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
rFilter: "RebootRequired",
wantN: 0,
firingOnly: false,
filterMatchOnly: false,
},
{
it: "should not return an active alert if RebootRequired is firing (regex filter)",
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
rFilter: "RebootRequired",
wantN: 0,
firingOnly: false,
it: "should not return an active alert if RebootRequired is firing (regex filter)",
respBody: `{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"ALERTS","alertname":"RebootRequired","alertstate":"pending","severity":"warning","team":"platform-infra"},"value":[1622472933.973,"1"]}]}}`,
rFilter: "RebootRequired",
wantN: 1,
firingOnly: false,
filterMatchOnly: true,
},
} {
// Start mockServer
@@ -125,7 +150,7 @@ func TestActiveAlerts(t *testing.T) {
log.Fatal(err)
}

result, err := p.ActiveAlerts(regex, tc.firingOnly)
result, err := p.ActiveAlerts(regex, tc.firingOnly, tc.filterMatchOnly)
if err != nil {
log.Fatal(err)
}
