Skip to content

Commit

Permalink
fix monitoring agent down alert (#1422)
Browse files Browse the repository at this point in the history
* fix monitoring agent down alert

* add old tests back
  • Loading branch information
QuentinBisson authored Nov 12, 2024
1 parent bf0d4f5 commit ef8a8d6
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 156 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Fixed

- Fix `MonitoringAgentDown` to page when both prometheus-agent and alloy-metrics jobs are missing.

## [4.24.0] - 2024-11-12

### Added
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,8 @@ spec:
"(.*)"
) == 1
) by (cluster_id, installation, pipeline, provider) > 0
unless on (cluster_id) (
count(up{job="alloy-metrics"} > 0) by (cluster_id)
unless on (cluster_id) (
count(up{job=~"alloy-metrics|prometheus-agent"} > 0) by (cluster_id)
)
for: 20m
labels:
Expand Down Expand Up @@ -140,8 +140,8 @@ spec:
"(.*)"
) == 1
) by (cluster_id, installation, pipeline, provider) > 0
unless on (cluster_id) (
count(up{job="alloy-metrics"} > 0) by (cluster_id)
unless on (cluster_id) (
count(up{job=~"alloy-metrics|prometheus-agent"} > 0) by (cluster_id)
)
for: 2m
labels:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ spec:
groups:
- name: prometheus-agent
rules:
{{- if not .Values.mimir.enabled }}
## This alert pages if prometheus-agent fails to send samples to its remote write endpoint.
- alert: PrometheusAgentFailing
annotations:
description: '{{`Prometheus agent remote write is failing.`}}'
summary: Prometheus agent fails to send samples to remote write endpoint.
opsrecipe: prometheus-agent/
dashboard: promRW001/prometheus-remote-write
{{- if not .Values.mimir.enabled }}
expr: |-
max_over_time(
sum by (cluster_type, cluster_id, installation, instance, service)
Expand All @@ -26,20 +26,6 @@ spec:
absent(up{instance="prometheus-agent"}) == 1
)[5m:]
)
{{- else }}
expr: |-
(
label_replace(
capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
"cluster_id",
"$1",
"name",
"(.*)"
) == 1
) unless on (cluster_id) (
count(up{job="prometheus-agent"} > 0) by (cluster_id)
)
{{- end }}
for: 20m
labels:
area: platform
Expand All @@ -58,7 +44,6 @@ spec:
summary: Prometheus agent fails to send samples to remote write endpoint.
opsrecipe: prometheus-agent/
dashboard: promRW001/prometheus-remote-write
{{- if not .Values.mimir.enabled }}
expr: |-
max_over_time(
sum by (cluster_type, cluster_id, installation, instance, service)
Expand All @@ -68,20 +53,6 @@ spec:
absent(up{instance="prometheus-agent"}) == 1
)[5m:]
)
{{- else }}
expr: |-
(
label_replace(
capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
"cluster_id",
"$1",
"name",
"(.*)"
) == 1
) unless on (cluster_id) (
count(up{job="prometheus-agent"} > 0) by (cluster_id)
)
{{- end }}
for: 2m
labels:
area: platform
Expand All @@ -92,6 +63,7 @@ spec:
cancel_if_cluster_is_not_running_monitoring_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
{{- end }}
## This alert pages if one of the prometheus-agent shards is not running.
- alert: PrometheusAgentShardsMissing
annotations:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,128 +4,6 @@ rule_files:
- prometheus-agent.rules.yml

tests:
# Tests for `PrometheusAgentFailing` alert
# One scenario, evaluated at 30m, 90m and 150m, checking both the paging alert
# and its `InhibitionPrometheusAgentFailing` counterpart for cluster "gauss".
- interval: 1m
input_series:
# `up` for the prometheus-agent job, in promtool expanding notation: no samples
# at first (`_x60`), then a run of 0, then a run of 1. With a 1m interval each
# phase lasts roughly one hour.
- series: 'up{instance="prometheus-agent",cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", job="prometheus-agent"}'
values: "_x60 0+0x60 1+0x60"
# ControlPlaneReady condition held at a constant 1 for the whole run.
- series: 'capi_cluster_status_condition{ cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", status="True", type="ControlPlaneReady", name="gauss"}'
values: "1+0x180"
alert_rule_test:
# 30m: `up` has no samples yet; the paging alert is expected.
- alertname: PrometheusAgentFailing
eval_time: 30m
exp_alerts:
- exp_labels:
area: platform
severity: page
team: atlas
topic: observability
inhibit_monitoring_agent_down: "true"
cancel_if_cluster_has_no_workers: "true"
cancel_if_cluster_is_not_running_monitoring_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cluster_id: "gauss"
cluster_type: "workload_cluster"
customer: "giantswarm"
installation: "myinstall"
name: "gauss"
pipeline: "testing"
provider: "capa"
region: "eu-west-2"
status: "True"
type: "ControlPlaneReady"
exp_annotations:
dashboard: "promRW001/prometheus-remote-write"
description: "Prometheus agent remote write is failing."
opsrecipe: "prometheus-agent/"
summary: "Prometheus agent fails to send samples to remote write endpoint."
# 30m: the inhibition alert is expected too. Its expected labels use
# `severity: none` and omit `cancel_if_cluster_has_no_workers`.
- alertname: InhibitionPrometheusAgentFailing
eval_time: 30m
exp_alerts:
- exp_labels:
area: platform
severity: none
team: atlas
topic: observability
inhibit_monitoring_agent_down: "true"
cancel_if_cluster_is_not_running_monitoring_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cluster_id: "gauss"
cluster_type: "workload_cluster"
customer: "giantswarm"
installation: "myinstall"
name: "gauss"
pipeline: "testing"
provider: "capa"
region: "eu-west-2"
status: "True"
type: "ControlPlaneReady"
exp_annotations:
dashboard: "promRW001/prometheus-remote-write"
description: "Prometheus agent remote write is failing."
opsrecipe: "prometheus-agent/"
summary: "Prometheus agent fails to send samples to remote write endpoint."
# 90m: `up` is now reported as 0; the paging alert is still expected.
- alertname: PrometheusAgentFailing
eval_time: 90m
exp_alerts:
- exp_labels:
area: platform
cluster_id: gauss
cluster_type: workload_cluster
severity: page
team: atlas
topic: observability
inhibit_monitoring_agent_down: "true"
installation: myinstall
cancel_if_cluster_has_no_workers: "true"
cancel_if_cluster_is_not_running_monitoring_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
customer: "giantswarm"
name: "gauss"
pipeline: "testing"
provider: "capa"
region: "eu-west-2"
status: "True"
type: "ControlPlaneReady"
exp_annotations:
dashboard: "promRW001/prometheus-remote-write"
description: "Prometheus agent remote write is failing."
opsrecipe: "prometheus-agent/"
summary: "Prometheus agent fails to send samples to remote write endpoint."
# 90m: the inhibition alert is still expected as well.
- alertname: InhibitionPrometheusAgentFailing
eval_time: 90m
exp_alerts:
- exp_labels:
area: platform
cluster_id: gauss
cluster_type: workload_cluster
severity: none
team: atlas
topic: observability
inhibit_monitoring_agent_down: "true"
installation: myinstall
cancel_if_cluster_is_not_running_monitoring_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
customer: "giantswarm"
name: "gauss"
pipeline: "testing"
provider: "capa"
region: "eu-west-2"
status: "True"
type: "ControlPlaneReady"
exp_annotations:
dashboard: "promRW001/prometheus-remote-write"
description: "Prometheus agent remote write is failing."
opsrecipe: "prometheus-agent/"
summary: "Prometheus agent fails to send samples to remote write endpoint."
# 150m: `up` is 1 again. No `exp_alerts` are listed for either alert, so
# neither is expected to fire.
- alertname: PrometheusAgentFailing
eval_time: 150m
- alertname: InhibitionPrometheusAgentFailing
eval_time: 150m
# Tests for `PrometheusAgentShardsMissing` alert
- interval: 1m
input_series:
Expand Down

0 comments on commit ef8a8d6

Please sign in to comment.