add alerts for alloy-metrics
QuentinBisson committed Nov 5, 2024
1 parent 9e72664 commit 2f9c07c
Showing 6 changed files with 266 additions and 76 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `LoggingAgentDown` to be alerted when the logging agent is down.
- `LogForwardingErrors` to be alerted when the `loki.write` component is failing.
- `LogReceivingErrors` to be alerted when the gateway's `loki.source.api` components are failing.
- `MonitoringAgentDown` to be alerted when the monitoring agent is down.
- `MonitoringAgentShardsNotSatisfied` to be alerted when the monitoring agent is missing any number of desired shards.

### Changed

@@ -23,6 +25,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `observability-gateway`
- Move all `grafana-cloud`-related alerts to their own file.
- Move all Alloy-related alerts to the Alloy alert file and fix the alloy-logs tests.
- Rename and move the following alerts as they are not specific to Prometheus:
- `PrometheusCriticalJobScrapingFailure` => `CriticalJobScrapingFailure`
- `PrometheusJobScrapingFailure` => `JobScrapingFailure`
- `PrometheusFailsToCommunicateWithRemoteStorageAPI` => `MetricForwardingErrors`

## [4.23.0] - 2024-10-30

@@ -1,5 +1,5 @@
# This file describes common alloy alerting rules
# For alerts regarding monitoring and logging agents, please go to the respective files (logging.rules.yml and monitoring.rules.yml).
# For alerts regarding the monitoring pipeline and the logging pipeline, please go to the respective files (logging-pipeline.rules.yml and monitoring-pipeline.rules.yml).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
@@ -91,3 +91,103 @@ spec:
cancel_if_cluster_status_updating: "true"
cancel_if_node_unschedulable: "true"
cancel_if_node_not_ready: "true"
- name: alloy.metrics
rules:
# This alert pages if the monitoring agent fails to send samples to its remote write endpoint.
- alert: MonitoringAgentDown
annotations:
description: '{{`Monitoring agent fails to send samples.`}}'
summary: Monitoring agent fails to send samples to remote write endpoint.
opsrecipe: alloy/#monitoring-agent-down
dashboard: promRW001/prometheus-remote-write
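# The expression selects clusters whose control plane is ready according to CAPI
# (capi_cluster_status_condition) and fires when none of their alloy-metrics targets report up.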
expr: |-
count(
label_replace(
capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
"cluster_id",
"$1",
"name",
"(.*)"
) == 1
) by (cluster_id, installation, pipeline, provider) > 0
unless on (cluster_id) (
count(up{job="alloy-metrics"} > 0) by (cluster_id)
)
for: 20m
labels:
area: platform
severity: page
team: atlas
topic: observability
inhibit_monitoring_agent_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_has_no_workers: "true"
## Same as MonitoringAgentDown, but triggers inhibition earlier and does not page.
- alert: InhibitionMonitoringAgentDown
annotations:
description: '{{`Monitoring agent fails to send samples.`}}'
summary: Monitoring agent fails to send samples to remote write endpoint.
opsrecipe: alloy/#monitoring-agent-down
dashboard: promRW001/prometheus-remote-write
expr: |-
count(
label_replace(
capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
"cluster_id",
"$1",
"name",
"(.*)"
) == 1
) by (cluster_id, installation, pipeline, provider) > 0
unless on (cluster_id) (
count(up{job="alloy-metrics"} > 0) by (cluster_id)
)
for: 2m
labels:
area: platform
severity: none
team: atlas
topic: observability
inhibit_monitoring_agent_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
## This alert pages if any of the monitoring agent shards is not running.
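## The expression compares desired replicas with ready replicas of the alloy-metrics StatefulSet
## (from kube-state-metrics) and fires when at least one desired shard is not ready.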
- alert: MonitoringAgentShardsNotSatisfied
annotations:
description: '{{`At least one of the monitoring agent shards is missing.`}}'
summary: Monitoring agent is missing shards.
opsrecipe: alloy/#monitoring-agent-down
expr: |-
kube_statefulset_status_replicas{statefulset="alloy-metrics"}
- kube_statefulset_status_replicas_ready{statefulset="alloy-metrics"}
> 0
for: 40m
labels:
area: platform
severity: page
team: atlas
topic: observability
inhibit_monitoring_agent_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_outside_working_hours: "true"
## Same as MonitoringAgentShardsNotSatisfied, but triggers inhibition earlier and does not page.
- alert: InhibitionMonitoringAgentShardsNotSatisfied
annotations:
description: '{{`At least one of the monitoring agent shards is missing.`}}'
summary: Monitoring agent is missing shards.
opsrecipe: alloy/#monitoring-agent-down
expr: |-
kube_statefulset_status_replicas{statefulset="alloy-metrics"}
- kube_statefulset_status_replicas_ready{statefulset="alloy-metrics"}
> 0
for: 2m
labels:
area: platform
severity: none
team: atlas
topic: observability
inhibit_monitoring_agent_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
@@ -0,0 +1,80 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
{{- include "labels.common" . | nindent 4 }}
name: monitoring-pipeline.rules
namespace: {{ .Values.namespace }}
spec:
groups:
- name: monitoring-pipeline
rules:
- alert: MetricForwardingErrors
annotations:
description: '{{`Monitoring agent can''t communicate with the Remote Storage API at {{ $labels.url }}.`}}'
opsrecipe: monitoring-pipeline/
dashboard: promRW001/prometheus-remote-write
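# The expression fires when remote-write samples fail at a rate above 0.1/s, when no samples
# are sent at all, or when metadata requests are being retried.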
expr: |-
rate(prometheus_remote_storage_samples_failed_total[10m]) > 0.1
or rate(prometheus_remote_storage_samples_total[10m]) == 0
or rate(prometheus_remote_storage_metadata_retried_total[10m]) > 0
for: 1h
labels:
area: platform
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
- alert: JobScrapingFailure
annotations:
dashboard: servicemonitors-details/servicemonitors-details
description: '{{`Monitoring agents for cluster {{$labels.installation}}/{{$labels.cluster_id}} have failed to scrape all targets in the {{$labels.job}} job.`}}'
summary: Monitoring agent failed to scrape all targets in a job.
opsrecipe: monitoring-job-scraping-failure/
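# The expression computes, per job, the ratio of down targets to all targets and fires
# once every target of a job has been down for a day.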
expr: |-
(
count(up == 0) by (job, installation, cluster_id, provider, pipeline)
/
count(up) by (job, installation, cluster_id, provider, pipeline)
) >= 1
for: 1d
labels:
area: platform
severity: notify
team: atlas
topic: observability
cancel_if_outside_working_hours: "true"
- alert: CriticalJobScrapingFailure
annotations:
dashboard: servicemonitors-details/servicemonitors-details
description: '{{`Monitoring agents for cluster {{$labels.installation}}/{{$labels.cluster_id}} have failed to scrape all targets in the {{$labels.job}} job.`}}'
summary: Monitoring agent failed to scrape all targets in a job.
opsrecipe: monitoring-job-scraping-failure/
## We ignore the node exporters running on bastion hosts
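## The expression restricts the ratio to critical jobs (API server, controller-manager, scheduler,
## node-exporter, kube-state-metrics and the kubelet) and fires when every target of such a job is down.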
expr: |-
(
count(
(
up{job=~".*(apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics).*"}
or
up{job="kubelet", metrics_path="/metrics"}
) == 0
) by (job, installation, cluster_id, provider, pipeline)
/
count(
up{job=~".*(apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics).*"}
or
up{job="kubelet", metrics_path="/metrics"}
) by (job, installation, cluster_id, provider, pipeline)
) >= 1
for: 3d
labels:
area: platform
severity: page
team: atlas
topic: observability
cancel_if_outside_working_hours: "true"
cancel_if_cluster_is_not_running_monitoring_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"

@@ -1,3 +1,4 @@
# TODO(@giantswarm/team-atlas): revisit once vintage is gone
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
@@ -26,19 +27,6 @@ spec:
severity: page
team: atlas
topic: observability
- alert: PrometheusFailsToCommunicateWithRemoteStorageAPI
annotations:
description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}'
opsrecipe: prometheus-cant-communicate-with-remote-storage-api/
dashboard: promRW001/prometheus-remote-write
expr: rate(prometheus_remote_storage_samples_failed_total[10m]) > 0.1 or rate(prometheus_remote_storage_samples_total[10m]) == 0 or rate(prometheus_remote_storage_metadata_retried_total[10m]) > 0
for: 1h
labels:
area: platform
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
- alert: PrometheusRuleFailures
annotations:
description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to evaluate rule(s) {{ printf "%.2f" $value }} time(s).`}}
@@ -52,48 +40,3 @@ spec:
team: atlas
topic: observability
cancel_if_outside_working_hours: "true"
- alert: PrometheusJobScrapingFailure
annotations:
description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}}
summary: Prometheus fails to scrape all targets in a job.
opsrecipe: prometheus-job-scraping-failure/
expr: (count(up == 0) BY (job, installation, cluster_id, provider, pipeline) / count(up) BY (job, installation, cluster_id, provider, pipeline)) == 1
for: 1d
labels:
area: platform
severity: notify
team: atlas
topic: observability
cancel_if_outside_working_hours: "true"
- alert: PrometheusCriticalJobScrapingFailure
annotations:
description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}}
summary: Prometheus fails to scrape all targets in a job.
opsrecipe: prometheus-job-scraping-failure/
## We ignore bastion hosts node exporters
expr: |-
(
count(
(
up{job=~"apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics"}
or
up{job="kubelet", metrics_path="/metrics"}
) == 0
) BY (job, installation, cluster_id, provider, pipeline)
/
count(
up{job=~"apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics"}
or
up{job="kubelet", metrics_path="/metrics"}
) BY (job, installation, cluster_id, provider, pipeline)
) == 1
for: 3d
labels:
area: platform
severity: page
team: atlas
topic: observability
cancel_if_outside_working_hours: "true"
cancel_if_cluster_is_not_running_monitoring_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
@@ -86,6 +86,7 @@ tests:
eval_time: 80m
- alertname: AlloyForPrometheusRulesDown
eval_time: 160m

exp_alerts:
- exp_labels:
area: platform
@@ -226,3 +227,37 @@ tests:
dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview"
description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
opsrecipe: "alloy/"

# Test MonitoringAgentDown
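# Series notation (promtool): "_x20" is a leading run of missing samples, "1+0x70" a run of
# samples with value 1 and "0+0x70" a run of samples with value 0, sampled at the 1m interval.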
- interval: 1m
input_series:
- series: 'up{job="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}'
values: "_x20 1+0x70 0+0x70"
- series: 'capi_cluster_status_condition{type="ControlPlaneReady", status="True", name="gauss", installation="gauss", provider="aws", pipeline="testing"}'
values: "1x150"
alert_rule_test:
- alertname: MonitoringAgentDown
eval_time: 10m
- alertname: MonitoringAgentDown
eval_time: 80m
- alertname: MonitoringAgentDown
eval_time: 140m
exp_alerts:
- exp_labels:
area: platform
cancel_if_cluster_has_no_workers: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
inhibit_monitoring_agent_down: "true"
cluster_id: gauss
installation: gauss
provider: aws
pipeline: testing
severity: page
team: atlas
topic: observability
exp_annotations:
description: "Monitoring agent fails to send samples."
opsrecipe: "alloy/#monitoring-agent-down"
dashboard: "promRW001/prometheus-remote-write"
summary: "Monitoring agent fails to send samples to remote write endpoint."