Add alert to monitor Shield pods restarts (#1438)

* Add alert to monitor Trivy pod restarts * Make alert general for all Shield pods * Add more timeseries * Fix description * Try another timeseries * Check at 91m * Increase the whole time * Update test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml Co-authored-by: Hervé Nicol <[email protected]> * Fix opsrecipe * Fix test labels --------- Co-authored-by: Hervé Nicol <[email protected]>
giantswarm · Nov 26, 2024 · 1ff758d · 1ff758d
1 parent 4e41119
commit 1ff758d
Show file tree

Hide file tree

Showing 3 changed files with 62 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,8 +12,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add `cloud-provider-controller.rules` to monitor the cloud-provider-controller components across providers.
 - Add alerts to monitor the `HelmReleases` for `cilium` and `coredns`.
 - Add alert to monitor the `HelmRelease` for the `vertical-pod-autoscaler-crd` app.
+- Add `ShieldComponentRestartingTooOften` alert to monitor restarts of Shield pods (Trivy, Kyverno, Falco).
 - Add `MimirRulerTooManyFailedQueries` alert to detect when Mimir ruler is failing to evaluate rules
 
+
 ### Fixed
 
 - Fix dashboard link for `MimirContinuousTestFailing` alert

diff --git a/helm/prometheus-rules/templates/platform/shield/alerting-rules/general.rules.yml b/helm/prometheus-rules/templates/platform/shield/alerting-rules/general.rules.yml
@@ -0,0 +1,27 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: kyverno.rules  # NOTE(review): file holds general Shield rules; confirm this name is intended and unique
+  namespace: {{ .Values.namespace  }}
+spec:
+  groups:
+  - name: general
+    rules:
+    - alert: ShieldComponentRestartingTooOften  # restart counter of a trivy-*, kyverno-* or falco-* pod rose by more than 5 over 1h
+      annotations:
+        description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}'
+        opsrecipe: shield-pod-failing
+      expr: increase(kube_pod_container_status_restarts_total{cluster_type="workload_cluster", pod=~"trivy-.*|kyverno-.*|falco-.*"}[1h]) > 5
+      for: 30m  # condition must hold for 30 minutes before the alert fires
+      labels:
+        area: platform
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
+        severity: notify
+        team: shield
+        topic: security
diff --git a/test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml b/test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml
@@ -0,0 +1,33 @@
+---
+rule_files:
+  - general.rules.yml
+tests:
+  - interval: 1m
+    input_series:
+      # Trivy pod whose restart counter grows by one at every 1m sample
+      - series: 'kube_pod_container_status_restarts_total{cluster_id="golem", cluster_type="workload_cluster", installation="golem", namespace="security", pipeline="stable", pod="trivy-0", provider="capa"}'
+        values: '0+1x120'
+    alert_rule_test:
+      # The alert is expected to be firing for the Trivy pod at 91m
+      - alertname: ShieldComponentRestartingTooOften
+        eval_time: 91m
+        exp_alerts:
+          - exp_annotations:
+              description: "Pod security/trivy-0 is restarting too often."
+              opsrecipe: shield-pod-failing
+            exp_labels:
+              area: platform
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "false"
+              cluster_id: golem
+              cluster_type: workload_cluster
+              installation: golem
+              namespace: security
+              pipeline: stable
+              pod: trivy-0
+              provider: capa
+              severity: notify
+              team: shield
+              topic: security