From 1ff758d8aebf026b2a94c5b9e8dc43eff6f63e8b Mon Sep 17 00:00:00 2001
From: Franco <48300215+fhielpos@users.noreply.github.com>
Date: Tue, 26 Nov 2024 17:21:47 +0100
Subject: [PATCH] Add alert to monitor Shield pods restarts (#1438)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add alert to monitor Trivy pod restarts

* Make alert general for all Shield pods

* Add more timeseries

* Fix description

* Try another timeseries

* Check at 91m

* Increase the whole time

* Update test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml

Co-authored-by: Hervé Nicol <hervenicol@users.noreply.github.com>

* Fix opsrecipe

* Fix test labels

---------

Co-authored-by: Hervé Nicol <hervenicol@users.noreply.github.com>
---
 CHANGELOG.md                                  |  2 ++
 .../shield/alerting-rules/general.rules.yml   | 27 +++++++++++++++
 .../alerting-rules/general.rules.test.yml     | 33 +++++++++++++++++++
 3 files changed, 62 insertions(+)
 create mode 100644 helm/prometheus-rules/templates/platform/shield/alerting-rules/general.rules.yml
 create mode 100644 test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f07fc10c..05c06f76 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,8 +12,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add `cloud-provider-controller.rules` to monitor the cloud-provider-controller components across providers.
 - Add alerts to monitor the `HelmReleases` for `cilium` and `coredns`.
 - Add alert to monitor the `HelmRelease` for the `vertical-pod-autoscaler-crd` app.
+- Add `ShieldComponentRestartingTooOften` alert to monitor Shield pod restarts.
 - Add `MimirRulerTooManyFailedQueries` alert to detect when Mimir ruler is failing to evaluate rules
 
+
 ### Fixed
 
 - Fix dashboard link for `MimirContinuousTestFailing` alert
diff --git a/helm/prometheus-rules/templates/platform/shield/alerting-rules/general.rules.yml b/helm/prometheus-rules/templates/platform/shield/alerting-rules/general.rules.yml
new file mode 100644
index 00000000..a16feb3a
--- /dev/null
+++ b/helm/prometheus-rules/templates/platform/shield/alerting-rules/general.rules.yml
@@ -0,0 +1,27 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: kyverno.rules  # NOTE(review): rule below targets trivy, kyverno and falco pods; confirm this name is intended and unique
+  namespace: {{ .Values.namespace  }}
+spec:
+  groups:
+  - name: general
+    rules:
+    - alert: ShieldComponentRestartingTooOften  # watches restart counters of trivy-*, kyverno-* and falco-* pods on workload clusters
+      annotations:
+        description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}'
+        opsrecipe: shield-pod-failing
+      expr: increase(kube_pod_container_status_restarts_total{cluster_type="workload_cluster", pod=~"trivy-.*|kyverno-.*|falco-.*"}[1h]) > 5
+      for: 30m  # the restart increase over the last hour must stay above 5 for 30 minutes before the alert fires
+      labels:
+        area: platform
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
+        severity: notify
+        team: shield
+        topic: security
diff --git a/test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml b/test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml
new file mode 100644
index 00000000..31da2b17
--- /dev/null
+++ b/test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml
@@ -0,0 +1,33 @@
+---
+rule_files:
+  - general.rules.yml
+tests:
+  - interval: 1m
+    input_series:
+      # Restart counter of a Trivy pod, growing by one restart per minute over 120 intervals.
+      - series: 'kube_pod_container_status_restarts_total{cluster_id="golem", cluster_type="workload_cluster", installation="golem", namespace="security", pipeline="stable", pod="trivy-0", provider="capa"}'
+        values: '0+1x120'
+    alert_rule_test:
+      # The alert is expected to be firing for the Trivy pod at the 91 minute mark.
+      - alertname: ShieldComponentRestartingTooOften
+        eval_time: 91m
+        exp_alerts:
+          - exp_annotations:
+              description: "Pod security/trivy-0 is restarting too often."
+              opsrecipe: shield-pod-failing
+            exp_labels:
+              area: platform
+              cancel_if_cluster_status_creating: 'true'
+              cancel_if_cluster_status_deleting: 'true'
+              cancel_if_cluster_status_updating: 'true'
+              cancel_if_outside_working_hours: 'false'
+              cluster_id: golem
+              cluster_type: workload_cluster
+              installation: golem
+              namespace: security
+              pipeline: stable
+              pod: trivy-0
+              provider: capa
+              severity: notify
+              team: shield
+              topic: security