From 1ff758d8aebf026b2a94c5b9e8dc43eff6f63e8b Mon Sep 17 00:00:00 2001 From: Franco <48300215+fhielpos@users.noreply.github.com> Date: Tue, 26 Nov 2024 17:21:47 +0100 Subject: [PATCH] Add alert to monitor Shield pods restarts (#1438) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add alert to monitor Trivy pod restarts * Make alert general for all Shield pods * Add more timeseries * Fix description * Try another timeseries * Check at 91m * Increase the whole time * Update test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml Co-authored-by: Hervé Nicol * Fix opsrecipe * Fix test labels --------- Co-authored-by: Hervé Nicol --- CHANGELOG.md | 2 ++ .../shield/alerting-rules/general.rules.yml | 27 +++++++++++++++ .../alerting-rules/general.rules.test.yml | 33 +++++++++++++++++++ 3 files changed, 62 insertions(+) create mode 100644 helm/prometheus-rules/templates/platform/shield/alerting-rules/general.rules.yml create mode 100644 test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index f07fc10c..05c06f76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,8 +12,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add `cloud-provider-controller.rules` to monitor the cloud-provider-controller components across providers. - Add alerts to monitor the `HelmReleases` for `cilium` and `coredns`. - Add alert to monitor the `HelmRelease` for the `vertical-pod-autoscaler-crd` app. +- Add alert to monitor Shield pods restarts. 
- Add `MimirRulerTooManyFailedQueries` alert to detect when Mimir ruler is failing to evaluate rules + ### Fixed - Fix dashboard link for `MimirContinuousTestFailing` alert
diff --git a/helm/prometheus-rules/templates/platform/shield/alerting-rules/general.rules.yml b/helm/prometheus-rules/templates/platform/shield/alerting-rules/general.rules.yml
new file mode 100644
index 00000000..a16feb3a
--- /dev/null
+++ b/helm/prometheus-rules/templates/platform/shield/alerting-rules/general.rules.yml
@@ -0,0 +1,27 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: kyverno.rules  # NOTE(review): this file holds the general Shield rules, not Kyverno-only ones - confirm the name is intended and unique
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  - name: general
+    rules:
+    - alert: ShieldComponentRestartingTooOften  # covers trivy-*, kyverno-* and falco-* pods on workload clusters
+      annotations:
+        description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}'
+        opsrecipe: shield-pod-failing
+      expr: increase(kube_pod_container_status_restarts_total{cluster_type="workload_cluster", pod=~"trivy-.*|kyverno-.*|falco-.*"}[1h]) > 5
+      for: 30m  # more than 5 restarts in the trailing 1h must persist for 30m before the alert fires
+      labels:
+        area: platform
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
+        severity: notify
+        team: shield
+        topic: security
diff --git a/test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml b/test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml
new file mode 100644
index 00000000..31da2b17
--- /dev/null
+++ b/test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml
@@ -0,0 +1,33 @@
+---
+rule_files:
+  - general.rules.yml
+tests:
+  - interval: 1m
+    input_series:
+      # Trivy pod whose restart counter grows by 1 every minute (0, 1, ..., 120).
+      - series: 'kube_pod_container_status_restarts_total{cluster_id="golem", cluster_type="workload_cluster", installation="golem", namespace="security", pipeline="stable", pod="trivy-0", provider="capa"}'
+        values: "0+1x120"
+    alert_rule_test:
+      # At 91m the 1h restart increase has exceeded 5 for longer than the 30m `for` period, so the alert must be firing.
+      - alertname: ShieldComponentRestartingTooOften
+        eval_time: 91m
+        exp_alerts:
+          - exp_labels:
+              area: platform
+              cluster_id: golem
+              cluster_type: workload_cluster
+              installation: golem
+              pipeline: stable
+              provider: capa
+              severity: notify
+              team: shield
+              topic: security
+              namespace: security
+              pod: trivy-0
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "false"
+            exp_annotations:
+              description: 'Pod security/trivy-0 is restarting too often.'
+              opsrecipe: "shield-pod-failing"