diff --git a/CHANGELOG.md b/CHANGELOG.md
index 20d0e961..1cc78aad 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,8 +16,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add `cloud-provider-controller.rules` to monitor the cloud-provider-controller components across providers.
 - Add alerts to monitor the `HelmReleases` for `cilium` and `coredns`.
 - Add alert to monitor the `HelmRelease` for the `vertical-pod-autoscaler-crd` app.
+- Add `ShieldComponentRestartingTooOften` alert to monitor Shield pod restarts.
 - Add `MimirRulerTooManyFailedQueries` alert to detect when Mimir ruler is failing to evaluate rules
 
 ### Fixed
 
 - Fix dashboard link for `MimirContinuousTestFailing` alert
diff --git a/helm/prometheus-rules/templates/platform/shield/alerting-rules/general.rules.yml b/helm/prometheus-rules/templates/platform/shield/alerting-rules/general.rules.yml
new file mode 100644
index 00000000..a16feb3a
--- /dev/null
+++ b/helm/prometheus-rules/templates/platform/shield/alerting-rules/general.rules.yml
@@ -0,0 +1,29 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: shield.general.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  - name: general
+    rules:
+    # Fires when the restart counter of a Trivy, Kyverno or Falco pod in a workload cluster grew by more than 5 over the last hour, sustained for 30 minutes.
+    - alert: ShieldComponentRestartingTooOften
+      annotations:
+        description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}'
+        opsrecipe: shield-pod-failing
+      # Prometheus label regexes are fully anchored, so every alternative needs a trailing `.*` to match the generated pod name suffix.
+      expr: increase(kube_pod_container_status_restarts_total{cluster_type="workload_cluster", pod=~"trivy-.*|kyverno-.*|falco-.*"}[1h]) > 5
+      for: 30m
+      labels:
+        area: platform
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
+        severity: notify
+        team: shield
+        topic: security
diff --git a/test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml b/test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml
new file mode 100644
index 00000000..31da2b17
--- /dev/null
+++ b/test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml
@@ -0,0 +1,61 @@
+---
+rule_files:
+  - general.rules.yml
+tests:
+  - interval: 1m
+    input_series:
+      # Trivy pod whose restart counter grows by one every minute
+      - series: 'kube_pod_container_status_restarts_total{cluster_id="golem", cluster_type="workload_cluster", installation="golem", namespace="security", pipeline="stable", pod="trivy-0", provider="capa"}'
+        values: "0+1x120"
+    alert_rule_test:
+      # Trivy pod
+      - alertname: ShieldComponentRestartingTooOften
+        eval_time: 91m
+        exp_alerts:
+          - exp_labels:
+              area: platform
+              cluster_id: golem
+              cluster_type: workload_cluster
+              installation: golem
+              pipeline: stable
+              provider: capa
+              severity: notify
+              team: shield
+              topic: security
+              namespace: security
+              pod: trivy-0
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "false"
+            exp_annotations:
+              description: 'Pod security/trivy-0 is restarting too often.'
+              opsrecipe: "shield-pod-failing"
+  # Falco pods must be matched too (regression test for the pod name matcher)
+  - interval: 1m
+    input_series:
+      - series: 'kube_pod_container_status_restarts_total{cluster_id="golem", cluster_type="workload_cluster", installation="golem", namespace="security", pipeline="stable", pod="falco-abcde", provider="capa"}'
+        values: "0+1x120"
+    alert_rule_test:
+      - alertname: ShieldComponentRestartingTooOften
+        eval_time: 91m
+        exp_alerts:
+          - exp_labels:
+              area: platform
+              cluster_id: golem
+              cluster_type: workload_cluster
+              installation: golem
+              pipeline: stable
+              provider: capa
+              severity: notify
+              team: shield
+              topic: security
+              namespace: security
+              pod: falco-abcde
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "false"
+            exp_annotations:
+              description: 'Pod security/falco-abcde is restarting too often.'
+              opsrecipe: "shield-pod-failing"