diff --git a/CHANGELOG.md b/CHANGELOG.md
index 004e0a82..77be2a93 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add `cloud-provider-controller.rules` to monitor the cloud-provider-controller components across providers.
 - Add alerts to monitor the `HelmReleases` for `cilium` and `coredns`.
 - Add alert to monitor the `HelmRelease` for the `vertical-pod-autoscaler-crd` app.
+- Add alert to monitor `Trivy` pod restarts.
 
 ### Fixed
 
diff --git a/helm/prometheus-rules/templates/platform/shield/alerting-rules/trivy.rules.yml b/helm/prometheus-rules/templates/platform/shield/alerting-rules/trivy.rules.yml
new file mode 100644
index 00000000..9e8df83b
--- /dev/null
+++ b/helm/prometheus-rules/templates/platform/shield/alerting-rules/trivy.rules.yml
@@ -0,0 +1,31 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: trivy.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  - name: trivy
+    rules:
+    # Fires when the restart counter of a container in a trivy-* pod on a workload
+    # cluster increased by more than 5 over the last hour, sustained for 15 minutes.
+    - alert: TrivyComponentRestartingTooOften
+      annotations:
+        # Wrapped in a Helm raw string so the label placeholders are passed through
+        # untouched for Prometheus to expand, instead of being evaluated by Helm.
+        description: '{{`Trivy pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often`}}'
+        opsrecipe: trivy-pod-failing
+      expr: increase(kube_pod_container_status_restarts_total{cluster_type="workload_cluster", pod=~"trivy-.*"}[1h]) > 5
+      for: 15m
+      labels:
+        area: platform
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
+        severity: notify
+        team: shield
+        topic: trivy