Skip to content

Commit

Permalink
Make alert general for all Shield pods
Browse files Browse the repository at this point in the history
  • Loading branch information
fhielpos committed Nov 26, 2024
1 parent b40fa2e commit 26ff199
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ metadata:
namespace: {{ .Values.namespace }}
spec:
groups:
- name: trivy
- name: general
rules:
- alert: TrivyComponentRestartingTooOften
- alert: ShieldComponentRestartingTooOften
annotations:
description: 'Trivy pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often'
opsrecipe: trivy-pod-failing
expr: increase(kube_pod_container_status_restarts_total{cluster_type="workload_cluster", pod=~"trivy-.*"}[1h]) > 5
description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}'
opsrecipe: shield-pod-failing
# Fires when a Shield pod (Trivy, Kyverno or Falco) in a workload cluster restarts more than 5 times within 1h.
# Prometheus regex matchers are fully anchored, so every alternative needs an explicit `.*` suffix.
expr: increase(kube_pod_container_status_restarts_total{cluster_type="workload_cluster", pod=~"trivy-.*|kyverno-.*|falco-.*"}[1h]) > 5
for: 30m
labels:
area: platform
Expand All @@ -24,4 +24,4 @@ spec:
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
severity: notify
team: shield
topic: trivy
topic: security
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
---
# promtool unit test for the ShieldComponentRestartingTooOften alert.
rule_files:
  # NOTE(review): the alert under test lives in the `general` rule group; this
  # file name looks copied from the Kyverno tests - confirm it points at the
  # rendered rule file that actually defines ShieldComponentRestartingTooOften.
  - kyverno.rules.yml

tests:
  - interval: 1m
    input_series:
      # Restart counter of a Trivy pod in a workload cluster.
      # One restart per minute for two hours: the 1h increase stays well above
      # the threshold of 5 and samples still exist at eval_time, so the rule's
      # `for: 30m` is satisfied before the 90m evaluation.
      - series: 'kube_pod_container_status_restarts_total{app="kube-state-metrics", cluster_id="golem", cluster_type="workload_cluster", container="main", customer="giantswarm", endpoint="http", installation="golem", instance="100.94.2.68:8080", job="kube-state-metrics", namespace="security", node="master-0001", organization="giantswarm", pipeline="stable", pod="trivy-0", provider="capa", region="eu-west-1", service="kube-prometheus-stack-kube-state-metrics", service_priority="medium", uid="a38e8606-416d-4d6c-a7ea-2745078de330"}'
        values: "1+1x120"
    alert_rule_test:
      # Trivy pod
      - alertname: ShieldComponentRestartingTooOften
        eval_time: 90m
        exp_alerts:
          # promtool compares the complete label set: the rule's static labels
          # plus every label of the input series (except __name__).
          - exp_labels:
              app: kube-state-metrics
              area: platform
              cancel_if_cluster_status_creating: "true"
              cancel_if_cluster_status_deleting: "true"
              cancel_if_cluster_status_updating: "true"
              cancel_if_outside_working_hours: "false"
              cluster_id: golem
              cluster_type: workload_cluster
              container: main
              customer: giantswarm
              endpoint: http
              installation: golem
              instance: "100.94.2.68:8080"
              job: kube-state-metrics
              namespace: security
              node: master-0001
              organization: giantswarm
              pipeline: stable
              pod: trivy-0
              provider: capa
              region: eu-west-1
              service: kube-prometheus-stack-kube-state-metrics
              service_priority: medium
              severity: notify
              team: shield
              topic: security
              uid: "a38e8606-416d-4d6c-a7ea-2745078de330"
            exp_annotations:
              # Expected values are the rendered annotations: no Helm raw-string
              # wrapper, and the opsrecipe exactly as declared on the rule.
              description: "Pod security/trivy-0 is restarting too often."
              opsrecipe: shield-pod-failing

0 comments on commit 26ff199

Please sign in to comment.