Fix atlas alerts with pint (#1173)
* Fix atlas alerts with pint

* Fix remaining atlas alerts
QuentinBisson authored May 14, 2024
1 parent 3d4a3f4 commit 3b56325
Showing 8 changed files with 18 additions and 18 deletions.
4 changes: 2 additions & 2 deletions helm/prometheus-rules/templates/alerting-rules/app.rules.yml
@@ -17,7 +17,7 @@ spec:
annotations:
description: '{{`Management Cluster App {{ $labels.name }}, version {{ $labels.version }} is {{if $labels.status }} in {{ $labels.status }} state. {{else}} not installed. {{end}}`}}'
opsrecipe: app-failed/
- expr: app_operator_app_info{status!~"(?i:(deployed|cordoned))", catalog=~"(control-plane-.*|default)",team!~"^$|noteam", namespace=~".*giantswarm"}
+ expr: app_operator_app_info{status!~"(?i:(deployed|cordoned))", catalog=~"(control-plane-.*|default)",team!~"^$|noteam", namespace=~".*giantswarm"} == 1
for: 30m
labels:
area: managedservices
@@ -32,7 +32,7 @@ spec:
annotations:
description: 'Current version of {{`App {{ $labels.name }} is {{ $labels.deployed_version }} but it should be {{ $labels.version }}.`}}'
opsrecipe: app-pending-update/
- expr: app_operator_app_info{catalog=~"(control-plane-.*|default)", deployed_version!="", status="deployed", version_mismatch="true" ,team!~"^$|noteam", namespace=~".*giantswarm"}
+ expr: app_operator_app_info{catalog=~"(control-plane-.*|default)", deployed_version!="", status="deployed", version_mismatch="true" ,team!~"^$|noteam", namespace=~".*giantswarm"} == 1
for: 40m
labels:
area: managedservices
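
Both app.rules.yml changes above add an explicit comparison. pint's alerts/comparison check warns when an alerting expression has no condition, because any sample the selector returns would fire the alert; app_operator_app_info appears to be an info-style gauge exported with the value 1, so appending == 1 keeps the behaviour while stating the firing condition explicitly. The Heartbeat rule further down gets the same treatment. A minimal sketch of the pattern, with the matcher simplified for illustration:

    # Without a comparison, any series returned by the selector fires the alert.
    # "== 1" states the expected value of the info-style metric explicitly.
    app_operator_app_info{status!~"(?i:(deployed|cordoned))"} == 1
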
@@ -41,7 +41,7 @@ spec:
annotations:
description: '{{`Etcd volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}'
opsrecipe: low-disk-space/#etcd-volume
- expr: 100 * node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"} / node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"} < 10
+ expr: 100 * node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/etcd", provider!="eks"} / node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/etcd", provider!="eks"} < 10
for: 10m
labels:
area: kaas
@@ -28,7 +28,7 @@ spec:
annotations:
description: '{{`Etcd volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}'
opsrecipe: low-disk-space/#etcd-volume
- expr: 100 * node_filesystem_free_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"} / node_filesystem_size_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"} < 10
+ expr: 100 * node_filesystem_free_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/lib/etcd", provider!="eks"} / node_filesystem_size_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/lib/etcd", provider!="eks"} < 10
for: 10m
labels:
area: kaas
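
The etcd volume rules for both the management-cluster and workload-cluster files swap the regex matcher provider!~"eks" for plain inequality. pint's promql/regexp check flags regexp matchers whose pattern is just a literal string, since != expresses the same thing without a regex match. Sketch of the equivalent selectors, simplified from the rules above:

    # Both selectors match the same series; the second avoids a regex match on a
    # plain string. The mountpoint matcher keeps =~ because it is a real regex.
    node_filesystem_free_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"}
    node_filesystem_free_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd", provider!="eks"}
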
10 changes: 5 additions & 5 deletions helm/prometheus-rules/templates/alerting-rules/grafana.rules.yml
@@ -29,11 +29,11 @@ spec:
team: atlas
topic: observability
- alert: GrafanaFolderPermissionsDown
- # Monitors that folder permissions have been updated.
+ # Monitors that folder permissions have been updated at least once in the last 6 hours.
# We have a cronjob (grafana-permissions) that runs every 20 minutes.
# When successfully run, folders permissions successful updates counter increases.
annotations:
- description: '{{`Grafana Folder not updated for ({{ $labels.instance }}).`}}'
+ description: '{{`Grafana Folder could not be updated.`}}'
opsrecipe: grafana-perms/
expr: sum by(cluster_id, installation, provider, pipeline) (increase(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"}[2h])) < 1 or absent(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"})
for: 6h
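
The reworded comment and description above match what the expression actually checks: the folder-permissions success counter must have increased at least once in the window, and absent() covers the case where Grafana never exposed the series at all. The instance label is aggregated away by the sum, so the description no longer references it, the kind of template/label mismatch pint's alerts/template check is designed to catch. A simplified sketch of the pattern (the real rule also groups by cluster_id, installation, provider and pipeline and filters on namespace, service and cluster_type):

    # Fire when the permissions-update counter did not grow in the window,
    # or when the series is missing entirely.
    sum(increase(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", status_code="200"}[2h])) < 1
      or absent(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", status_code="200"})
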
@@ -47,11 +47,11 @@ spec:
team: atlas
topic: observability
- alert: GrafanaFolderPermissionsCronjobFails
- # Monitors that folder permissions job has run successfully.
+ # Monitors that folder permissions job has run successfully at least once in the last 6 hours.
# We have a cronjob (grafana-permissions) that runs every 20 minutes.
# Here we check the kubernetes job status
annotations:
- description: '{{`Grafana permissions updates cronjob failed for ({{ $labels.job_name }}).`}}'
+ description: '{{`Grafana permissions updates cronjob failed to run.`}}'
opsrecipe: grafana-perms/
# expression explanation:
# - we create cronjob label from cron name (label_replace)
@@ -75,7 +75,7 @@ spec:
# This alert triggers when the grafana permission job did not schedule for more than 1 day
# or if the job did not run successfully at least once in the last day
expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="grafana-permissions", cluster_type="management_cluster"}) > 86400
- or count by (cluster_id, installation, provider, pipeline) (max_over_time(kube_job_status_succeeded{job_name=~"grafana-permission.+", cluster_type="management_cluster"}[1d]) == 1) == 0
+ or count by (cluster_id, cronjob, installation, namespace, provider, pipeline) (label_replace(max_over_time(kube_job_status_succeeded{job_name=~"grafana-permissions-.+", cluster_type="management_cluster"}[1d]), "cronjob", "grafana-permissions", "job_name", "grafana-permissions-.+") == 1) == 0
labels:
area: empowerment
severity: page
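
The rewritten second half of the or derives a cronjob label from the Jobs the grafana-permissions CronJob creates (the job_name regex is also tightened from grafana-permission.+ to grafana-permissions-.+), presumably so both sides of the or carry a comparable label set: the first half gets cronjob and namespace directly from kube_cronjob_status_last_schedule_time. The silence-operator-sync rule further down gets the same treatment. Sketch of the label_replace step in isolation, with the cluster_type matcher trimmed:

    # Add cronjob="grafana-permissions" to every Job series whose job_name matches
    # the names the CronJob generates; the original job_name label is left intact.
    label_replace(
      max_over_time(kube_job_status_succeeded{job_name=~"grafana-permissions-.+"}[1d]),
      "cronjob", "grafana-permissions", "job_name", "grafana-permissions-.+"
    )
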
@@ -12,7 +12,7 @@ spec:
rules:
- alert: KubeStateMetricsDown
annotations:
- description: '{{`KubeStateMetrics ({{ $labels.instance }}) is down.`}}'
+ description: '{{`KubeStateMetrics is down.`}}'
opsrecipe: kube-state-metrics-down/
{{- if not .Values.mimir.enabled }}
expr: |-
@@ -56,7 +56,7 @@ spec:
topic: observability
- alert: KubeStateMetricsSlow
annotations:
- description: '{{`KubeStateMetrics ({{ $labels.instance }}) is too slow.`}}'
+ description: '{{`KubeStateMetrics is too slow.`}}'
opsrecipe: kube-state-metrics-down/
expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{handler="metrics", job="kube-state-metrics"}[5m])) by (le, cluster_id, installation, provider, pipeline)) > 7
for: 15m
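
The description changes in this file all drop {{ $labels.instance }}, which matches the shape of the expressions: once the series are aggregated with sum ... by (le, cluster_id, installation, provider, pipeline), the instance label no longer exists on the result, so a template referencing it would render empty (again the mismatch pint's alerts/template check reports). Simplified sketch of the aggregation from the KubeStateMetricsSlow rule above:

    # "by (le, cluster_id, ...)" keeps only the listed labels; instance is gone
    # from the result, so annotations can no longer interpolate it.
    histogram_quantile(0.99,
      sum(rate(http_request_duration_seconds_bucket{handler="metrics", job="kube-state-metrics"}[5m]))
        by (le, cluster_id, installation, provider, pipeline)) > 7
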
@@ -74,7 +74,7 @@ spec:
topic: observability
- alert: KubeStateMetricsNotRetrievingMetrics
annotations:
- description: '{{`KubeStateMetrics ({{ $labels.instance }}) is not retrieving metrics.`}}'
+ description: '{{`KubeStateMetrics is not retrieving metrics.`}}'
opsrecipe: kube-state-metrics-down/
expr: |-
# When it looks up but we don't have metrics
@@ -32,7 +32,7 @@ spec:
annotations:
description: '{{`CR version {{ $labels.version }} in cluster {{ $labels.cluster_id }} is reconciled by multiple apps including {{ $labels.app }}.`}}'
opsrecipe: multiple-operators-running-same-version/
- expr: sum(label_replace(giantswarm_build_info{app=~"app-operator.*|chart-operator.*"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, version) > 1
+ expr: sum(label_replace(giantswarm_build_info{app=~"app-operator.*|chart-operator.*"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, installation, provider, pipeline, version) > 1
for: 5m
labels:
area: managedservices
@@ -47,7 +47,7 @@ spec:
annotations:
description: '{{`CR version {{ $labels.version }} in cluster {{ $labels.cluster_id }} is reconciled by multiple apps including {{ $labels.app }}.`}}'
opsrecipe: multiple-operators-running-same-version/
- expr: sum(label_replace(giantswarm_build_info{app=~"aws-operator.*|cluster-operator.*"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, version) > 1
+ expr: sum(label_replace(giantswarm_build_info{app=~"aws-operator.*|cluster-operator.*"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, installation, provider, pipeline, version) > 1
for: 5m
labels:
area: kaas
@@ -62,7 +62,7 @@ spec:
annotations:
description: '{{`CR version {{ $labels.version }} in cluster {{ $labels.cluster_id }} is reconciled by multiple apps including {{ $labels.app }}.`}}'
opsrecipe: multiple-operators-running-same-version/
- expr: sum(label_replace(giantswarm_build_info{app=~"ignition-operator|cert-operator|node-operator"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, version) > 1
+ expr: sum(label_replace(giantswarm_build_info{app=~"ignition-operator|cert-operator|node-operator"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, installation, provider, pipeline, version) > 1
for: 5m
labels:
area: kaas
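
All three operator rules above widen the by() clause with installation, provider and pipeline. The commit does not state the motivation, but the usual reason is that the aggregation was dropping labels needed further down the alerting pipeline (routing, silencing, templates), and listing them explicitly preserves them on the alert. Note that this label_replace renames rather than adds: it copies the reconciled_version value into version before counting. Simplified, annotated restatement of the first rule:

    # Copy reconciled_version into "version", then count operators per version,
    # keeping the labels the rest of the pipeline expects on the result.
    sum(
      label_replace(giantswarm_build_info{app=~"app-operator.*|chart-operator.*"},
                    "version", "$1", "reconciled_version", "(.+)")
    ) by (app, cluster_id, installation, provider, pipeline, version) > 1
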
@@ -10,7 +10,7 @@ spec:
- name: observability
rules:
- alert: "Heartbeat"
- expr: up{app="prometheus",instance!="prometheus-agent"}
+ expr: up{app="prometheus",instance!="prometheus-agent"} == 1
labels:
area: "empowerment"
installation: {{ .Values.managementCluster.name }}
@@ -30,9 +30,9 @@ spec:
# This alert triggers when the silence operator sync job did not schedule for more than 1 day
# or if the job did not run successfully at least once in the last day
expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="silence-operator-sync", cluster_type="management_cluster"}) > 86400
- or count(max_over_time(kube_job_status_succeeded{job_name=~"silence-operator-sync.+", cluster_type="management_cluster"}[1d]) == 1) by (cluster_id, installation, provider, pipeline) == 0
+ or count by (cronjob, cluster_id, installation, namespace, provider, pipeline) (label_replace(max_over_time(kube_job_status_succeeded{job_name=~"silence-operator-sync-.+", cluster_type="management_cluster"}[1d]), "cronjob", "silence-operator-sync", "job_name", "silence-operator-sync-.+") == 1) == 0
labels:
area: empowerment
severity: page
team: atlas
topic: managementcluster
