Clean up some rules a bit
QuentinBisson committed Nov 5, 2024
1 parent fbc9c8d commit 8944072
Showing 11 changed files with 204 additions and 191 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -14,14 +14,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `LoggingAgentDown` to be alerted when the logging agent is down.
- `LogForwardingErrors` to be alerted when the `loki.write` component is failing.
- `LogReceivingErrors` to be alerted when the `loki.source.api` components of the gateway are failing.
- `MonitoringAgentFailing` and `InhibitionMonitoringAgentFailing` to be alerted when the monitoring agent is not able to send metrics.

### Changed

- Update `DeploymentNotSatisfiedAtlas` to take into account the following components:
- `observability-operator`
- `alloy-rules`
- `observability-gateway`
- Move all `grafana-cloud`-related alerts to their own file.
- Move all alloy-related alerts to the alloy alert file and fix the alloy-logs tests.

## [4.23.0] - 2024-10-30

@@ -6,7 +6,7 @@ metadata:
labels:
{{- include "labels.common" . | nindent 4 }}
name: alloy.rules
namespace: {{ .Values.namespace }}
namespace: {{ .Values.namespace }}
spec:
groups:
# List of alerts on the state of the alloy components.
@@ -48,7 +48,24 @@ spec:
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
- name: logging-agent
- name: alloy.rules
rules:
- alert: AlloyForPrometheusRulesDown
annotations:
description: 'Alloy sending PrometheusRules to Loki and Mimir ruler is down.'
opsrecipe: prometheus-rules/
expr: count(up{job="alloy-rules", namespace="monitoring"} == 0) by (cluster_id, installation, provider, pipeline) > 0
for: 1h
labels:
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
- name: alloy.logs
rules:
# This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready)
# and join the pods with the not running containers
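
A side note on the relocated `AlloyForPrometheusRulesDown` rule above: its expression uses a standard PromQL filter-then-count pattern. The decomposition below is an illustrative sketch, not part of the changed files; the example label values are borrowed from the unit tests further down in this diff.

    # up{job="alloy-rules", namespace="monitoring"} == 0
    #   keeps only the scrape-health series whose value is 0, i.e. alloy-rules targets that are down
    # count(...) by (cluster_id, installation, provider, pipeline)
    #   collapses those series into one count per cluster, e.g.
    #   {cluster_id="golem", installation="golem", provider="capa", pipeline="testing"}  1
    # ... > 0
    #   turns the count into an alert condition; with `for: 1h` a target must stay down for an hour before paging
    expr: count(up{job="alloy-rules", namespace="monitoring"} == 0) by (cluster_id, installation, provider, pipeline) > 0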
@@ -1,13 +1,35 @@
{{- if .Values.mimir.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
{{- include "labels.common" . | nindent 4 }}
name: mimir-to-grafana-cloud-exporter.rules
namespace: {{ .Values.namespace }}
{{- if not .Values.mimir.enabled }}
cluster_type: "management_cluster"
{{- end }}
name: grafana-cloud.recording.rules
namespace: {{ .Values.namespace }}
spec:
groups:
- name: grafana-cloud
rules:
## Pages Atlas when prometheus fails to send samples to cortex
- alert: PrometheusMissingGrafanaCloud
annotations:
description: 'Prometheus is not sending data to Grafana Cloud.'
opsrecipe: prometheus-grafanacloud/
{{- if .Values.mimir.enabled }}
expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"})
{{- else }}
expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"})
{{- end }}
for: 1h
labels:
area: platform
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
{{- if .Values.mimir.enabled }}
- name: mimir-to-grafana-cloud-exporter
rules:
- alert: MimirToGrafanaCloudExporterDown
@@ -73,4 +95,4 @@ spec:
severity: page
team: atlas
topic: observability
{{- end }}
{{- end }}
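
For readers unfamiliar with the Helm templating in `PrometheusMissingGrafanaCloud` above, here is a hypothetical rendering of the `mimir.enabled` branch, assuming illustrative values `managementCluster.name: golem`, `managementCluster.provider.kind: capa`, and `managementCluster.pipeline: testing`; this is a sketch, not part of the changed files.

    expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="golem", installation="golem", provider="capa", pipeline="testing"})

Note that both `cluster_id` and `installation` are rendered from `managementCluster.name`, which is why they carry the same value.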
@@ -3,9 +3,9 @@ kind: PrometheusRule
metadata:
labels:
{{- include "labels.common" . | nindent 4 }}
{{- if not .Values.mimir.enabled }}
{{- if not .Values.mimir.enabled }}
cluster_type: "management_cluster"
{{- end }}
{{- end }}
name: grafana.rules
namespace: {{ .Values.namespace }}
spec:
@@ -85,7 +85,6 @@ spec:
severity: page
team: atlas
topic: observability

- alert: KubeConfigMapCreatedMetricMissing
annotations:
description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}'
@@ -61,21 +61,6 @@ spec:
severity: page
team: atlas
topic: observability
- alert: AlloyForPrometheusRulesDown
annotations:
description: 'Alloy sending PrometheusRules to Mimir ruler is down.'
opsrecipe: prometheus-rules/
expr: count(up{job="alloy-rules", namespace="mimir"} == 0) by (cluster_id, installation, provider, pipeline) > 0
for: 1h
labels:
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
- alert: MimirRulerEventsFailed
annotations:
dashboard: 631e15d5d85afb2ca8e35d62984eeaa0/mimir-ruler
@@ -1,7 +1,6 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
creationTimestamp: null
labels:
{{- include "labels.common" . | nindent 4 }}
name: prometheus.rules
@@ -27,23 +26,6 @@ spec:
severity: page
team: atlas
topic: observability
## Pages Atlas when prometheus fails to send samples to cortex
- alert: PrometheusMissingGrafanaCloud
annotations:
description: 'Prometheus is not sending data to Grafana Cloud.'
opsrecipe: prometheus-grafanacloud/
{{- if .Values.mimir.enabled }}
expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"})
{{- else }}
expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"})
{{- end }}
for: 1h
labels:
area: platform
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
- alert: PrometheusFailsToCommunicateWithRemoteStorageAPI
annotations:
description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}'
@@ -1,6 +1,6 @@
---
rule_files:
- mimir-to-grafana-cloud-exporter.rules.yml
- grafana-cloud.rules.yml

tests:
# Tests for `MimirToGrafanaCloudExporterDown` alert
@@ -86,35 +86,6 @@ tests:
dashboard: ffcd83628d7d4b5a03d1cafd159e6c9c/mimir-overview
description: "Mimir component : mimir-ingester is down."
opsrecipe: "mimir/"
- interval: 1m
input_series:
# test with 1 pod: none, up, down
- series: 'up{job="alloy-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="mimir"}'
values: "_x20 1+0x70 0+0x70"
alert_rule_test:
- alertname: AlloyForPrometheusRulesDown
eval_time: 10m
- alertname: AlloyForPrometheusRulesDown
eval_time: 80m
- alertname: AlloyForPrometheusRulesDown
eval_time: 160m
exp_alerts:
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cluster_id: golem
installation: golem
provider: capa
pipeline: testing
severity: page
team: atlas
topic: observability
exp_annotations:
description: "Alloy sending PrometheusRules to Mimir ruler is down."
opsrecipe: "prometheus-rules/"
- interval: 1m
input_series:
# test: none, rate > 0, rate = 0
@@ -72,3 +72,157 @@ tests:
summary: "Unhealthy components detected."
- alertname: AlloyUnhealthyComponents
eval_time: 80m

# Test AlloyForPrometheusRulesDown
- interval: 1m
input_series:
# test with 1 pod: none, up, down
- series: 'up{job="alloy-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="monitoring"}'
values: "_x20 1+0x70 0+0x70"
alert_rule_test:
- alertname: AlloyForPrometheusRulesDown
eval_time: 10m
- alertname: AlloyForPrometheusRulesDown
eval_time: 80m
- alertname: AlloyForPrometheusRulesDown
eval_time: 160m
exp_alerts:
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cluster_id: golem
installation: golem
provider: capa
pipeline: testing
severity: page
team: atlas
topic: observability
exp_annotations:
description: "Alloy sending PrometheusRules to Loki and Mimir ruler is down."
opsrecipe: "prometheus-rules/"

# Test LoggingAgentDown
- interval: 1m
input_series:
# For the first 60min: test with 1 pod: none, up, down
- series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-logs-1xxxx", provider="aws", pipeline="testing"}'
values: "_x20 1+0x20 0+0x40"
- series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-1xxxx", node="ip-10-0-5-1.eu-west-1.compute.internal", provider="aws", pipeline="testing"}
values: "1x180"
# From 60min: test with 2 pods: 1 up and 1 down, 2 up, 2 down.
- series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-logs-2xxxx", provider="aws", pipeline="testing"}'
values: "_x80 1+0x40 1+0x20 0+0x40"
- series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-2xxxx", node="ip-10-0-5-2.eu-west-1.compute.internal", provider="aws", pipeline="testing"}
values: "1x180"
- series: 'up{container="alloy", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", job="alloy-logs", pod="alloy-logs-3xxxx", provider="aws", pipeline="testing"}'
values: "_x80 0+0x40 1+0x20 0+0x40"
- series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-3xxxx", node="ip-10-0-5-3.eu-west-1.compute.internal", provider="aws", pipeline="testing"}
values: "1x180"
alert_rule_test:
- alertname: LoggingAgentDown
eval_time: 10m
- alertname: LoggingAgentDown
eval_time: 30m
- alertname: LoggingAgentDown
eval_time: 71m
exp_alerts:
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_node_unschedulable: "true"
cancel_if_node_not_ready: "true"
cluster_id: gauss
cluster_type: management_cluster
installation: gauss
node: ip-10-0-5-1.eu-west-1.compute.internal
pipeline: testing
pod: alloy-logs-1xxxx
provider: aws
severity: page
team: atlas
topic: observability
exp_annotations:
dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview"
description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
opsrecipe: "alloy/"
# Tests with 2 pods
- alertname: LoggingAgentDown
eval_time: 111m
exp_alerts:
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_node_unschedulable: "true"
cancel_if_node_not_ready: "true"
cluster_id: gauss
cluster_type: management_cluster
installation: gauss
node: ip-10-0-5-3.eu-west-1.compute.internal
pipeline: testing
pod: alloy-logs-3xxxx
provider: aws
severity: page
team: atlas
topic: observability
exp_annotations:
dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview"
description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
opsrecipe: "alloy/"
- alertname: LoggingAgentDown
eval_time: 121m
- alertname: LoggingAgentDown
eval_time: 180m
exp_alerts:
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_node_unschedulable: "true"
cancel_if_node_not_ready: "true"
cluster_id: gauss
cluster_type: management_cluster
installation: gauss
node: ip-10-0-5-2.eu-west-1.compute.internal
pipeline: testing
pod: alloy-logs-2xxxx
provider: aws
severity: page
team: atlas
topic: observability
exp_annotations:
dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview"
description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
opsrecipe: "alloy/"
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_node_unschedulable: "true"
cancel_if_node_not_ready: "true"
cluster_id: gauss
cluster_type: management_cluster
installation: gauss
node: ip-10-0-5-3.eu-west-1.compute.internal
pipeline: testing
pod: alloy-logs-3xxxx
provider: aws
severity: page
team: atlas
topic: observability
exp_annotations:
dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview"
description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
opsrecipe: "alloy/"
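
A reading aid for the `values` strings used in these tests, based on promtool's expanding notation and the `interval: 1m` above: `_` is a missing sample, and `a+bxN` starts at `a` and adds `b` for `N` further samples. The annotated line below is a sketch, not part of the changed files.

    # "_x20 1+0x20 0+0x40" with interval: 1m means, roughly:
    #   minutes  0-20 : no data        -> no alert at eval_time: 10m
    #   minutes 20-40 : up   (value 1) -> no alert at eval_time: 30m
    #   minutes 40-80 : down (value 0) -> LoggingAgentDown expected at eval_time: 71m
    values: "_x20 1+0x20 0+0x40"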