Skip to content

Commit

Permalink
add MimirContinuousTestFailingOnWrites and MimirContinuousTestFailing… (
Browse files Browse the repository at this point in the history
#1355)

* add MimirContinuousTestFailingOnWrites and MimirContinuousTestFailingOnReads alerts

* Update CHANGELOG.md

* Update CHANGELOG.md

* Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml

* Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml

* Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml

* Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml

* add tests for rules

* fix tests

* Update CHANGELOG.md

* add mimir continuous test missing alert

* add new test

* fix tests

---------

Co-authored-by: Quentin Bisson <[email protected]>
  • Loading branch information
QuantumEnigmaa and QuentinBisson authored Nov 15, 2024
1 parent de44204 commit 9b485cd
Show file tree
Hide file tree
Showing 4 changed files with 419 additions and 22 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

- Add new Mimir continuous test alerts:
  - `MimirContinuousTestFailingOnWrites`
  - `MimirContinuousTestFailingOnReads`
  - `MimirContinuousTestMissing`
  - `MimirContinuousTestFailing`

### Removed

- Remove the `mimir.enabled` property to replace it with the MC flavor as all CAPI MCs now run Mimir.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,4 +169,85 @@ spec:
severity: page
team: atlas
topic: observability
# Rule group watching the Mimir continuous test itself: failing writes,
# failing queries, failing query-result checks, and a missing/stalled test.
# Every rule below pages team atlas after the condition has held for 1h.
# NOTE(review): "continous" in the dashboard paths looks like a typo for
# "continuous"; confirm against the actual dashboard UID before changing it.
- name: mimir.continuous-test
  rules:
    # Fires when the per-test rate of failed writes stays above zero.
    - alert: MimirContinuousTestFailingOnWrites
      annotations:
        dashboard: mimir-continous-test/mimir-continous-test
        description: '{{`Mimir continuous test {{ $labels.test }} in {{ $labels.cluster_id }}/{{ $labels.namespace }} is not effectively running because writes are failing.`}}'
        opsrecipe: mimir/
      # Query is based on the following upstream mixin alerting rule: https://github.com/grafana/mimir/blob/b873372adbf0996bff70de55934f3dd4a10c7b89/operations/mimir-mixin-compiled/alerts.yaml#L1196
      # Folded scalar: the two lines below are joined with a single space.
      expr: >-
        sum by(cluster_id, installation, namespace, pipeline, provider, test)
        (rate(mimir_continuous_test_writes_failed_total[5m])) > 0
      for: 1h
      labels:
        area: platform
        cancel_if_cluster_status_creating: "true"
        cancel_if_cluster_status_deleting: "true"
        cancel_if_cluster_status_updating: "true"
        cancel_if_outside_working_hours: "true"
        severity: page
        team: atlas
        topic: observability
    # Fires when the per-test rate of failed queries stays above zero.
    - alert: MimirContinuousTestFailingOnReads
      annotations:
        dashboard: mimir-continous-test/mimir-continous-test
        description: '{{`Mimir continuous test {{ $labels.test }} in {{ $labels.cluster_id }}/{{ $labels.namespace }} is not effectively running because queries are failing.`}}'
        opsrecipe: mimir/
      # Query is based on the following upstream mixin alerting rule: https://github.com/grafana/mimir/blob/b873372adbf0996bff70de55934f3dd4a10c7b89/operations/mimir-mixin-compiled/alerts.yaml#L1185
      # Folded scalar: the two lines below are joined with a single space.
      expr: >-
        sum by(cluster_id, installation, namespace, pipeline, provider, test)
        (rate(mimir_continuous_test_queries_failed_total[5m])) > 0
      for: 1h
      labels:
        area: platform
        cancel_if_cluster_status_creating: "true"
        cancel_if_cluster_status_deleting: "true"
        cancel_if_cluster_status_updating: "true"
        cancel_if_outside_working_hours: "true"
        severity: page
        team: atlas
        topic: observability
    # Fires when the per-test rate of failed query-result checks stays above
    # zero; unlike the two rules above, the rate window here is 10m.
    # NOTE(review): the description is word-for-word the one used by
    # MimirContinuousTestFailingOnReads ("queries are failing") although this
    # rule watches mimir_continuous_test_query_result_checks_failed_total.
    # It looks copy-pasted. The unit test for this alert asserts the same
    # text, so the annotation and the test must be changed together.
    - alert: MimirContinuousTestFailing
      annotations:
        dashboard: mimir-continous-test/mimir-continous-test
        description: '{{`Mimir continuous test {{ $labels.test }} in {{ $labels.cluster_id }}/{{ $labels.namespace }} is not effectively running because queries are failing.`}}'
        opsrecipe: mimir/
      # Query is based on the following upstream mixin alerting rule: https://github.com/grafana/mimir/blob/b873372adbf0996bff70de55934f3dd4a10c7b89/operations/mimir-mixin-compiled/alerts.yaml#L1205
      # Folded scalar: the two lines below are joined with a single space.
      expr: >-
        sum by(cluster_id, installation, pipeline, provider, namespace, test)
        (rate(mimir_continuous_test_query_result_checks_failed_total[10m])) > 0
      for: 1h
      labels:
        area: platform
        cancel_if_cluster_status_creating: "true"
        cancel_if_cluster_status_deleting: "true"
        cancel_if_cluster_status_updating: "true"
        cancel_if_outside_working_hours: "true"
        severity: page
        team: atlas
        topic: observability
    # Fires when the test has stopped writing: either the write counter is
    # flat (rate == 0) or the series does not exist at all for the management
    # cluster. The absent() selector is filled in from the chart values, so
    # the resulting alert still carries the labels used by the aggregation.
    - alert: MimirContinuousTestMissing
      annotations:
        dashboard: mimir-continous-test/mimir-continous-test
        description: '{{`Mimir continuous test {{ $labels.cluster_id }} is not producing metrics.`}}'
        opsrecipe: mimir/
      expr: |
        sum by(cluster_id, installation, pipeline, provider) (
          rate(mimir_continuous_test_writes_total[10m]) == 0
          or absent(
            mimir_continuous_test_writes_total{
              cluster_type="management_cluster",
              cluster_id="{{ .Values.managementCluster.name }}",
              installation="{{ .Values.managementCluster.name }}",
              provider="{{ .Values.managementCluster.provider.kind }}",
              pipeline="{{ .Values.managementCluster.pipeline }}"
            }
          )
        )
      for: 1h
      labels:
        area: platform
        cancel_if_cluster_status_creating: "true"
        cancel_if_cluster_status_deleting: "true"
        cancel_if_cluster_status_updating: "true"
        cancel_if_outside_working_hours: "true"
        severity: page
        team: atlas
        topic: observability
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -390,3 +390,157 @@ tests:
eval_time: 205m
- alertname: MimirCompactorFailedCompaction
eval_time: 350m

# Test for MimirContinuousTestFailingOnWrites alert
- interval: 1m
  input_series:
    # Timeline: no samples at first, then a steadily increasing failure
    # counter (rate > 0), then a flat series (rate = 0).
    - series: 'mimir_continuous_test_writes_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}'
      values: "_x20 1+1x80 0+0x70"
  alert_rule_test:
    # Failures have started, but the rule's 1h `for` has not elapsed yet:
    # nothing is expected to fire.
    - alertname: MimirContinuousTestFailingOnWrites
      eval_time: 40m
    # Failures have been ongoing for longer than the 1h `for`: the alert
    # fires with the labels kept by the rule's aggregation plus its own.
    - alertname: MimirContinuousTestFailingOnWrites
      eval_time: 95m
      exp_alerts:
        - exp_labels:
            area: platform
            cancel_if_cluster_status_creating: "true"
            cancel_if_cluster_status_deleting: "true"
            cancel_if_cluster_status_updating: "true"
            cancel_if_outside_working_hours: "true"
            cluster_id: golem
            installation: golem
            namespace: mimir
            pipeline: testing
            provider: capa
            severity: page
            team: atlas
            test: continuous-test
            topic: observability
          exp_annotations:
            dashboard: mimir-continous-test/mimir-continous-test
            description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because writes are failing."
            opsrecipe: "mimir/"
    # The counter is flat again: the alert is no longer expected.
    - alertname: MimirContinuousTestFailingOnWrites
      eval_time: 160m

# Test for MimirContinuousTestFailingOnReads alert
- interval: 1m
  input_series:
    # Timeline: no samples at first, then a steadily increasing failure
    # counter (rate > 0), then a flat series (rate = 0).
    - series: 'mimir_continuous_test_queries_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}'
      values: "_x20 1+1x80 0+0x70"
  alert_rule_test:
    # Query failures have started, but the rule's 1h `for` has not elapsed
    # yet: nothing is expected to fire.
    - alertname: MimirContinuousTestFailingOnReads
      eval_time: 40m
    # Query failures have been ongoing for longer than the 1h `for`: the
    # alert fires with the labels kept by the rule's aggregation plus its own.
    - alertname: MimirContinuousTestFailingOnReads
      eval_time: 95m
      exp_alerts:
        - exp_labels:
            area: platform
            cancel_if_cluster_status_creating: "true"
            cancel_if_cluster_status_deleting: "true"
            cancel_if_cluster_status_updating: "true"
            cancel_if_outside_working_hours: "true"
            cluster_id: golem
            installation: golem
            namespace: mimir
            pipeline: testing
            provider: capa
            severity: page
            team: atlas
            test: continuous-test
            topic: observability
          exp_annotations:
            dashboard: mimir-continous-test/mimir-continous-test
            description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because queries are failing."
            opsrecipe: "mimir/"
    # The counter is flat again: the alert is no longer expected.
    - alertname: MimirContinuousTestFailingOnReads
      eval_time: 160m

# Test for MimirContinuousTestFailing alert
- interval: 1m
  input_series:
    # Timeline: no samples at first, then a steadily increasing counter of
    # failed result checks (rate > 0), then a flat series (rate = 0).
    - series: 'mimir_continuous_test_query_result_checks_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}'
      values: "_x20 1+1x80 0+0x70"
  alert_rule_test:
    # Check failures have started, but the rule's 1h `for` has not elapsed
    # yet: nothing is expected to fire.
    - alertname: MimirContinuousTestFailing
      eval_time: 40m
    # Check failures have been ongoing for longer than the 1h `for`: the
    # alert fires with the labels kept by the rule's aggregation plus its own.
    - alertname: MimirContinuousTestFailing
      eval_time: 95m
      exp_alerts:
        - exp_labels:
            area: platform
            cancel_if_cluster_status_creating: "true"
            cancel_if_cluster_status_deleting: "true"
            cancel_if_cluster_status_updating: "true"
            cancel_if_outside_working_hours: "true"
            cluster_id: golem
            installation: golem
            namespace: mimir
            pipeline: testing
            provider: capa
            severity: page
            team: atlas
            test: continuous-test
            topic: observability
          exp_annotations:
            dashboard: mimir-continous-test/mimir-continous-test
            # NOTE(review): this is the same sentence the
            # MimirContinuousTestFailingOnReads test asserts; it mirrors the
            # rule's current annotation and must change together with it.
            description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because queries are failing."
            opsrecipe: "mimir/"
    # The counter is flat again: the alert is no longer expected.
    - alertname: MimirContinuousTestFailing
      eval_time: 160m

# Test for MimirContinuousTestMissing alert
- interval: 1m
  input_series:
    # Timeline: no samples at first (metric absent), then a steadily
    # increasing write counter (rate > 0), then a flat series (rate = 0).
    - series: 'mimir_continuous_test_writes_total{cluster_id="myinstall", cluster_type="management_cluster", installation="myinstall", namespace="mimir", pipeline="stable", provider="capa"}'
      values: "_x80 1+1x80 0+0x80"
  alert_rule_test:
    # The metric is absent, but for less than the rule's 1h `for`: no alert.
    - alertname: MimirContinuousTestMissing
      eval_time: 40m
    # The metric has been absent for more than 1h: the alert fires.
    # The expected labels presumably come from the rendered absent()
    # selector; verify against the values used by the test harness.
    - alertname: MimirContinuousTestMissing
      eval_time: 70m
      exp_alerts:
        - exp_labels:
            area: platform
            cancel_if_cluster_status_creating: "true"
            cancel_if_cluster_status_deleting: "true"
            cancel_if_cluster_status_updating: "true"
            cancel_if_outside_working_hours: "true"
            cluster_id: myinstall
            installation: myinstall
            pipeline: stable
            provider: capa
            severity: page
            team: atlas
            topic: observability
          exp_annotations:
            dashboard: mimir-continous-test/mimir-continous-test
            description: "Mimir continuous test myinstall is not producing metrics."
            opsrecipe: "mimir/"
    # Writes are being recorded (counter increasing): no alert.
    - alertname: MimirContinuousTestMissing
      eval_time: 150m
    # The write counter has been flat long enough for the 1h `for` to
    # elapse: the alert fires again.
    - alertname: MimirContinuousTestMissing
      eval_time: 230m
      exp_alerts:
        - exp_labels:
            area: platform
            cancel_if_cluster_status_creating: "true"
            cancel_if_cluster_status_deleting: "true"
            cancel_if_cluster_status_updating: "true"
            cancel_if_outside_working_hours: "true"
            cluster_id: myinstall
            installation: myinstall
            pipeline: stable
            provider: capa
            severity: page
            team: atlas
            topic: observability
          exp_annotations:
            dashboard: mimir-continous-test/mimir-continous-test
            description: "Mimir continuous test myinstall is not producing metrics."
            opsrecipe: "mimir/"
Loading

0 comments on commit 9b485cd

Please sign in to comment.