diff --git a/CHANGELOG.md b/CHANGELOG.md index 251b6354..87a74d50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Mimir compactor alert: better failure detection + ### Added - Add new mimir continuous test alerts: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index 30b677d4..c025ce56 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -158,8 +158,24 @@ spec: dashboard: 09a5c49e9cdb2f2b24c6d184574a07fd/mimir-compactor-resources description: 'Mimir compactor has been failing its compactions for 2 hours.' opsrecipe: mimir#mimircompactorfailedcompaction - # Query is based on the following upstream mixin alerting rule : https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml#L858 - expr: sum(increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h])) by (cluster_id, installation, namespace, pipeline, provider) > 2 + expr: min by (cluster_id, installation, namespace, provider, pipeline) (time() - (cortex_compactor_last_successful_run_timestamp_seconds > 0 ) ) > 60 * 60 * 2 + labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + - alert: MimirCompactorFailedCompaction + annotations: + dashboard: 09a5c49e9cdb2f2b24c6d184574a07fd/mimir-compactor-resources + description: 'Mimir compactor has been failing compactions for more than 2 hours since start-up.' 
+ opsrecipe: mimir#mimircompactorfailedcompaction + # This alert covers the special case at compactor start-up: until the first successful run the timestamp is `0`, which the "normal" alert excludes (`> 0`) because it would otherwise always look like more than 2 hours ago; here we want to give the compactor the 2h window + the `for` duration before alerting. + expr: max(max_over_time(cortex_compactor_last_successful_run_timestamp_seconds{}[2h])) by (cluster_id, installation, namespace, provider, pipeline) == 0 + for: 2h labels: area: platform cancel_if_cluster_status_creating: "true" diff --git a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml index e25ac35e..80ec37bc 100644 --- a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -360,13 +360,14 @@ tests: input_series: - series: 'cortex_compactor_runs_failed_total{reason="error", installation="golem", cluster_id="golem", namespace="mimir", pipeline="testing", provider="capa"}' values: "8+0x20 1+0x40 0+0x20 4+0x130 0+0x190" + - series: 'cortex_compactor_last_successful_run_timestamp_seconds{installation="golem", cluster_id="golem", namespace="mimir", pipeline="testing", provider="capa"}' + # With a 1m interval: no successful compaction for the first 4 hours, then successful runs recorded at t+4h (14400), t+5h (18000) and t+6h (21600), then about 4 more hours with no successful compaction. + values: '0+0x240 14400+0x60 18000x60 21600+0x240' alert_rule_test: - alertname: MimirCompactorFailedCompaction - eval_time: 15m - - alertname: MimirCompactorFailedCompaction - eval_time: 55m + eval_time: 60m - alertname: MimirCompactorFailedCompaction - eval_time: 120m + eval_time: 130m exp_alerts: - exp_labels: area: platform @@ -384,12 +385,33 @@ tests: topic: observability exp_annotations: dashboard: 09a5c49e9cdb2f2b24c6d184574a07fd/mimir-compactor-resources - description: Mimir compactor has been failing its compactions for 2 hours. 
+ description: Mimir compactor has been failing compactions for more than 2 hours since start-up. opsrecipe: "mimir#mimircompactorfailedcompaction" - alertname: MimirCompactorFailedCompaction - eval_time: 205m + eval_time: 250m - alertname: MimirCompactorFailedCompaction - eval_time: 350m + eval_time: 480m + - alertname: MimirCompactorFailedCompaction + eval_time: 600m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: golem + installation: "golem" + pipeline: "testing" + provider: "capa" + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: 09a5c49e9cdb2f2b24c6d184574a07fd/mimir-compactor-resources + description: Mimir compactor has been failing its compactions for 2 hours. + opsrecipe: "mimir#mimircompactorfailedcompaction" # Test for MimirContinuousTestFailingOnWrites alert - interval: 1m diff --git a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml index d6b37c8e..89d3ca7b 100644 --- a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -360,13 +360,14 @@ tests: input_series: - series: 'cortex_compactor_runs_failed_total{reason="error", installation="golem", cluster_id="golem", namespace="mimir", pipeline="testing", provider="capz"}' values: "8+0x20 1+0x40 0+0x20 4+0x130 0+0x190" + - series: 'cortex_compactor_last_successful_run_timestamp_seconds{installation="golem", cluster_id="golem", namespace="mimir", pipeline="testing", provider="capz"}' + # With a 1m interval: no successful compaction for the first 4 hours, then successful runs recorded at t+4h (14400), t+5h (18000) and t+6h (21600), then about 4 more hours with no successful compaction. 
+ values: '0+0x240 14400+0x60 18000x60 21600+0x240' alert_rule_test: - alertname: MimirCompactorFailedCompaction - eval_time: 15m - - alertname: MimirCompactorFailedCompaction - eval_time: 55m + eval_time: 60m - alertname: MimirCompactorFailedCompaction - eval_time: 120m + eval_time: 130m exp_alerts: - exp_labels: area: platform @@ -384,12 +385,33 @@ tests: topic: observability exp_annotations: dashboard: 09a5c49e9cdb2f2b24c6d184574a07fd/mimir-compactor-resources - description: Mimir compactor has been failing its compactions for 2 hours. + description: Mimir compactor has been failing compactions for more than 2 hours since start-up. opsrecipe: "mimir#mimircompactorfailedcompaction" - alertname: MimirCompactorFailedCompaction - eval_time: 205m + eval_time: 250m - alertname: MimirCompactorFailedCompaction - eval_time: 350m + eval_time: 480m + - alertname: MimirCompactorFailedCompaction + eval_time: 600m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: golem + installation: "golem" + pipeline: "testing" + provider: "capz" + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: 09a5c49e9cdb2f2b24c6d184574a07fd/mimir-compactor-resources + description: Mimir compactor has been failing its compactions for 2 hours. + opsrecipe: "mimir#mimircompactorfailedcompaction" # Test for MimirContinuousTestFailingOnWrites alert - interval: 1m