loki compactor: simplification

giantswarm · Nov 15, 2024 · c7d2f60 · c7d2f60
1 parent 9b485cd
commit c7d2f60
Show file tree

Hide file tree

Showing 3 changed files with 8 additions and 50 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+- Simplify the Loki compactor alert: merge the two `LokiCompactorFailedCompaction` rules into a single rule.
+
 ### Added
 
 - Add new mimir continuous test alerts:

diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml
@@ -143,24 +143,7 @@ spec:
         description: 'Loki compactor has been failing compactions for more than 2 hours since last compaction.'
         opsrecipe: loki#lokicompactorfailedcompaction
       # This alert checks whether Loki's last successful compaction run is older than 2 hours
-      expr: (min by (cluster_id, installation, provider, pipeline) (time() - (loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds > 0)) > 60 * 60 * 2)
-      for: 1h
-      labels:
-        area: platform
-        cancel_if_cluster_status_creating: "true"
-        cancel_if_cluster_status_deleting: "true"
-        cancel_if_cluster_status_updating: "true"
-        cancel_if_outside_working_hours: "true"
-        severity: page
-        team: atlas
-        topic: observability
-    - alert: LokiCompactorFailedCompaction
-      annotations:
-        dashboard: loki-retention/loki-retention
-        description: 'Loki compactor has been failing compactions for more than 2 hours since start-up.'
-        opsrecipe: loki#lokicompactorfailedcompaction
-      # This alert covers the special case at compactor startup, where the "normal" alert would always consider time `0` is more than 2 hours ago, yet we want to let it 2 hours + `for` duration.
-      expr: max(max_over_time(loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{}[2h])) by (cluster_id, installation, provider, pipeline) == 0
+      expr: (min by (cluster_id, installation, provider, pipeline) (time() - (loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds)) > 60 * 60 * 2)
       for: 1h
       labels:
         area: platform

diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml
@@ -232,10 +232,12 @@ tests:
   - interval: 1m
     input_series:
       - series: 'loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{cluster_id="golem", installation="golem", pipeline="testing", provider="capa"}'
-        values: "1x240 14400+60x100" # compactions worked once at the first second the does not work for the first 240 minutes so the timestamp stays still, then it gets continuously updated after 240 minutes to a valid timestamp (which is number of seconds since start for the test).
+        values: "0x240 14400+60x100" # no compaction has succeeded since start-up for the first 240 minutes, so the timestamp stays at 0; after 240 minutes it is continuously updated to a valid timestamp (the number of seconds since the start of the test).
     alert_rule_test:
       - alertname: LokiCompactorFailedCompaction
         eval_time: 15m
+      - alertname: LokiCompactorFailedCompaction
+        eval_time: 90m
       - alertname: LokiCompactorFailedCompaction
         eval_time: 230m
         exp_alerts:
@@ -259,37 +261,6 @@ tests:
       - alertname: LokiCompactorFailedCompaction
         eval_time: 300m
 
-  # Test for LokiCompactorFailedCompaction since start-up alert
-  - interval: 1m
-    input_series:
-      - series: 'loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{cluster_id="grizzly", installation="grizzly", pipeline="testing", provider="capz"}'
-        values: "0x240 14400+60x100" # compactions did not work since start-up for the first 240 minutes so the timestamp stays at 0, then it gets continuously updated after 240 minutes to a valid timestamp (which is number of seconds since start for the test).
-    alert_rule_test:
-      - alertname: LokiCompactorFailedCompaction
-        eval_time: 15m
-      - alertname: LokiCompactorFailedCompaction
-        eval_time: 230m
-        exp_alerts:
-          - exp_labels:
-              area: platform
-              cancel_if_cluster_status_creating: "true"
-              cancel_if_cluster_status_deleting: "true"
-              cancel_if_cluster_status_updating: "true"
-              cancel_if_outside_working_hours: "true"
-              cluster_id: grizzly
-              installation: "grizzly"
-              pipeline: "testing"
-              provider: "capz"
-              severity: page
-              team: atlas
-              topic: observability
-            exp_annotations:
-              dashboard: loki-retention/loki-retention
-              description: Loki compactor has been failing compactions for more than 2 hours since start-up.
-              opsrecipe: "loki#lokicompactorfailedcompaction"
-      - alertname: LokiCompactorFailedCompaction
-        eval_time: 300m
-
   # Test for LokiMissingLogs alert
   - interval: 1m
     input_series: