Skip to content

Commit

Permalink
loki compactor: simplification
Browse files Browse the repository at this point in the history
  • Loading branch information
hervenicol committed Nov 15, 2024
1 parent 9b485cd commit c7d2f60
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 50 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

- Simplify the loki-compactor alert

### Added

- Add new mimir continuous test alerts:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,24 +143,7 @@ spec:
description: 'Loki compactor has been failing compactions for more than 2 hours since last compaction.'
opsrecipe: loki#lokicompactorfailedcompaction
# This alert checks whether Loki's last successful compaction run is older than 2 hours
expr: (min by (cluster_id, installation, provider, pipeline) (time() - (loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds > 0)) > 60 * 60 * 2)
for: 1h
labels:
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
- alert: LokiCompactorFailedCompaction
annotations:
dashboard: loki-retention/loki-retention
description: 'Loki compactor has been failing compactions for more than 2 hours since start-up.'
opsrecipe: loki#lokicompactorfailedcompaction
# This alert covers the special case at compactor start-up, where the "normal" alert would always consider that time `0` is more than 2 hours ago, yet we want to give the compactor 2 hours + the `for` duration before alerting.
expr: max(max_over_time(loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{}[2h])) by (cluster_id, installation, provider, pipeline) == 0
expr: (min by (cluster_id, installation, provider, pipeline) (time() - (loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds)) > 60 * 60 * 2)
for: 1h
labels:
area: platform
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,10 +232,12 @@ tests:
- interval: 1m
input_series:
- series: 'loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{cluster_id="golem", installation="golem", pipeline="testing", provider="capa"}'
values: "1x240 14400+60x100" # compaction succeeded once in the first second, then fails for the first 240 minutes so the timestamp stays unchanged; after 240 minutes it is continuously updated to a valid timestamp (the number of seconds since the start of the test).
values: "0x240 14400+60x100" # compaction never succeeds during the first 240 minutes after start-up, so the timestamp stays at 0; after 240 minutes it is continuously updated to a valid timestamp (the number of seconds since the start of the test).
alert_rule_test:
- alertname: LokiCompactorFailedCompaction
eval_time: 15m
- alertname: LokiCompactorFailedCompaction
eval_time: 90m
- alertname: LokiCompactorFailedCompaction
eval_time: 230m
exp_alerts:
Expand All @@ -259,37 +261,6 @@ tests:
- alertname: LokiCompactorFailedCompaction
eval_time: 300m

# Test for LokiCompactorFailedCompaction since start-up alert
- interval: 1m
input_series:
- series: 'loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{cluster_id="grizzly", installation="grizzly", pipeline="testing", provider="capz"}'
values: "0x240 14400+60x100" # compactions did not work since start-up for the first 240 minutes so the timestamp stays at 0, then it gets continuously updated after 240 minutes to a valid timestamp (which is number of seconds since start for the test).
alert_rule_test:
- alertname: LokiCompactorFailedCompaction
eval_time: 15m
- alertname: LokiCompactorFailedCompaction
eval_time: 230m
exp_alerts:
- exp_labels:
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
cluster_id: grizzly
installation: "grizzly"
pipeline: "testing"
provider: "capz"
severity: page
team: atlas
topic: observability
exp_annotations:
dashboard: loki-retention/loki-retention
description: Loki compactor has been failing compactions for more than 2 hours since start-up.
opsrecipe: "loki#lokicompactorfailedcompaction"
- alertname: LokiCompactorFailedCompaction
eval_time: 300m

# Test for LokiMissingLogs alert
- interval: 1m
input_series:
Expand Down

0 comments on commit c7d2f60

Please sign in to comment.