Skip to content

Commit

Permalink
fix(mixin): ThanosSidecarUnhealthy doesn't fire if the sidecar is nev…
Browse files Browse the repository at this point in the history
…er healthy (#4342)

* Revert "mixin: Use sidecar's metric timestamp for healthcheck (#3204) (#3979)"

This reverts commit 5139e33.

Signed-off-by: Arunprasad Rajkumar <[email protected]>

* fix(mixin): ThanosSidecarUnhealthy doesn't fire if the sidecar is never healthy

Signed-off-by: Arunprasad Rajkumar <[email protected]>
  • Loading branch information
arajkumar authored Jun 23, 2021
1 parent 7a90505 commit 8f50211
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 51 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re

### Fixed

-
- [#4342](https://github.com/thanos-io/thanos/pull/4342) ThanosSidecarUnhealthy doesn't fire if the sidecar is never healthy

### Changed

Expand Down
3 changes: 2 additions & 1 deletion examples/alerts/alerts.md
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,8 @@ rules:
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy
summary: Thanos Sidecar is unhealthy.
expr: |
time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) >= 240
time() - max by (job, instance) (thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"}) >= 240
for: 5m
labels:
severity: critical
```
Expand Down
3 changes: 2 additions & 1 deletion examples/alerts/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,8 @@ groups:
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy
summary: Thanos Sidecar is unhealthy.
expr: |
time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) >= 240
time() - max by (job, instance) (thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"}) >= 240
for: 5m
labels:
severity: critical
- name: thanos-store
Expand Down
142 changes: 95 additions & 47 deletions examples/alerts/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ tests:
- interval: 1m
input_series:
- series: 'thanos_sidecar_last_heartbeat_success_time_seconds{namespace="production", job="thanos-sidecar", instance="thanos-sidecar-0"}'
values: '5 10 43 17 11 _x5 0x10'
values: '5 10 43 17 11 0 0 0'
- series: 'thanos_sidecar_last_heartbeat_success_time_seconds{namespace="production", job="thanos-sidecar", instance="thanos-sidecar-1"}'
values: '4 9 42 15 10 _x5 0x10'
values: '4 9 42 15 10 0 0 0'
promql_expr_test:
- expr: time()
eval_time: 1m
Expand All @@ -22,64 +22,112 @@ tests:
exp_samples:
- labels: '{}'
value: 120
- expr: time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"}))
eval_time: 5m
- expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance)
eval_time: 2m
exp_samples:
- labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}'
value: 60
value: 43
- labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}'
value: 60
- expr: time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"}))
eval_time: 6m
value: 42
- expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance)
eval_time: 10m
exp_samples:
- labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}'
value: 120
value: 0
- labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}'
value: 120
- expr: time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"}))
eval_time: 7m
value: 0
- expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance)
eval_time: 11m
exp_samples:
- labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}'
value: 0
- labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}'
value: 0
- expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance)
eval_time: 10m
exp_samples:
- labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}'
value: 600
- labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}'
value: 600
- expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance)
eval_time: 11m
exp_samples:
- labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}'
value: 180
value: 660
- labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}'
value: 180
- expr: time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"}))
eval_time: 8m
value: 660
- expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) >= 600
eval_time: 12m
exp_samples:
- labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}'
value: 240
value: 720
- labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}'
value: 240
value: 720
alert_rule_test:
- eval_time: 1m
alertname: ThanosSidecarUnhealthy
- eval_time: 2m
alertname: ThanosSidecarUnhealthy
- eval_time: 3m
alertname: ThanosSidecarUnhealthy
- eval_time: 5m
alertname: ThanosSidecarUnhealthy
- eval_time: 8m
alertname: ThanosSidecarUnhealthy
exp_alerts:
- exp_labels:
severity: critical
job: thanos-sidecar
instance: thanos-sidecar-0
exp_annotations:
description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for more than 240 seconds.'
runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
summary: 'Thanos Sidecar is unhealthy.'
- exp_labels:
severity: critical
job: thanos-sidecar
instance: thanos-sidecar-1
exp_annotations:
description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for more than 240 seconds.'
runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
summary: 'Thanos Sidecar is unhealthy.'
- eval_time: 10m
alertname: ThanosSidecarUnhealthy
- eval_time: 1m
alertname: ThanosSidecarUnhealthy
- eval_time: 2m
alertname: ThanosSidecarUnhealthy
- eval_time: 3m
alertname: ThanosSidecarUnhealthy
- eval_time: 10m
alertname: ThanosSidecarUnhealthy
exp_alerts:
- exp_labels:
severity: critical
job: thanos-sidecar
instance: thanos-sidecar-0
exp_annotations:
description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for more than 600 seconds.'
runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
summary: 'Thanos Sidecar is unhealthy.'
- exp_labels:
severity: critical
job: thanos-sidecar
instance: thanos-sidecar-1
exp_annotations:
description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for more than 600 seconds.'
runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
summary: 'Thanos Sidecar is unhealthy.'
- eval_time: 11m
alertname: ThanosSidecarUnhealthy
exp_alerts:
- exp_labels:
severity: critical
job: thanos-sidecar
instance: thanos-sidecar-0
exp_annotations:
description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for more than 660 seconds.'
runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
summary: 'Thanos Sidecar is unhealthy.'
- exp_labels:
severity: critical
job: thanos-sidecar
instance: thanos-sidecar-1
exp_annotations:
description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for more than 660 seconds.'
runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
summary: 'Thanos Sidecar is unhealthy.'
- eval_time: 12m
alertname: ThanosSidecarUnhealthy
exp_alerts:
- exp_labels:
severity: critical
job: thanos-sidecar
instance: thanos-sidecar-0
exp_annotations:
description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for more than 720 seconds.'
runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
summary: 'Thanos Sidecar is unhealthy.'
- exp_labels:
severity: critical
job: thanos-sidecar
instance: thanos-sidecar-1
exp_annotations:
description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for more than 720 seconds.'
runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
summary: 'Thanos Sidecar is unhealthy.'
- interval: 1m
input_series:
- series: 'prometheus_rule_evaluations_total{namespace="production", job="thanos-ruler", instance="thanos-ruler-0"}'
Expand Down
3 changes: 2 additions & 1 deletion mixin/alerts/sidecar.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,9 @@
summary: 'Thanos Sidecar is unhealthy.',
},
expr: |||
time() - max by (%(dimensions)s) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s})) >= 240
time() - max by (%(dimensions)s) (thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) >= 240
||| % thanos.sidecar,
'for': '5m',
labels: {
severity: 'critical',
},
Expand Down

0 comments on commit 8f50211

Please sign in to comment.