From 3609ed9a953d7ea6ef67a375479972bfaf65a378 Mon Sep 17 00:00:00 2001 From: Arunprasad Rajkumar Date: Mon, 14 Jun 2021 19:46:14 +0530 Subject: [PATCH 1/2] Revert "mixin: Use sidecar's metric timestamp for healthcheck (#3204) (#3979)" This reverts commit 5139e339eca62787e3a8dc38af5d4a9bf3ea39c4. Signed-off-by: Arunprasad Rajkumar --- examples/alerts/alerts.md | 5 +- examples/alerts/alerts.yaml | 6 +- examples/alerts/tests.yaml | 142 ++++++++++++++++++++++----------- mixin/alerts/sidecar.libsonnet | 4 +- mixin/runbook.md | 2 +- 5 files changed, 103 insertions(+), 56 deletions(-) diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 7209920a60..b24be8b458 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -322,12 +322,11 @@ rules: severity: critical - alert: ThanosSidecarUnhealthy annotations: - description: Thanos Sidecar {{$labels.instance}} is unhealthy for more than {{$value}} - seconds. + description: Thanos Sidecar {{$labels.instance}} is unhealthy for {{$value}} seconds. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy summary: Thanos Sidecar is unhealthy. expr: | - time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) >= 240 + time() - max by (job, instance) (thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"}) >= 600 labels: severity: critical ``` diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 7c9f08bdc7..9dd7c49a1b 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -323,12 +323,12 @@ groups: severity: critical - alert: ThanosSidecarUnhealthy annotations: - description: Thanos Sidecar {{$labels.instance}} is unhealthy for more than - {{$value}} seconds. + description: Thanos Sidecar {{$labels.instance}} is unhealthy for {{$value}} + seconds. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy summary: Thanos Sidecar is unhealthy. expr: | - time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) >= 240 + time() - max by (job, instance) (thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"}) >= 600 labels: severity: critical - name: thanos-store diff --git a/examples/alerts/tests.yaml b/examples/alerts/tests.yaml index 951dcec9b4..7aace50da6 100644 --- a/examples/alerts/tests.yaml +++ b/examples/alerts/tests.yaml @@ -8,9 +8,9 @@ tests: - interval: 1m input_series: - series: 'thanos_sidecar_last_heartbeat_success_time_seconds{namespace="production", job="thanos-sidecar", instance="thanos-sidecar-0"}' - values: '5 10 43 17 11 _x5 0x10' + values: '5 10 43 17 11 0 0 0' - series: 'thanos_sidecar_last_heartbeat_success_time_seconds{namespace="production", job="thanos-sidecar", instance="thanos-sidecar-1"}' - values: '4 9 42 15 10 _x5 0x10' + values: '4 9 42 15 10 0 0 0' promql_expr_test: - expr: time() eval_time: 1m @@ -22,64 +22,112 @@ tests: exp_samples: - labels: '{}' value: 120 - - expr: time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) - eval_time: 5m + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) + eval_time: 2m exp_samples: - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 60 + value: 43 - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 60 - - expr: time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) - eval_time: 6m + value: 42 + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) + eval_time: 10m exp_samples: - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 120 + value: 0 - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 120 - - expr: time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) - eval_time: 7m + value: 0 + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) + eval_time: 11m + exp_samples: + - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' + value: 0 + - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' + value: 0 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) + eval_time: 10m + exp_samples: + - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' + value: 600 + - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' + value: 600 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) + eval_time: 11m exp_samples: - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 180 + value: 660 - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 180 - - expr: time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) - eval_time: 8m + value: 660 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) >= 600 + eval_time: 12m exp_samples: - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 240 + value: 720 - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 240 + value: 720 alert_rule_test: - - eval_time: 1m - alertname: ThanosSidecarUnhealthy - - eval_time: 2m - alertname: ThanosSidecarUnhealthy - - eval_time: 3m - alertname: ThanosSidecarUnhealthy - - eval_time: 5m - alertname: ThanosSidecarUnhealthy - - eval_time: 8m - alertname: ThanosSidecarUnhealthy - exp_alerts: - - exp_labels: - severity: critical - job: thanos-sidecar - instance: thanos-sidecar-0 - exp_annotations: - description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for more than 240 seconds.' - runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' - summary: 'Thanos Sidecar is unhealthy.' - - exp_labels: - severity: critical - job: thanos-sidecar - instance: thanos-sidecar-1 - exp_annotations: - description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for more than 240 seconds.' - runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' - summary: 'Thanos Sidecar is unhealthy.' - - eval_time: 10m - alertname: ThanosSidecarUnhealthy + - eval_time: 1m + alertname: ThanosSidecarUnhealthy + - eval_time: 2m + alertname: ThanosSidecarUnhealthy + - eval_time: 3m + alertname: ThanosSidecarUnhealthy + - eval_time: 10m + alertname: ThanosSidecarUnhealthy + exp_alerts: + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-0 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for 600 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-1 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for 600 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - eval_time: 11m + alertname: ThanosSidecarUnhealthy + exp_alerts: + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-0 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for 660 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-1 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for 660 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - eval_time: 12m + alertname: ThanosSidecarUnhealthy + exp_alerts: + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-0 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for 720 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-1 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for 720 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' - interval: 1m input_series: - series: 'prometheus_rule_evaluations_total{namespace="production", job="thanos-ruler", instance="thanos-ruler-0"}' diff --git a/mixin/alerts/sidecar.libsonnet b/mixin/alerts/sidecar.libsonnet index 4e21f0785a..c5978163c1 100644 --- a/mixin/alerts/sidecar.libsonnet +++ b/mixin/alerts/sidecar.libsonnet @@ -41,11 +41,11 @@ { alert: 'ThanosSidecarUnhealthy', annotations: { - description: 'Thanos Sidecar {{$labels.instance}}%s is unhealthy for more than {{$value}} seconds.' % location, + description: 'Thanos Sidecar {{$labels.instance}}%s is unhealthy for {{$value}} seconds.' % location, summary: 'Thanos Sidecar is unhealthy.', }, expr: ||| - time() - max by (%(dimensions)s) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s})) >= 240 + time() - max by (%(dimensions)s) (thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) >= 600 ||| % thanos.sidecar, labels: { severity: 'critical', diff --git a/mixin/runbook.md b/mixin/runbook.md index 03f92aed71..d4bbb571ed 100755 --- a/mixin/runbook.md +++ b/mixin/runbook.md @@ -87,7 +87,7 @@ |---|---|---|---|---| |ThanosSidecarPrometheusDown|Thanos Sidecar cannot connect to Prometheus|Thanos Sidecar {{$labels.instance}} cannot connect to Prometheus.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarprometheusdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarprometheusdown)| |ThanosSidecarBucketOperationsFailed|Thanos Sidecar bucket operations are failing|Thanos Sidecar {{$labels.instance}} bucket operations are failing|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed)| -|ThanosSidecarUnhealthy|Thanos Sidecar is unhealthy.|Thanos Sidecar {{$labels.instance}} is unhealthy for more than {{$value}} seconds.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy)| +|ThanosSidecarUnhealthy|Thanos Sidecar is unhealthy.|Thanos Sidecar {{$labels.instance}} is unhealthy for {{$value}} seconds.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy)| ## thanos-store From 1295005a912ef6453eeb01f618fc12524eccfb97 Mon Sep 17 00:00:00 2001 From: Arunprasad Rajkumar Date: Mon, 14 Jun 2021 23:27:42 +0530 Subject: [PATCH 2/2] fix(mixin): ThanosSidecarUnhealthy doesn't fire if the sidecar is never healthy Signed-off-by: Arunprasad Rajkumar --- CHANGELOG.md | 2 +- examples/alerts/alerts.md | 6 ++++-- examples/alerts/alerts.yaml | 7 ++++--- examples/alerts/tests.yaml | 12 ++++++------ mixin/alerts/sidecar.libsonnet | 5 +++-- mixin/runbook.md | 2 +- 6 files changed, 19 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aef921ec11..8e9b206d8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re ### Fixed -- +- [#4342](https://github.com/thanos-io/thanos/pull/4342) ThanosSidecarUnhealthy doesn't fire if the sidecar is never healthy ### Changed diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index b24be8b458..21b5438564 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -322,11 +322,13 @@ rules: severity: critical - alert: ThanosSidecarUnhealthy annotations: - description: Thanos Sidecar {{$labels.instance}} is unhealthy for {{$value}} seconds. + description: Thanos Sidecar {{$labels.instance}} is unhealthy for more than {{$value}} + seconds. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy summary: Thanos Sidecar is unhealthy. expr: | - time() - max by (job, instance) (thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"}) >= 600 + time() - max by (job, instance) (thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"}) >= 240 + for: 5m labels: severity: critical ``` diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 9dd7c49a1b..8c0d7d7340 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -323,12 +323,13 @@ groups: severity: critical - alert: ThanosSidecarUnhealthy annotations: - description: Thanos Sidecar {{$labels.instance}} is unhealthy for {{$value}} - seconds. + description: Thanos Sidecar {{$labels.instance}} is unhealthy for more than + {{$value}} seconds. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy summary: Thanos Sidecar is unhealthy. expr: | - time() - max by (job, instance) (thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"}) >= 600 + time() - max by (job, instance) (thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"}) >= 240 + for: 5m labels: severity: critical - name: thanos-store diff --git a/examples/alerts/tests.yaml b/examples/alerts/tests.yaml index 7aace50da6..64207c46f7 100644 --- a/examples/alerts/tests.yaml +++ b/examples/alerts/tests.yaml @@ -79,7 +79,7 @@ tests: job: thanos-sidecar instance: thanos-sidecar-0 exp_annotations: - description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for 600 seconds.' + description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for more than 600 seconds.' runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' summary: 'Thanos Sidecar is unhealthy.' - exp_labels: @@ -87,7 +87,7 @@ tests: job: thanos-sidecar instance: thanos-sidecar-1 exp_annotations: - description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for 600 seconds.' + description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for more than 600 seconds.' runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' summary: 'Thanos Sidecar is unhealthy.' - eval_time: 11m @@ -98,7 +98,7 @@ tests: job: thanos-sidecar instance: thanos-sidecar-0 exp_annotations: - description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for 660 seconds.' + description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for more than 660 seconds.' runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' summary: 'Thanos Sidecar is unhealthy.' - exp_labels: @@ -106,7 +106,7 @@ tests: job: thanos-sidecar instance: thanos-sidecar-1 exp_annotations: - description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for 660 seconds.' + description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for more than 660 seconds.' runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' summary: 'Thanos Sidecar is unhealthy.' - eval_time: 12m @@ -117,7 +117,7 @@ tests: job: thanos-sidecar instance: thanos-sidecar-0 exp_annotations: - description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for 720 seconds.' + description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for more than 720 seconds.' runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' summary: 'Thanos Sidecar is unhealthy.' - exp_labels: @@ -125,7 +125,7 @@ tests: job: thanos-sidecar instance: thanos-sidecar-1 exp_annotations: - description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for 720 seconds.' + description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for more than 720 seconds.' runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' summary: 'Thanos Sidecar is unhealthy.' - interval: 1m diff --git a/mixin/alerts/sidecar.libsonnet b/mixin/alerts/sidecar.libsonnet index c5978163c1..b468210619 100644 --- a/mixin/alerts/sidecar.libsonnet +++ b/mixin/alerts/sidecar.libsonnet @@ -41,12 +41,13 @@ { alert: 'ThanosSidecarUnhealthy', annotations: { - description: 'Thanos Sidecar {{$labels.instance}}%s is unhealthy for {{$value}} seconds.' % location, + description: 'Thanos Sidecar {{$labels.instance}}%s is unhealthy for more than {{$value}} seconds.' % location, summary: 'Thanos Sidecar is unhealthy.', }, expr: ||| - time() - max by (%(dimensions)s) (thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) >= 600 + time() - max by (%(dimensions)s) (thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) >= 240 ||| % thanos.sidecar, + 'for': '5m', labels: { severity: 'critical', }, diff --git a/mixin/runbook.md b/mixin/runbook.md index d4bbb571ed..03f92aed71 100755 --- a/mixin/runbook.md +++ b/mixin/runbook.md @@ -87,7 +87,7 @@ |---|---|---|---|---| |ThanosSidecarPrometheusDown|Thanos Sidecar cannot connect to Prometheus|Thanos Sidecar {{$labels.instance}} cannot connect to Prometheus.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarprometheusdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarprometheusdown)| |ThanosSidecarBucketOperationsFailed|Thanos Sidecar bucket operations are failing|Thanos Sidecar {{$labels.instance}} bucket operations are failing|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed)| -|ThanosSidecarUnhealthy|Thanos Sidecar is unhealthy.|Thanos Sidecar {{$labels.instance}} is unhealthy for {{$value}} seconds.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy)| +|ThanosSidecarUnhealthy|Thanos Sidecar is unhealthy.|Thanos Sidecar {{$labels.instance}} is unhealthy for more than {{$value}} seconds.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy)| ## thanos-store