diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 953cd8d3fc..291c97ae97 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -322,11 +322,12 @@ rules: severity: critical - alert: ThanosSidecarUnhealthy annotations: - description: Thanos Sidecar {{$labels.instance}} is unhealthy for {{$value}} seconds. + description: Thanos Sidecar {{$labels.instance}} is unhealthy for more than {{$value}} + seconds. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy summary: Thanos Sidecar is unhealthy. expr: | - time() - max by (job, instance) (thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"}) >= 600 + time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) >= 240 labels: severity: critical ``` diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 4e110a54d5..f4c6e8d237 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -306,12 +306,12 @@ groups: severity: critical - alert: ThanosSidecarUnhealthy annotations: - description: Thanos Sidecar {{$labels.instance}} is unhealthy for {{$value}} - seconds. + description: Thanos Sidecar {{$labels.instance}} is unhealthy for more than + {{$value}} seconds. runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy summary: Thanos Sidecar is unhealthy. expr: | - time() - max by (job, instance) (thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"}) >= 600 + time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) >= 240 labels: severity: critical - name: thanos-store diff --git a/examples/alerts/tests.yaml b/examples/alerts/tests.yaml index 7aace50da6..951dcec9b4 100644 --- a/examples/alerts/tests.yaml +++ b/examples/alerts/tests.yaml @@ -8,9 +8,9 @@ tests: - interval: 1m input_series: - series: 'thanos_sidecar_last_heartbeat_success_time_seconds{namespace="production", job="thanos-sidecar", instance="thanos-sidecar-0"}' - values: '5 10 43 17 11 0 0 0' + values: '5 10 43 17 11 _x5 0x10' - series: 'thanos_sidecar_last_heartbeat_success_time_seconds{namespace="production", job="thanos-sidecar", instance="thanos-sidecar-1"}' - values: '4 9 42 15 10 0 0 0' + values: '4 9 42 15 10 _x5 0x10' promql_expr_test: - expr: time() eval_time: 1m @@ -22,112 +22,64 @@ tests: exp_samples: - labels: '{}' value: 120 - - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) - eval_time: 2m - exp_samples: - - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 43 - - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 42 - - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) - eval_time: 10m - exp_samples: - - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 0 - - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 0 - - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) - eval_time: 11m + - expr: time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) + eval_time: 5m exp_samples: - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 0 + value: 60 - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 0 - - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) - eval_time: 10m + value: 60 + - expr: time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) + eval_time: 6m exp_samples: - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 600 + value: 120 - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 600 - - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) - eval_time: 11m + value: 120 + - expr: time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) + eval_time: 7m exp_samples: - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 660 + value: 180 - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 660 - - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) >= 600 - eval_time: 12m + value: 180 + - expr: time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) + eval_time: 8m exp_samples: - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 720 + value: 240 - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 720 + value: 240 alert_rule_test: - - eval_time: 1m - alertname: ThanosSidecarUnhealthy - - eval_time: 2m - alertname: ThanosSidecarUnhealthy - - eval_time: 3m - alertname: ThanosSidecarUnhealthy - - eval_time: 10m - alertname: ThanosSidecarUnhealthy - exp_alerts: - - exp_labels: - severity: critical - job: thanos-sidecar - instance: thanos-sidecar-0 - exp_annotations: - description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for 600 seconds.' - runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' - summary: 'Thanos Sidecar is unhealthy.' - - exp_labels: - severity: critical - job: thanos-sidecar - instance: thanos-sidecar-1 - exp_annotations: - description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for 600 seconds.' - runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' - summary: 'Thanos Sidecar is unhealthy.' - - eval_time: 11m - alertname: ThanosSidecarUnhealthy - exp_alerts: - - exp_labels: - severity: critical - job: thanos-sidecar - instance: thanos-sidecar-0 - exp_annotations: - description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for 660 seconds.' - runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' - summary: 'Thanos Sidecar is unhealthy.' - - exp_labels: - severity: critical - job: thanos-sidecar - instance: thanos-sidecar-1 - exp_annotations: - description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for 660 seconds.' - runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' - summary: 'Thanos Sidecar is unhealthy.' - - eval_time: 12m - alertname: ThanosSidecarUnhealthy - exp_alerts: - - exp_labels: - severity: critical - job: thanos-sidecar - instance: thanos-sidecar-0 - exp_annotations: - description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for 720 seconds.' - runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' - summary: 'Thanos Sidecar is unhealthy.' - - exp_labels: - severity: critical - job: thanos-sidecar - instance: thanos-sidecar-1 - exp_annotations: - description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for 720 seconds.' - runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' - summary: 'Thanos Sidecar is unhealthy.' + - eval_time: 1m + alertname: ThanosSidecarUnhealthy + - eval_time: 2m + alertname: ThanosSidecarUnhealthy + - eval_time: 3m + alertname: ThanosSidecarUnhealthy + - eval_time: 5m + alertname: ThanosSidecarUnhealthy + - eval_time: 8m + alertname: ThanosSidecarUnhealthy + exp_alerts: + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-0 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for more than 240 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-1 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for more than 240 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - eval_time: 10m + alertname: ThanosSidecarUnhealthy - interval: 1m input_series: - series: 'prometheus_rule_evaluations_total{namespace="production", job="thanos-ruler", instance="thanos-ruler-0"}' diff --git a/mixin/alerts/sidecar.libsonnet b/mixin/alerts/sidecar.libsonnet index c5978163c1..4e21f0785a 100644 --- a/mixin/alerts/sidecar.libsonnet +++ b/mixin/alerts/sidecar.libsonnet @@ -41,11 +41,11 @@ { alert: 'ThanosSidecarUnhealthy', annotations: { - description: 'Thanos Sidecar {{$labels.instance}}%s is unhealthy for {{$value}} seconds.' % location, + description: 'Thanos Sidecar {{$labels.instance}}%s is unhealthy for more than {{$value}} seconds.' % location, summary: 'Thanos Sidecar is unhealthy.', }, expr: ||| - time() - max by (%(dimensions)s) (thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) >= 600 + time() - max by (%(dimensions)s) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s})) >= 240 ||| % thanos.sidecar, labels: { severity: 'critical', diff --git a/mixin/runbook.md b/mixin/runbook.md index fc7d84c3b9..37d2a4ff7d 100755 --- a/mixin/runbook.md +++ b/mixin/runbook.md @@ -86,7 +86,7 @@ |---|---|---|---|---| |ThanosSidecarPrometheusDown|Thanos Sidecar cannot connect to Prometheus|Thanos Sidecar {{$labels.instance}} cannot connect to Prometheus.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarprometheusdown](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarprometheusdown)| |ThanosSidecarBucketOperationsFailed|Thanos Sidecar bucket operations are failing|Thanos Sidecar {{$labels.instance}} bucket operations are failing|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed)| -|ThanosSidecarUnhealthy|Thanos Sidecar is unhealthy.|Thanos Sidecar {{$labels.instance}} is unhealthy for {{$value}} seconds.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy)| +|ThanosSidecarUnhealthy|Thanos Sidecar is unhealthy.|Thanos Sidecar {{$labels.instance}} is unhealthy for more than {{$value}} seconds.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy)| ## thanos-store