From c2a31715c87f238da77ec001d868880f5025940e Mon Sep 17 00:00:00 2001
From: Haoyu Sun
Date: Thu, 23 Jun 2022 15:55:13 +0200
Subject: [PATCH] add alert ThanosQueryOverload to mixin

Signed-off-by: Haoyu Sun
---
 CHANGELOG.md                 |  1 +
 examples/alerts/alerts.yaml  | 16 ++++++++++++++++
 mixin/alerts/query.libsonnet | 16 ++++++++++++++++
 mixin/runbook.md             |  1 +
 4 files changed, 34 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 801ba0c1079..b709429c3d1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -34,6 +34,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
 - [#5408](https://github.com/thanos-io/thanos/pull/5391) Receive: Add support for consistent hashrings.
 - [#5391](https://github.com/thanos-io/thanos/pull/5391) Receive: Implement api/v1/status/tsdb.
 - [#5424](https://github.com/thanos-io/thanos/pull/5424) Receive: export metrics regarding size of remote write requests
+- [#5439](https://github.com/thanos-io/thanos/pull/5439) Add Alert ThanosQueryOverload to Mixin.
 
 ### Changed
 
diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml
index 6aaf34b24dd..be7b48db108 100644
--- a/examples/alerts/alerts.yaml
+++ b/examples/alerts/alerts.yaml
@@ -170,6 +170,22 @@ groups:
       for: 10m
       labels:
         severity: critical
+    - alert: ThanosQueryOverload
+      annotations:
+        description: Thanos Query {{$labels.job}} has been overloaded for more than
+          15 minutes. This may be a symptom of excessive simultaneous complex requests,
+          low performance of the Prometheus API, or failures within these components.
+          Assess the health of the Thanos query instances, the connected Prometheus
+          instances, look for potential senders of these requests and then contact support.
+        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload
+        summary: Thanos query reaches its maximum capacity serving concurrent requests.
+      expr: |
+        (
+          max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1
+        )
+      for: 15m
+      labels:
+        severity: warning
 - name: thanos-receive
   rules:
   - alert: ThanosReceiveHttpRequestErrorRateHigh
diff --git a/mixin/alerts/query.libsonnet b/mixin/alerts/query.libsonnet
index bbb33d855cd..80dd85287a2 100644
--- a/mixin/alerts/query.libsonnet
+++ b/mixin/alerts/query.libsonnet
@@ -142,6 +142,22 @@
             severity: 'critical',
           },
         },
+        {
+          alert: 'ThanosQueryOverload',
+          annotations: {
+            description: 'Thanos Query {{$labels.job}}%s has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support.' % location,
+            summary: 'Thanos query reaches its maximum capacity serving concurrent requests.',
+          },
+          expr: |||
+            (
+              max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1
+            )
+          ||| % thanos.query,
+          'for': '15m',
+          labels: {
+            severity: 'warning',
+          },
+        },
       ],
     },
   ],
diff --git a/mixin/runbook.md b/mixin/runbook.md
index 967c9bcee9c..6f251c1be60 100755
--- a/mixin/runbook.md
+++ b/mixin/runbook.md
@@ -50,6 +50,7 @@
 |ThanosQueryHighDNSFailures|Thanos Query is having high number of DNS failures.|Thanos Query {{$labels.job}} have {{$value humanize}}% of failing DNS queries for store endpoints.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures)|
 |ThanosQueryInstantLatencyHigh|Thanos Query has high latency for queries.|Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh)|
 |ThanosQueryRangeLatencyHigh|Thanos Query has high latency for queries.|Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh)|
+|ThanosQueryOverload|Thanos query reaches its maximum capacity serving concurrent requests.|Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload)|
 
 ## thanos-receive