From 8abb224e8bfe094645908250aab0e426890ea717 Mon Sep 17 00:00:00 2001 From: Haoyu Sun Date: Thu, 23 Jun 2022 15:55:13 +0200 Subject: [PATCH] add alert ThanosQueryOverload to mixin Signed-off-by: Haoyu Sun --- CHANGELOG.md | 1 + examples/alerts/alerts.md | 16 ++++++++++++++++ examples/alerts/alerts.yaml | 16 ++++++++++++++++ mixin/alerts/query.libsonnet | 16 ++++++++++++++++ mixin/runbook.md | 1 + 5 files changed, 50 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a4b2cde1099..f9ea1af55fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5475](https://github.com/thanos-io/thanos/pull/5475) Compact/Store: Added `--block-files-concurrency` allowing to configure number of go routines for download/upload block files during compaction. - [#5470](https://github.com/thanos-io/thanos/pull/5470) Receive: Implement exposing TSDB stats for all tenants - [#5493](https://github.com/thanos-io/thanos/pull/5493) Compact: Added `--compact.blocks-fetch-concurrency` allowing to configure number of go routines for download blocks during compactions. +- [#5439](https://github.com/thanos-io/thanos/pull/5439) Add Alert ThanosQueryOverload to Mixin. ### Changed diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index f91ecafa686..11b2a69995d 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -409,6 +409,22 @@ rules: for: 10m labels: severity: critical +- alert: ThanosQueryOverload + annotations: + description: Thanos Query {{$labels.job}} has been overloaded for more than 15 + minutes. This may be a symptom of excessive simultaneous complex requests, low + performance of the Prometheus API, or failures within these components. Assess + the health of the Thanos query instances, the connected Prometheus instances, + look for potential senders of these requests and then contact support. 
+ runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload + summary: Thanos query reaches its maximum capacity serving concurrent requests. + expr: | + ( + max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1 + ) + for: 15m + labels: + severity: warning ``` ## Receive diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 9ff88681276..b0af1fb9e57 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -158,6 +158,22 @@ groups: for: 10m labels: severity: critical + - alert: ThanosQueryOverload + annotations: + description: Thanos Query {{$labels.job}} has been overloaded for more than + 15 minutes. This may be a symptom of excessive simultaneous complex requests, + low performance of the Prometheus API, or failures within these components. + Assess the health of the Thanos query instances, the connected Prometheus + instances, look for potential senders of these requests and then contact support. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload + summary: Thanos query reaches its maximum capacity serving concurrent requests. + expr: | + ( + max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1 + ) + for: 15m + labels: + severity: warning - name: thanos-receive rules: - alert: ThanosReceiveHttpRequestErrorRateHigh diff --git a/mixin/alerts/query.libsonnet b/mixin/alerts/query.libsonnet index bbb33d855cd..80dd85287a2 100644 --- a/mixin/alerts/query.libsonnet +++ b/mixin/alerts/query.libsonnet @@ -142,6 +142,22 @@ severity: 'critical', }, }, + { + alert: 'ThanosQueryOverload', + annotations: { + description: 'Thanos Query {{$labels.job}}%s has been overloaded for more than 15 minutes. 
This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support.' % location, + summary: 'Thanos query reaches its maximum capacity serving concurrent requests.', + }, + expr: ||| + ( + max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1 + ) + ||| % thanos.query, + 'for': '15m', + labels: { + severity: 'warning', + }, + }, ], }, ], diff --git a/mixin/runbook.md b/mixin/runbook.md index 967c9bcee9c..6f251c1be60 100755 --- a/mixin/runbook.md +++ b/mixin/runbook.md @@ -50,6 +50,7 @@ |ThanosQueryHighDNSFailures|Thanos Query is having high number of DNS failures.|Thanos Query {{$labels.job}} have {{$value humanize}}% of failing DNS queries for store endpoints.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures)| |ThanosQueryInstantLatencyHigh|Thanos Query has high latency for queries.|Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh)| |ThanosQueryRangeLatencyHigh|Thanos Query has high latency for queries.|Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh)| 
+|ThanosQueryOverload|Thanos query reaches its maximum capacity serving concurrent requests.|Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload)| ## thanos-receive