diff --git a/CHANGELOG.md b/CHANGELOG.md
index ffc2b98ccfd..59864b21205 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -28,6 +28,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
 - [#5527](https://github.com/thanos-io/thanos/pull/5527) Receive: Add per request limits for remote write.
 - [#5520](https://github.com/thanos-io/thanos/pull/5520) Receive: Meta-monitoring based active series limiting
 - [#5555](https://github.com/thanos-io/thanos/pull/5555) Query: Added `--query.active-query-path` flag, allowing the user to configure the directory to create an active query tracking file, `queries.active`, for different resolution.
+- [#5439](https://github.com/thanos-io/thanos/pull/5439) Mixin: Add `ThanosQueryOverload` alert.
 
 ### Changed
 
diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md
index f91ecafa686..44c4a0529a3 100644
--- a/examples/alerts/alerts.md
+++ b/examples/alerts/alerts.md
@@ -409,6 +409,18 @@ rules:
   for: 10m
   labels:
     severity: critical
+- alert: ThanosQueryOverload
+  annotations:
+    description: Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos Query instances and the connected Prometheus instances, look for potential senders of these requests, and then contact support.
+    runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload
+    summary: Thanos Query reaches its maximum capacity serving concurrent requests.
+  expr: |
+    (
+      max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1
+    )
+  for: 15m
+  labels:
+    severity: warning
 ```
 
 ## Receive
diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml
index 9ff88681276..ed3a4521231 100644
--- a/examples/alerts/alerts.yaml
+++ b/examples/alerts/alerts.yaml
@@ -158,6 +158,18 @@ groups:
     for: 10m
     labels:
       severity: critical
+  - alert: ThanosQueryOverload
+    annotations:
+      description: Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos Query instances and the connected Prometheus instances, look for potential senders of these requests, and then contact support.
+      runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload
+      summary: Thanos Query reaches its maximum capacity serving concurrent requests.
+    expr: |
+      (
+        max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1
+      )
+    for: 15m
+    labels:
+      severity: warning
 - name: thanos-receive
   rules:
   - alert: ThanosReceiveHttpRequestErrorRateHigh
diff --git a/mixin/alerts/query.libsonnet b/mixin/alerts/query.libsonnet
index bbb33d855cd..80dd85287a2 100644
--- a/mixin/alerts/query.libsonnet
+++ b/mixin/alerts/query.libsonnet
@@ -142,6 +142,22 @@
             severity: 'critical',
           },
         },
+        {
+          alert: 'ThanosQueryOverload',
+          annotations: {
+            description: 'Thanos Query {{$labels.job}}%s has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos Query instances and the connected Prometheus instances, look for potential senders of these requests, and then contact support.' % location,
+            summary: 'Thanos Query reaches its maximum capacity serving concurrent requests.',
+          },
+          expr: |||
+            (
+              max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1
+            )
+          ||| % thanos.query,
+          'for': '15m',
+          labels: {
+            severity: 'warning',
+          },
+        },
       ],
     },
   ],
diff --git a/mixin/runbook.md b/mixin/runbook.md
index 967c9bcee9c..6f251c1be60 100755
--- a/mixin/runbook.md
+++ b/mixin/runbook.md
@@ -50,6 +50,7 @@
 |ThanosQueryHighDNSFailures|Thanos Query is having high number of DNS failures.|Thanos Query {{$labels.job}} have {{$value humanize}}% of failing DNS queries for store endpoints.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures)|
 |ThanosQueryInstantLatencyHigh|Thanos Query has high latency for queries.|Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh)|
 |ThanosQueryRangeLatencyHigh|Thanos Query has high latency for queries.|Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh)|
+|ThanosQueryOverload|Thanos Query reaches its maximum capacity serving concurrent requests.|Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos Query instances and the connected Prometheus instances, look for potential senders of these requests, and then contact support.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload)|
 
 ## thanos-receive
 
diff --git a/pkg/rules/rules_test.go b/pkg/rules/rules_test.go
index 032553ad477..436a695d1b7 100644
--- a/pkg/rules/rules_test.go
+++ b/pkg/rules/rules_test.go
@@ -59,7 +59,7 @@ func testRulesAgainstExamples(t *testing.T, dir string, server rulespb.RulesServ
 			Name: "thanos-query",
 			File: filepath.Join(dir, "alerts.yaml"),
 			Rules: []*rulespb.Rule{
-				someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert,
+				someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert,
 			},
 			Interval: 60,
 			PartialResponseStrategy: storepb.PartialResponseStrategy_ABORT,
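For reviewers who want to exercise the new rule locally, below is a minimal `promtool test rules` sketch. The test file name, the `job="thanos-query"` label value, and the gate capacity of 20 are assumptions for illustration, not part of this PR; the metric names, expression, and annotations come from the generated `examples/alerts/alerts.yaml` above. It pins `thanos_query_concurrent_gate_queries_in_flight` at `thanos_query_concurrent_gate_queries_max`, so the expression evaluates to `0 < 1` and the alert should fire once the 15-minute `for` period elapses:

```yaml
# thanos-query-overload_test.yaml (hypothetical file name).
# Run from examples/alerts/ with:
#   promtool test rules thanos-query-overload_test.yaml
rule_files:
  - alerts.yaml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # A gate sized for 20 concurrent queries (assumed capacity) that stays
      # fully saturated for 25 minutes: max - avg(in_flight) = 0, no free slots.
      - series: 'thanos_query_concurrent_gate_queries_max{job="thanos-query"}'
        values: '20x25'
      - series: 'thanos_query_concurrent_gate_queries_in_flight{job="thanos-query"}'
        values: '20x25'
    alert_rule_test:
      # The expression is true from the first evaluation, so the alert is
      # firing after the 15m `for` period; 20m is safely past that point.
      - eval_time: 20m
        alertname: ThanosQueryOverload
        exp_alerts:
          - exp_labels:
              severity: warning
              job: thanos-query
            exp_annotations:
              description: 'Thanos Query thanos-query has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos Query instances and the connected Prometheus instances, look for potential senders of these requests, and then contact support.'
              runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload
              summary: 'Thanos Query reaches its maximum capacity serving concurrent requests.'
```

A symmetric negative case (e.g. `in_flight` at `10x25` with `exp_alerts: []` at the same `eval_time`) would also pin down the `< 1` threshold, i.e. that the alert stays silent while at least one gate slot remains free on average.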