diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3ccdaee136..99c3d56d33 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -30,6 +30,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
 - [#5555](https://github.com/thanos-io/thanos/pull/5555) Query: Added `--query.active-query-path` flag, allowing the user to configure the directory to create an active query tracking file, `queries.active`, for different resolution.
 - [#5566](https://github.com/thanos-io/thanos/pull/5566) Receive: Added experimental support to enable chunk write queue via `--tsdb.write-queue-size` flag.
 - [#5575](https://github.com/thanos-io/thanos/pull/5575) Receive: Add support for gRPC compression with snappy.
+- [#5439](https://github.com/thanos-io/thanos/pull/5439) Mixin: Add alert ThanosQueryOverload.

 ### Changed

diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md
index f91ecafa68..44c4a0529a 100644
--- a/examples/alerts/alerts.md
+++ b/examples/alerts/alerts.md
@@ -409,6 +409,18 @@ rules:
   for: 10m
   labels:
     severity: critical
+- alert: ThanosQueryOverload
+  annotations:
+    description: Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, poor performance of the Prometheus API, or failures within these components. Assess the health of the Thanos Query instances and the connected Prometheus instances, look for potential senders of these requests, and then contact support.
+    runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload
+    summary: Thanos Query reaches its maximum capacity serving concurrent requests.
+  expr: |
+    (
+      max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1
+    )
+  for: 15m
+  labels:
+    severity: warning
 ```

 ## Receive

diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml
index 9ff8868127..ed3a452123 100644
--- a/examples/alerts/alerts.yaml
+++ b/examples/alerts/alerts.yaml
@@ -158,6 +158,18 @@ groups:
     for: 10m
     labels:
       severity: critical
+  - alert: ThanosQueryOverload
+    annotations:
+      description: Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, poor performance of the Prometheus API, or failures within these components. Assess the health of the Thanos Query instances and the connected Prometheus instances, look for potential senders of these requests, and then contact support.
+      runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload
+      summary: Thanos Query reaches its maximum capacity serving concurrent requests.
+    expr: |
+      (
+        max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1
+      )
+    for: 15m
+    labels:
+      severity: warning
 - name: thanos-receive
   rules:
   - alert: ThanosReceiveHttpRequestErrorRateHigh
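The rule alerts on the headroom of the query concurrency gate: the configured limit (`thanos_query_concurrent_gate_queries_max`) minus the 5-minute average of in-flight gated queries (`thanos_query_concurrent_gate_queries_in_flight`); once that headroom stays below 1 for 15 minutes, the alert fires at `warning` severity. Not part of this change, but the generated rule above can be exercised with `promtool test rules`. The following is only a sketch: the test file name is hypothetical, it assumes it sits next to `examples/alerts/alerts.yaml`, that the generated file is consumable by vanilla promtool, and it uses an illustrative `job="thanos-query"` series.

```yaml
# query_overload_test.yaml (hypothetical) -- run with: promtool test rules query_overload_test.yaml
rule_files:
  - alerts.yaml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # The gate limit stays at 20 while, on average, all 20 slots are in flight,
      # so the 5m headroom (max - avg in-flight) is 0, i.e. below 1.
      - series: 'thanos_query_concurrent_gate_queries_max{job="thanos-query"}'
        values: '20+0x30'
      - series: 'thanos_query_concurrent_gate_queries_in_flight{job="thanos-query"}'
        values: '20+0x30'
    alert_rule_test:
      - eval_time: 20m
        alertname: ThanosQueryOverload
        exp_alerts:
          - exp_labels:
              severity: warning
              job: thanos-query
```

The 20m evaluation time leaves room for the 15m `for` clause, so the alert is expected to be firing rather than pending.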
diff --git a/mixin/alerts/query.libsonnet b/mixin/alerts/query.libsonnet
index bbb33d855c..80dd85287a 100644
--- a/mixin/alerts/query.libsonnet
+++ b/mixin/alerts/query.libsonnet
@@ -142,6 +142,22 @@
             severity: 'critical',
           },
         },
+        {
+          alert: 'ThanosQueryOverload',
+          annotations: {
+            description: 'Thanos Query {{$labels.job}}%s has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, poor performance of the Prometheus API, or failures within these components. Assess the health of the Thanos Query instances and the connected Prometheus instances, look for potential senders of these requests, and then contact support.' % location,
+            summary: 'Thanos Query reaches its maximum capacity serving concurrent requests.',
+          },
+          expr: |||
+            (
+              max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1
+            )
+          ||| % thanos.query,
+          'for': '15m',
+          labels: {
+            severity: 'warning',
+          },
+        },
       ],
     },
   ],
diff --git a/mixin/runbook.md b/mixin/runbook.md
index 967c9bcee9..6f251c1be6 100755
--- a/mixin/runbook.md
+++ b/mixin/runbook.md
@@ -50,6 +50,7 @@
 |ThanosQueryHighDNSFailures|Thanos Query is having high number of DNS failures.|Thanos Query {{$labels.job}} have {{$value humanize}}% of failing DNS queries for store endpoints.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures)|
 |ThanosQueryInstantLatencyHigh|Thanos Query has high latency for queries.|Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh)|
 |ThanosQueryRangeLatencyHigh|Thanos Query has high latency for queries.|Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh)|
+|ThanosQueryOverload|Thanos Query reaches its maximum capacity serving concurrent requests.|Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, poor performance of the Prometheus API, or failures within these components. Assess the health of the Thanos Query instances and the connected Prometheus instances, look for potential senders of these requests, and then contact support.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload)|

 ## thanos-receive

diff --git a/pkg/rules/rules_test.go b/pkg/rules/rules_test.go
index 032553ad47..436a695d1b 100644
--- a/pkg/rules/rules_test.go
+++ b/pkg/rules/rules_test.go
@@ -59,7 +59,7 @@ func testRulesAgainstExamples(t *testing.T, dir string, server rulespb.RulesServ
             Name: "thanos-query",
             File: filepath.Join(dir, "alerts.yaml"),
             Rules: []*rulespb.Rule{
-                someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert,
+                someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert,
             },
             Interval: 60,
             PartialResponseStrategy: storepb.PartialResponseStrategy_ABORT,
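The runbook entry asks the operator to assess how saturated the gate is; for dashboards, the same headroom the alert evaluates can be precomputed with a recording rule. This is only a sketch, not part of this change, and the group and record names are illustrative.

```yaml
# Hypothetical companion recording rule: precompute the concurrency-gate headroom
# that ThanosQueryOverload alerts on, so it can be graphed per job.
groups:
  - name: thanos-query-extras
    rules:
      # Headroom over the last 5 minutes; the alert fires when this stays below 1 for 15 minutes.
      - record: job:thanos_query_concurrent_gate_headroom:avg5m
        expr: |
          max_over_time(thanos_query_concurrent_gate_queries_max[5m])
          - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m])
```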