From 58b6ee346fccbe7d263ab549535bfddb8b222701 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 14 Mar 2024 10:04:12 +0100 Subject: [PATCH 1/9] dashboards: overview: use native histograms in status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow switching between basing status on classic or native version of cortex_request_duration_seconds. Related to #7154 Signed-off-by: György Krajcsovits --- CHANGELOG.md | 2 + .../dashboards/mimir-overview.json | 57 ++++++++++- .../dashboards/mimir-reads.json | 2 +- .../dashboards/mimir-writes.json | 2 +- .../dashboards/mimir-overview.json | 57 ++++++++++- .../dashboards/mimir-reads.json | 2 +- .../dashboards/mimir-writes.json | 2 +- .../dashboards/dashboard-queries.libsonnet | 99 +++++++++++++++---- .../mimir-mixin/dashboards/overview.libsonnet | 19 +++- operations/mimir-mixin/jsonnetfile.lock.json | 8 +- 10 files changed, 211 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c8c49022844..d40f6945686 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,8 @@ ### Mixin * [ENHANCEMENT] Alerts: allow configuring alerts range interval via `_config.base_alerts_range_interval_minutes`. #7591 +* [ENHANCEMENT] Dashboards: allow switching between using classic of native histograms in dashboards. #7627 + Overview dashboard, Status panel, `cortex_request_duration_seconds` metric. ### Jsonnet diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json index df6aa21b232..29ad0fd641f 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json @@ -81,7 +81,7 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", "instant": false, "legendFormat": "Writes", "range": true @@ -91,7 +91,27 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", 
route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", + "instant": false, + "legendFormat": "Writes", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", + "instant": false, + "legendFormat": "Reads", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", "instant": false, "legendFormat": "Reads", "range": true @@ -348,7 +368,7 @@ "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -670,7 +690,7 @@ "span": 3, "targets": [ { - 
"expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -1450,6 +1470,35 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": true, + "text": "classic", + "value": "1" + }, + "description": "Choose between showing latencies based on low precision classic or high precision native histogram metrics.", + "hide": 0, + "includeAll": false, + "label": "Latency metrics", + "multi": false, + "name": "latency_metrics", + "options": [ + { + "selected": false, + "text": "native", + "value": "-1" + }, + { + "selected": true, + "text": "classic", + "value": "1" + } + ], + "query": "native : -1,classic : 1", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json index 0f5b707d489..583aec28a6a 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json @@ -624,7 +624,7 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json index 7995372bfc4..309c7009072 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json @@ -623,7 +623,7 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n 
label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json index df6aa21b232..29ad0fd641f 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json @@ -81,7 +81,7 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", "instant": false, "legendFormat": "Writes", "range": true @@ -91,7 +91,27 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n < 
($latency_metrics * +Inf)", + "instant": false, + "legendFormat": "Writes", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", + "instant": false, + "legendFormat": "Reads", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", "instant": false, "legendFormat": "Reads", "range": true @@ -348,7 +368,7 @@ "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -670,7 +690,7 @@ "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -1450,6 +1470,35 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": true, + "text": "classic", + "value": "1" + }, + "description": "Choose between showing latencies based on low 
precision classic or high precision native histogram metrics.", + "hide": 0, + "includeAll": false, + "label": "Latency metrics", + "multi": false, + "name": "latency_metrics", + "options": [ + { + "selected": false, + "text": "native", + "value": "-1" + }, + { + "selected": true, + "text": "classic", + "value": "1" + } + ], + "query": "native : -1,classic : 1", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json index 1d05752d68f..8858735a47d 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json @@ -624,7 +624,7 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json index c91b1b084bd..d5cb7699c18 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json @@ -623,7 +623,7 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index ca75f19c490..30ffe12acc3 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -1,3 +1,5 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + { // This object contains common queries used in the Mimir dashboards. 
// These queries are NOT intended to be configurable or overriddeable via jsonnet, @@ -25,55 +27,102 @@ query_http_routes_regex: '(prometheus|api_prom)_api_v1_query(_range)?', gateway: { - writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}' % variables, - readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, + local p = self, + writeRequestsPerSecondMetric: 'cortex_request_duration_seconds', + writeRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"' % variables, + // deprecated, will be removed + writeRequestsPerSecond: '%s{%s}' % [p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector], + readRequestsPerSecondMetric: 'cortex_request_duration_seconds', + readRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, + // deprecated, will be removed + readRequestsPerSecond: '%s{%s}' % [p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector], // Write failures rate as percentage of total requests. - writeFailuresRate: ||| + writeFailuresRate:: { + local template = ||| ( - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval])) + # gRPC errors are not tracked as 5xx but "error". + sum(%(countFailQuery)s) or # Handle the case no failure has been tracked yet. vector(0) ) / - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + sum(%(countQuery)s) + |||, + classic: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"').classic, + countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).classic, + }, + native: template % { + countFailQuery: utils.nativeHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"').native, + countQuery: utils.nativeHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, + }, + }, // Read failures rate as percentage of total requests. - readFailuresRate: ||| + readFailuresRate:: { + local template = ||| ( - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval])) + # gRPC errors are not tracked as 5xx but "error". + sum(%(countFailQuery)s) or # Handle the case no failure has been tracked yet. 
vector(0) ) / - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + sum(%(countQuery)s) + |||, + classic: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"').classic, + countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).classic, + }, + native: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"').native, + countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).native, + }, + }, }, distributor: { - writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}' % variables, + local p = self, + writeRequestsPerSecondMetric: 'cortex_request_duration_seconds', + writeRequestsPerSecondSelector: '%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"' % variables, + // deprecated, will be removed + writeRequestsPerSecond: '%s{%s}' % [p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector], samplesPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_samples:rate5m{%(distributorMatcher)s})' % variables, exemplarsPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_exemplars:rate5m{%(distributorMatcher)s})' % variables, // Write failures rate as percentage of total requests. - writeFailuresRate: ||| + writeFailuresRate:: { + local template = ||| ( # gRPC errors are not tracked as 5xx but "error". - sum(rate(cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s",status_code=~"5.*|error"}[$__rate_interval])) + sum(%(countFailQuery)s) or # Handle the case no failure has been tracked yet. 
vector(0) ) / - sum(rate(cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + sum(%(countQuery)s) + |||, + classic: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"').classic, + countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).classic, + }, + native: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"').native, + countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, + }, + }, }, query_frontend: { - readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, + local p = self, + readRequestsPerSecondMetric: 'cortex_request_duration_seconds', + readRequestsPerSecondSelector: '%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, + // deprecated, will be removed + readRequestsPerSecond: '%s{%s}' % [p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector], instantQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query"}[$__rate_interval]))' % variables, rangeQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query_range"}[$__rate_interval]))' % variables, labelNamesQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_labels"}[$__rate_interval]))' % variables, @@ -88,16 +137,26 @@ otherQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_.*",route!~".*(query|query_range|label.*|series|read|metadata|query_exemplars|cardinality_.*)"}[$__rate_interval]))' % variables, // Read failures rate as percentage of total requests. - readFailuresRate: ||| + readFailuresRate:: { + local template = ||| ( - sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval])) + sum(%(countFailQuery)s) or # Handle the case no failure has been tracked yet. 
vector(0) ) / - sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + sum(%(countQuery)s) + |||, + classic: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"').classic, + countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).classic, + }, + native: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"').native, + countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).native, + }, + }, }, ruler: { diff --git a/operations/mimir-mixin/dashboards/overview.libsonnet b/operations/mimir-mixin/dashboards/overview.libsonnet index 5b754d73b61..bc3a7def14a 100644 --- a/operations/mimir-mixin/dashboards/overview.libsonnet +++ b/operations/mimir-mixin/dashboards/overview.libsonnet @@ -33,6 +33,7 @@ local filename = 'mimir-overview.json'; assert std.md5(filename) == 'ffcd83628d7d4b5a03d1cafd159e6c9c' : 'UID of the dashboard has changed, please update references to dashboard.'; ($.dashboard('Overview') + { uid: std.md5(filename) }) .addClusterSelectorTemplates() + .addShowNativeLatencyVariable() .addRow( $.row('%(product)s cluster health' % $._config) @@ -53,9 +54,21 @@ local filename = 'mimir-overview.json'; 'Status', [ // Write failures. - if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate else $.queries.distributor.writeFailuresRate, + utils.showNativeHistogramQuery( + if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate else $.queries.distributor.writeFailuresRate + ), + // Write failures but from classic histograms. + utils.showClassicHistogramQuery( + if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate else $.queries.distributor.writeFailuresRate + ), // Read failures. - if $._config.gateway_enabled then $.queries.gateway.readFailuresRate else $.queries.query_frontend.readFailuresRate, + utils.showNativeHistogramQuery( + if $._config.gateway_enabled then $.queries.gateway.readFailuresRate else $.queries.query_frontend.readFailuresRate, + ), + // Read failures but from classic histograms. + utils.showClassicHistogramQuery( + if $._config.gateway_enabled then $.queries.gateway.readFailuresRate else $.queries.query_frontend.readFailuresRate, + ), // Rule evaluation failures. $.queries.ruler.evaluations.failuresRate, // Alerting notifications. @@ -84,7 +97,7 @@ local filename = 'mimir-overview.json'; // Object storage failures. 
$.queries.storage.failuresRate, ], - ['Writes', 'Reads', 'Rule evaluations', 'Alerting notifications', 'Object storage'] + ['Writes', 'Writes', 'Reads', 'Reads', 'Rule evaluations', 'Alerting notifications', 'Object storage'] ) ) .addPanel( diff --git a/operations/mimir-mixin/jsonnetfile.lock.json b/operations/mimir-mixin/jsonnetfile.lock.json index 2dde18000fd..dcbb4bcca96 100644 --- a/operations/mimir-mixin/jsonnetfile.lock.json +++ b/operations/mimir-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "0098700428a0a4ee7d884d332d137caff5c52497", - "sum": "B49EzIY2WZsFxNMJcgRxE/gcZ9ltnS8pkOOV6Q5qioc=" + "version": "87011e1ef43aa4cd6ae2777b41eb54877c7790e9", + "sum": "+z5VY+bPBNqXcmNAV8xbJcbsRA+pro1R3IM7aIY8OlU=" }, { "source": { @@ -18,8 +18,8 @@ "subdir": "mixin-utils" } }, - "version": "0098700428a0a4ee7d884d332d137caff5c52497", - "sum": "EWPd0a5uU5x1vTuyyMbH+d41wrgem7v21c2p4jekkbA=" + "version": "87011e1ef43aa4cd6ae2777b41eb54877c7790e9", + "sum": "0jg7qc3N8FtMnnQbunYCGSNcjHr9Y1krZW9OSTmWcEQ=" } ], "legacyImports": false From 101c1d5123964d1b34ff008bef7a5416e1b2a52f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Wed, 20 Mar 2024 10:13:19 +0100 Subject: [PATCH 2/9] Format mixin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../dashboards/dashboard-queries.libsonnet | 106 +++++++++--------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index 30ffe12acc3..6dd979d2816 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -40,45 +40,45 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Write failures rate as percentage of total requests. writeFailuresRate:: { local template = ||| - ( - # gRPC errors are not tracked as 5xx but "error". - sum(%(countFailQuery)s) - or - # Handle the case no failure has been tracked yet. - vector(0) - ) - / - sum(%(countQuery)s) - |||, - classic: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"').classic, - countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).classic, - }, - native: template % { - countFailQuery: utils.nativeHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"').native, - countQuery: utils.nativeHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, - }, + ( + # gRPC errors are not tracked as 5xx but "error". + sum(%(countFailQuery)s) + or + # Handle the case no failure has been tracked yet. 
+ vector(0) + ) + / + sum(%(countQuery)s) + |||, + classic: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').classic, + countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).classic, + }, + native: template % { + countFailQuery: utils.nativeHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').native, + countQuery: utils.nativeHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, + }, }, // Read failures rate as percentage of total requests. readFailuresRate:: { local template = ||| - ( - # gRPC errors are not tracked as 5xx but "error". - sum(%(countFailQuery)s) - or - # Handle the case no failure has been tracked yet. - vector(0) - ) - / - sum(%(countQuery)s) - |||, + ( + # gRPC errors are not tracked as 5xx but "error". + sum(%(countFailQuery)s) + or + # Handle the case no failure has been tracked yet. + vector(0) + ) + / + sum(%(countQuery)s) + |||, classic: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"').classic, + countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector + ',status_code=~"5.*|error"').classic, countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).classic, }, native: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"').native, + countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector + ',status_code=~"5.*|error"').native, countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).native, }, }, @@ -96,22 +96,22 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Write failures rate as percentage of total requests. writeFailuresRate:: { local template = ||| - ( - # gRPC errors are not tracked as 5xx but "error". - sum(%(countFailQuery)s) - or - # Handle the case no failure has been tracked yet. - vector(0) - ) - / - sum(%(countQuery)s) - |||, + ( + # gRPC errors are not tracked as 5xx but "error". + sum(%(countFailQuery)s) + or + # Handle the case no failure has been tracked yet. 
+ vector(0) + ) + / + sum(%(countQuery)s) + |||, classic: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"').classic, + countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').classic, countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).classic, }, native: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"').native, + countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').native, countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, }, }, @@ -139,21 +139,21 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Read failures rate as percentage of total requests. readFailuresRate:: { local template = ||| - ( - sum(%(countFailQuery)s) - or - # Handle the case no failure has been tracked yet. - vector(0) - ) - / - sum(%(countQuery)s) - |||, + ( + sum(%(countFailQuery)s) + or + # Handle the case no failure has been tracked yet. + vector(0) + ) + / + sum(%(countQuery)s) + |||, classic: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"').classic, + countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector + ',status_code=~"5.*|error"').classic, countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).classic, }, native: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"').native, + countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector + ',status_code=~"5.*|error"').native, countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).native, }, }, From 497ba5baba6bb7e4c94c8f928caee3b41059545e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Wed, 20 Mar 2024 10:15:09 +0100 Subject: [PATCH 3/9] Missing asterisk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d40f6945686..d8432b831cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ * [ENHANCEMENT] Alerts: allow configuring alerts range interval via `_config.base_alerts_range_interval_minutes`. #7591 * [ENHANCEMENT] Dashboards: allow switching between using classic of native histograms in dashboards. #7627 - Overview dashboard, Status panel, `cortex_request_duration_seconds` metric. + * Overview dashboard, Status panel, `cortex_request_duration_seconds` metric. 
### Jsonnet From 385cf113fe3c9d26356eae78537d8c416685ef5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Wed, 20 Mar 2024 10:49:33 +0100 Subject: [PATCH 4/9] Update helm chart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../charts/mimir-distributed/CHANGELOG.md | 3 + .../metamonitoring/grafana-dashboards.yaml | 61 +++++++++++++++++-- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/operations/helm/charts/mimir-distributed/CHANGELOG.md b/operations/helm/charts/mimir-distributed/CHANGELOG.md index 1cab9b66e14..f1c0658c447 100644 --- a/operations/helm/charts/mimir-distributed/CHANGELOG.md +++ b/operations/helm/charts/mimir-distributed/CHANGELOG.md @@ -58,6 +58,9 @@ Entries should include a reference to the Pull Request that introduced the chang * [ENHANCEMENT] Make the PSP template configurable via `rbac.podSecurityPolicy`. #7190 * [ENHANCEMENT] Recording rules: add native histogram recording rules to `cortex_request_duration_seconds`. #7528 * [ENHANCEMENT] Make the port used in ServiceMonitor for kube-state-metrics configurable. #7507 +* [ENHANCEMENT] Dashboards: allow switching between using classic of native histograms in dashboards. #7627 + * Overview dashboard, Status panel, `cortex_request_duration_seconds` metric. + * [BUGFIX] Metamonitoring: update dashboards to drop unsupported `step` parameter in targets. #7157 * [BUGFIX] Recording rules: drop rules for metrics removed in 2.0: `cortex_memcache_request_duration_seconds` and `cortex_cache_request_duration_seconds`. #7514 * [BUGFIX] Store-gateway: setting "resources.requests.memory" with a quantity that used power-of-ten SI suffix, caused an error. #7506 diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml index 9b8e3ce6946..c3a1e35b3dd 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml @@ -9441,7 +9441,7 @@ data: "uid": "$datasource" }, "exemplar": false, - "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n 
vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", "instant": false, "legendFormat": "Writes", "range": true @@ -9451,7 +9451,27 @@ data: "uid": "$datasource" }, "exemplar": false, - "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", + "instant": false, + "legendFormat": "Writes", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", + "instant": false, + "legendFormat": "Reads", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", "instant": false, "legendFormat": "Reads", "range": true @@ -9708,7 +9728,7 @@ data: "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n 
\"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -10030,7 +10050,7 @@ data: "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -10810,6 +10830,35 @@ data: "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": true, + "text": "classic", + "value": "1" + }, + "description": "Choose between showing latencies based on low precision classic or high precision native histogram metrics.", + "hide": 0, + "includeAll": false, + "label": "Latency metrics", + "multi": false, + "name": "latency_metrics", + "options": [ + { + "selected": false, + "text": "native", + "value": "-1" + }, + { + "selected": true, + "text": "classic", + "value": "1" + } + ], + "query": "native : -1,classic : 1", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, @@ -18025,7 +18074,7 @@ data: "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -37557,7 +37606,7 @@ data: "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n 
label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" From 3d40f6aafd322a40af854b126c0ce3bfa539a7f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Wed, 20 Mar 2024 14:27:45 +0100 Subject: [PATCH 5/9] Do not change deprecated selector, just deprecate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../metamonitoring/grafana-dashboards.yaml | 8 ++++---- .../dashboards/mimir-overview.json | 4 ++-- .../dashboards/mimir-reads.json | 2 +- .../dashboards/mimir-writes.json | 2 +- .../dashboards/mimir-overview.json | 4 ++-- .../dashboards/mimir-reads.json | 2 +- .../dashboards/mimir-writes.json | 2 +- .../dashboards/dashboard-queries.libsonnet | 18 ++++++++++-------- 8 files changed, 22 insertions(+), 20 deletions(-) diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml index c3a1e35b3dd..be111989052 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml @@ -9728,7 +9728,7 @@ data: "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -10050,7 +10050,7 @@ data: "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", 
"format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -18074,7 +18074,7 @@ data: "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -37606,7 +37606,7 @@ data: "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json index 29ad0fd641f..aeac3f9266e 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json @@ -368,7 +368,7 @@ "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -690,7 +690,7 @@ "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", 
\"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json index 583aec28a6a..0f5b707d489 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json @@ -624,7 +624,7 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json index 309c7009072..7995372bfc4 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json @@ -623,7 +623,7 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json index 29ad0fd641f..aeac3f9266e 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json @@ -368,7 +368,7 @@ "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", 
job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -690,7 +690,7 @@ "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json index 8858735a47d..1d05752d68f 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json @@ -624,7 +624,7 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json index d5cb7699c18..c91b1b084bd 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json @@ -623,7 +623,7 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n 
\"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index 6dd979d2816..2590daddc79 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -27,15 +27,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; query_http_routes_regex: '(prometheus|api_prom)_api_v1_query(_range)?', gateway: { + // deprecated, will be removed + writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}' % variables, + readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, + local p = self, writeRequestsPerSecondMetric: 'cortex_request_duration_seconds', writeRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"' % variables, - // deprecated, will be removed - writeRequestsPerSecond: '%s{%s}' % [p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector], readRequestsPerSecondMetric: 'cortex_request_duration_seconds', readRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, - // deprecated, will be removed - readRequestsPerSecond: '%s{%s}' % [p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector], // Write failures rate as percentage of total requests. 
writeFailuresRate:: { @@ -85,11 +85,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, distributor: { + // deprecated, will be removed + writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}' % variables, + local p = self, writeRequestsPerSecondMetric: 'cortex_request_duration_seconds', writeRequestsPerSecondSelector: '%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"' % variables, - // deprecated, will be removed - writeRequestsPerSecond: '%s{%s}' % [p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector], samplesPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_samples:rate5m{%(distributorMatcher)s})' % variables, exemplarsPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_exemplars:rate5m{%(distributorMatcher)s})' % variables, @@ -118,11 +119,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, query_frontend: { + // deprecated, will be removed + readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, + local p = self, readRequestsPerSecondMetric: 'cortex_request_duration_seconds', readRequestsPerSecondSelector: '%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, - // deprecated, will be removed - readRequestsPerSecond: '%s{%s}' % [p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector], instantQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query"}[$__rate_interval]))' % variables, rangeQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query_range"}[$__rate_interval]))' % variables, labelNamesQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_labels"}[$__rate_interval]))' % variables, From a85c39a86e99f2a26e15baadce12e3d891bb7b6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 21 Mar 2024 09:14:18 +0100 Subject: [PATCH 6/9] Remove extra line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bbdf8454753..c379f3de23a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,7 +25,6 @@ * Overview dashboard, Status panel, `cortex_request_duration_seconds` metric. * [BUGFIX] Dashobards: Fix regular expression for matching read-path gRPC ingester methods to include querying of exemplars, label-related queries, or active series queries. 
#7676 - ### Jsonnet ### Mimirtool From 61f4291d49a9f8cb6e0e07dbef55277317870cfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 21 Mar 2024 09:14:48 +0100 Subject: [PATCH 7/9] Remove extra line 2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- operations/helm/charts/mimir-distributed/CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/operations/helm/charts/mimir-distributed/CHANGELOG.md b/operations/helm/charts/mimir-distributed/CHANGELOG.md index f1c0658c447..5a3c0066da5 100644 --- a/operations/helm/charts/mimir-distributed/CHANGELOG.md +++ b/operations/helm/charts/mimir-distributed/CHANGELOG.md @@ -60,7 +60,6 @@ Entries should include a reference to the Pull Request that introduced the chang * [ENHANCEMENT] Make the port used in ServiceMonitor for kube-state-metrics configurable. #7507 * [ENHANCEMENT] Dashboards: allow switching between using classic or native histograms in dashboards. #7627 * Overview dashboard, Status panel, `cortex_request_duration_seconds` metric. - * [BUGFIX] Metamonitoring: update dashboards to drop unsupported `step` parameter in targets. #7157 * [BUGFIX] Recording rules: drop rules for metrics removed in 2.0: `cortex_memcache_request_duration_seconds` and `cortex_cache_request_duration_seconds`. #7514 * [BUGFIX] Store-gateway: setting "resources.requests.memory" with a quantity that used power-of-ten SI suffix, caused an error. #7506 From 10013fc016483d1483d8785dde0cca0a78fbcf48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 4 Apr 2024 15:33:48 +0200 Subject: [PATCH 8/9] Use only the module.requestsPerSecondMetric variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit instead of the separate but identical readRequestsPerSecondMetric and writeRequestsPerSecondMetric fields Signed-off-by: György Krajcsovits --- .../dashboards/dashboard-queries.libsonnet | 21 +++++++++---------- operations/mimir-mixin/jsonnetfile.lock.json | 4 ++-- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index 044cbee7d38..228a35784be 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -32,9 +32,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, local p = self, - writeRequestsPerSecondMetric: 'cortex_request_duration_seconds', + requestsPerSecondMetric: 'cortex_request_duration_seconds', writeRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"' % variables, - readRequestsPerSecondMetric: 'cortex_request_duration_seconds', readRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, // Write failures rate as percentage of total requests. 
@@ -51,12 +50,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; sum(%(countQuery)s) |||, classic: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').classic, - countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).classic, + countFailQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').classic, + countQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector).classic, }, native: template % { - countFailQuery: utils.nativeHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').native, - countQuery: utils.nativeHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, + countFailQuery: utils.nativeHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').native, + countQuery: utils.nativeHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, }, }, @@ -89,7 +88,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}' % variables, local p = self, - writeRequestsPerSecondMetric: 'cortex_request_duration_seconds', + requestsPerSecondMetric: 'cortex_request_duration_seconds', writeRequestsPerSecondSelector: '%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"' % variables, samplesPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_samples:rate5m{%(distributorMatcher)s})' % variables, exemplarsPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_exemplars:rate5m{%(distributorMatcher)s})' % variables, @@ -108,12 +107,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; sum(%(countQuery)s) |||, classic: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').classic, - countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).classic, + countFailQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').classic, + countQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector).classic, }, native: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').native, - countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, + countFailQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').native, + countQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, }, }, }, diff --git a/operations/mimir-mixin/jsonnetfile.lock.json b/operations/mimir-mixin/jsonnetfile.lock.json index 3cad418eca0..eb7c7fcc195 100644 --- a/operations/mimir-mixin/jsonnetfile.lock.json +++ b/operations/mimir-mixin/jsonnetfile.lock.json @@ -8,7 +8,7 @@ 
"subdir": "grafana-builder" } }, - "version": "7561fd330312538d22b00e0c7caecb4ba66321ea", + "version": "f95501009c9b29bed87fe9d57c1a6e72e210f137", "sum": "+z5VY+bPBNqXcmNAV8xbJcbsRA+pro1R3IM7aIY8OlU=" }, { @@ -18,7 +18,7 @@ "subdir": "mixin-utils" } }, - "version": "7561fd330312538d22b00e0c7caecb4ba66321ea", + "version": "f95501009c9b29bed87fe9d57c1a6e72e210f137", "sum": "0jg7qc3N8FtMnnQbunYCGSNcjHr9Y1krZW9OSTmWcEQ=" } ], From 304205106ec52552d80e6a45419888cdcf72e718 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 4 Apr 2024 16:02:44 +0200 Subject: [PATCH 9/9] Factor out failure rate query templating MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../metamonitoring/grafana-dashboards.yaml | 4 +- .../dashboards/mimir-overview.json | 4 +- .../dashboards/mimir-overview.json | 4 +- .../dashboards/dashboard-queries.libsonnet | 111 +++++------------- 4 files changed, 34 insertions(+), 89 deletions(-) diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml index e961fb482f9..15979c0f123 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml @@ -9461,7 +9461,7 @@ data: "uid": "$datasource" }, "exemplar": false, - "expr": "(\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", "instant": false, "legendFormat": "Reads", "range": true @@ -9471,7 +9471,7 @@ data: "uid": "$datasource" }, "exemplar": false, - "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", + "expr": 
"(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", "instant": false, "legendFormat": "Reads", "range": true diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json index 13f4198d724..4cec7e767ae 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json @@ -101,7 +101,7 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", "instant": false, "legendFormat": "Reads", "range": true @@ -111,7 +111,7 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", 
route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", "instant": false, "legendFormat": "Reads", "range": true diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json index 13f4198d724..4cec7e767ae 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json @@ -101,7 +101,7 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", "instant": false, "legendFormat": "Reads", "range": true @@ -111,7 +111,7 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", "instant": false, "legendFormat": "Reads", "range": true diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index 228a35784be..70bbeb09c4a 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -1,6 +1,30 @@ local utils = import 'mixin-utils/utils.libsonnet'; { + // Helper function to produce failure rate in percentage queries for 
native and classic histograms. + // Takes a metric name and a selector as strings and returns a dictionary with classic and native queries. + nativeClassicFailureRate(metric, selector):: { + local template = ||| + ( + # gRPC errors are not tracked as 5xx but "error". + sum(%(countFailQuery)s) + or + # Handle the case no failure has been tracked yet. + vector(0) + ) + / + sum(%(countQuery)s) + |||, + classic: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(metric, selector + ',status_code=~"5.*|error"').classic, + countQuery: utils.nativeClassicHistogramCountRate(metric, selector).classic, + }, + native: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(metric, selector + ',status_code=~"5.*|error"').native, + countQuery: utils.nativeClassicHistogramCountRate(metric, selector).native, + }, + }, + // This object contains common queries used in the Mimir dashboards. // These queries are NOT intended to be configurable or overriddeable via jsonnet, // but they're defined in a common place just to share them between different dashboards. @@ -37,50 +61,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; readRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, // Write failures rate as percentage of total requests. - writeFailuresRate:: { - local template = ||| - ( - # gRPC errors are not tracked as 5xx but "error". - sum(%(countFailQuery)s) - or - # Handle the case no failure has been tracked yet. - vector(0) - ) - / - sum(%(countQuery)s) - |||, - classic: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').classic, - countQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector).classic, - }, - native: template % { - countFailQuery: utils.nativeHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').native, - countQuery: utils.nativeHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, - }, - }, + writeFailuresRate: $.nativeClassicFailureRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector), // Read failures rate as percentage of total requests. - readFailuresRate:: { - local template = ||| - ( - # gRPC errors are not tracked as 5xx but "error". - sum(%(countFailQuery)s) - or - # Handle the case no failure has been tracked yet. 
- vector(0) - ) - / - sum(%(countQuery)s) - |||, - classic: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector + ',status_code=~"5.*|error"').classic, - countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).classic, - }, - native: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector + ',status_code=~"5.*|error"').native, - countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).native, - }, - }, + readFailuresRate: $.nativeClassicFailureRate(p.requestsPerSecondMetric, p.readRequestsPerSecondSelector), }, distributor: { @@ -94,27 +78,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; exemplarsPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_exemplars:rate5m{%(distributorMatcher)s})' % variables, // Write failures rate as percentage of total requests. - writeFailuresRate:: { - local template = ||| - ( - # gRPC errors are not tracked as 5xx but "error". - sum(%(countFailQuery)s) - or - # Handle the case no failure has been tracked yet. - vector(0) - ) - / - sum(%(countQuery)s) - |||, - classic: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').classic, - countQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector).classic, - }, - native: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').native, - countQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, - }, - }, + writeFailuresRate: $.nativeClassicFailureRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector), }, query_frontend: { @@ -174,26 +138,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; labelValuesCardinalityQueriesPerSecond: queryPerSecond('labelValuesCardinality'), // Read failures rate as percentage of total requests. - readFailuresRate:: { - local template = ||| - ( - sum(%(countFailQuery)s) - or - # Handle the case no failure has been tracked yet. - vector(0) - ) - / - sum(%(countQuery)s) - |||, - classic: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector + ',status_code=~"5.*|error"').classic, - countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).classic, - }, - native: template % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector + ',status_code=~"5.*|error"').native, - countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).native, - }, - }, + readFailuresRate: $.nativeClassicFailureRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector), }, ruler: {