From b425b3c33e9bd2a29c70c98822c2de7e56187dbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Wed, 10 Jan 2024 10:18:50 +0100 Subject: [PATCH 01/14] Add native histogram support to mimir-mixin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recording rules, alerts, dashboards. Signed-off-by: György Krajcsovits --- .../dashboards/mimir-writes.json | 23 ++++++++----------- .../dashboards/mimir-writes.json | 23 ++++++++----------- .../dashboards/dashboard-queries.libsonnet | 7 ++++++ .../mimir-mixin/dashboards/writes.libsonnet | 8 +++---- operations/mimir-mixin/jsonnetfile.json | 2 +- operations/mimir-mixin/jsonnetfile.lock.json | 6 ++--- 6 files changed, 35 insertions(+), 34 deletions(-) diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json index 31ef17f3438..85e71fea4a6 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json @@ -490,7 +490,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", 
job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -515,7 +515,7 @@ }, "yaxes": [ { - "format": "reqps", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -564,25 +564,22 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3", + "expr": "(histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval]))) or\n histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) by (le))) * 1e3\n", "format": "time_series", - "legendFormat": "99th percentile", - "refId": "A", - "step": 10 + "legendFormat": "99th Percentile", + "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", 
job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3", + "expr": "(histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval]))) or\n histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) by (le))) * 1e3\n", "format": "time_series", - "legendFormat": "50th percentile", - "refId": "B", - "step": 10 + "legendFormat": "50th Percentile", + "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})", + "expr": "sum(\n histogram_sum(rate(cortex_request_duration_seconds[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": 
\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) or rate(cortex_request_duration_seconds_sum[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) * 1e3 /\n sum(histogram_count(rate(cortex_request_duration_seconds[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) or rate(cortex_request_duration_seconds_count[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval]))\n", "format": "time_series", "legendFormat": "Average", - "refId": "C", - "step": 10 + "refId": "C" } ], "thresholds": [ ], diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json index 89cfc40d7c9..b3cb15b1912 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json @@ -490,7 +490,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", 
route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -515,7 +515,7 @@ }, "yaxes": [ { - "format": "reqps", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -564,25 +564,22 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3", + "expr": "(histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval]))) or\n histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[{\"label\": \"cluster\", \"op\": \"=~\", 
\"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) by (le))) * 1e3\n", "format": "time_series", - "legendFormat": "99th percentile", - "refId": "A", - "step": 10 + "legendFormat": "99th Percentile", + "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3", + "expr": "(histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval]))) or\n histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) by (le))) * 1e3\n", "format": "time_series", - "legendFormat": "50th percentile", - "refId": "B", - "step": 10 + "legendFormat": "50th Percentile", + "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", 
route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})", + "expr": "sum(\n histogram_sum(rate(cortex_request_duration_seconds[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) or rate(cortex_request_duration_seconds_sum[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) * 1e3 /\n sum(histogram_count(rate(cortex_request_duration_seconds[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) or rate(cortex_request_duration_seconds_count[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval]))\n", "format": "time_series", "legendFormat": "Average", - "refId": "C", - "step": 10 + "refId": "C" } 
], "thresholds": [ ], diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index a75420bd234..d9a5a69d3dc 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -28,6 +28,11 @@ writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}' % variables, readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, + writeRequestsPerSecondMetric: 'cortex_request_duration_seconds', + writeRequestsPerSecondSelector: '{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}' % variables, + readRequestsPerSecondMetric: 'cortex_request_duration_seconds', + readRequestsPerSecondSelector: '{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, + // Write failures rate as percentage of total requests. 
writeFailuresRate: ||| ( @@ -55,6 +60,8 @@ distributor: { writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}' % variables, + writeRequestsPerSecondMetric: 'cortex_request_duration_seconds', + writeRequestsPerSecondSelector: '{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}' % variables, samplesPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_samples:rate5m{%(distributorMatcher)s})' % variables, exemplarsPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_exemplars:rate5m{%(distributorMatcher)s})' % variables, diff --git a/operations/mimir-mixin/dashboards/writes.libsonnet b/operations/mimir-mixin/dashboards/writes.libsonnet index 58041cf4473..e4bb29d3174 100644 --- a/operations/mimir-mixin/dashboards/writes.libsonnet +++ b/operations/mimir-mixin/dashboards/writes.libsonnet @@ -107,11 +107,11 @@ local filename = 'mimir-writes.json'; $.row('Gateway') .addPanel( $.panel('Requests / sec') + - $.qpsPanel($.queries.gateway.writeRequestsPerSecond) + $.qpsPanelNativeHistogram($.queries.gateway.writeRequestsPerSecondMetric, $.queries.gateway.writeRequestsPerSecondSelector) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.write_http_routes_regex)]) + $.latencyPanelNativeHistogram('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.write_http_routes_regex)]) ) .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + @@ -133,7 +133,7 @@ local filename = 'mimir-writes.json'; When distributor is not configured to use "early" request rejection, then rejected requests are also counted as "errors". 
||| ) + - $.qpsPanel($.queries.distributor.writeRequestsPerSecond) + + $.qpsPanelNativeHistogram($.queries.distributor.writeRequestsPerSecondMetric, $.queries.distributor.writeRequestsPerSecondSelector) + if $._config.show_rejected_requests_on_writes_dashboard then { targets: [ { @@ -152,7 +152,7 @@ local filename = 'mimir-writes.json'; ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|%s' % $.queries.write_http_routes_regex)]) + $.latencyPanelNativeHistogram('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|%s' % $.queries.write_http_routes_regex)]) ) .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + diff --git a/operations/mimir-mixin/jsonnetfile.json b/operations/mimir-mixin/jsonnetfile.json index 3f1547aaebd..a720598df46 100644 --- a/operations/mimir-mixin/jsonnetfile.json +++ b/operations/mimir-mixin/jsonnetfile.json @@ -8,7 +8,7 @@ "subdir": "grafana-builder" } }, - "version": "master" + "version": "1bc39b51eee1d697539b61fd78528207881645f4" }, { "source": { diff --git a/operations/mimir-mixin/jsonnetfile.lock.json b/operations/mimir-mixin/jsonnetfile.lock.json index 8164014edf0..6ed9cf0b890 100644 --- a/operations/mimir-mixin/jsonnetfile.lock.json +++ b/operations/mimir-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "02db06f540086fa3f67d487bd01e1b314853fb8f", - "sum": "B49EzIY2WZsFxNMJcgRxE/gcZ9ltnS8pkOOV6Q5qioc=" + "version": "1bc39b51eee1d697539b61fd78528207881645f4", + "sum": "U0bOj0L36filr/yYqM0oaftUJFVCLPDUWQNO7SOIbXU=" }, { "source": { @@ -18,7 +18,7 @@ "subdir": "mixin-utils" } }, - "version": "02db06f540086fa3f67d487bd01e1b314853fb8f", + "version": "359f7602f0dd83515c8596499d25df2b7274fa84", "sum": 
"PGf+vyCHqGxxS6SKNZiN3vR1xPnw6VOESXbeJrA5FaA=" } ], From 4e04b71358dfa1c1e41b48bab4b81ee531f10fd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Wed, 17 Jan 2024 15:47:39 +0100 Subject: [PATCH 02/14] Use predefined variables for latency panel metric and selector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same as for QPS panel. Signed-off-by: György Krajcsovits --- .../dashboards/mimir-writes.json | 6 +++--- .../mimir-mixin-compiled/dashboards/mimir-writes.json | 6 +++--- operations/mimir-mixin/dashboards/writes.libsonnet | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json index 85e71fea4a6..52ef33fad5c 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json @@ -564,19 +564,19 @@ "steppedLine": false, "targets": [ { - "expr": "(histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval]))) or\n histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) by (le))) * 1e3\n", + "expr": "(histogram_quantile(0.99, 
sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or\n histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) by (le))) * 1e3\n", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "(histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval]))) or\n histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) by (le))) * 1e3\n", + "expr": "(histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or\n histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", 
route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) by (le))) * 1e3\n", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(\n histogram_sum(rate(cortex_request_duration_seconds[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) or rate(cortex_request_duration_seconds_sum[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) * 1e3 /\n sum(histogram_count(rate(cortex_request_duration_seconds[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) or rate(cortex_request_duration_seconds_count[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval]))\n", + "expr": "sum(\n histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", 
route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) * 1e3 /\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", "format": "time_series", "legendFormat": "Average", "refId": "C" diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json index b3cb15b1912..e2791f097a2 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json @@ -564,19 +564,19 @@ "steppedLine": false, "targets": [ { - "expr": "(histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval]))) or\n histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", 
\"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) by (le))) * 1e3\n", + "expr": "(histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or\n histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) by (le))) * 1e3\n", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "(histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval]))) or\n histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) by (le))) * 1e3\n", + "expr": "(histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or\n histogram_quantile(0.50, 
sum(rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) by (le))) * 1e3\n", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(\n histogram_sum(rate(cortex_request_duration_seconds[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) or rate(cortex_request_duration_seconds_sum[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) * 1e3 /\n sum(histogram_count(rate(cortex_request_duration_seconds[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval])) or rate(cortex_request_duration_seconds_count[{\"label\": \"cluster\", \"op\": \"=~\", \"value\": \"$cluster\"}, {\"label\": \"job\", \"op\": \"=~\", \"value\": \"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\"}, {\"label\": \"route\", \"op\": \"=~\", \"value\": \"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}][$__rate_interval]))\n", + "expr": "sum(\n histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", 
job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) * 1e3 /\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", "format": "time_series", "legendFormat": "Average", "refId": "C" diff --git a/operations/mimir-mixin/dashboards/writes.libsonnet b/operations/mimir-mixin/dashboards/writes.libsonnet index e4bb29d3174..9faa319a33c 100644 --- a/operations/mimir-mixin/dashboards/writes.libsonnet +++ b/operations/mimir-mixin/dashboards/writes.libsonnet @@ -152,7 +152,7 @@ local filename = 'mimir-writes.json'; ) .addPanel( $.panel('Latency') + - $.latencyPanelNativeHistogram('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|%s' % $.queries.write_http_routes_regex)]) + $.latencyPanelNativeHistogram($.queries.distributor.writeRequestsPerSecondMetric, $.queries.distributor.writeRequestsPerSecondSelector) ) .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + From d785c2a1ec1a549725e39b27d1407288186b02d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Wed, 17 Jan 2024 15:52:18 +0100 Subject: [PATCH 03/14] Use 
variable for gateway write latency panel instead of duplicate code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- operations/mimir-mixin/dashboards/writes.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/mimir-mixin/dashboards/writes.libsonnet b/operations/mimir-mixin/dashboards/writes.libsonnet index 9faa319a33c..61d84c54b97 100644 --- a/operations/mimir-mixin/dashboards/writes.libsonnet +++ b/operations/mimir-mixin/dashboards/writes.libsonnet @@ -111,7 +111,7 @@ local filename = 'mimir-writes.json'; ) .addPanel( $.panel('Latency') + - $.latencyPanelNativeHistogram('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.write_http_routes_regex)]) + $.latencyPanelNativeHistogram($.queries.gateway.writeRequestsPerSecondMetric, $.queries.gateway.writeRequestsPerSecondSelector) ) .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + From dc5d71d518ed09622c8e0f046f6a9cf8e0182ad8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 18 Jan 2024 07:30:07 +0100 Subject: [PATCH 04/14] Add qpsPanelNativeHistogram yaxes override MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise yaxes is changed from "reqps" to "short" in the generated dashboard. 
Signed-off-by: György Krajcsovits --- .../dashboards/mimir-writes.json | 2 +- operations/mimir-mixin-compiled/dashboards/mimir-writes.json | 2 +- operations/mimir-mixin/dashboards/dashboard-utils.libsonnet | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json index 52ef33fad5c..7c9d29d3975 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json @@ -515,7 +515,7 @@ }, "yaxes": [ { - "format": "short", + "format": "reqps", "label": null, "logBase": 1, "max": null, diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json index e2791f097a2..6b59619b32e 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json @@ -515,7 +515,7 @@ }, "yaxes": [ { - "format": "short", + "format": "reqps", "label": null, "logBase": 1, "max": null, diff --git a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index 4c99d39e032..742d13b685c 100644 --- a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -179,6 +179,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; super.qpsPanel(selector, statusLabelName) + { yaxes: $.yaxes('reqps') }, + qpsPanelNativeHistogram(selector, statusLabelName='status_code'):: + super.qpsPanelNativeHistogram(selector, statusLabelName) + + { yaxes: $.yaxes('reqps') }, + // hiddenLegendQueryPanel adds on to 'timeseriesPanel', not the deprecated 'panel'. // It is a standard query panel designed to handle a large number of series. 
it hides the legend, doesn't fill the series and // shows all values on tooltip, descending. Also turns on exemplars, unless 4th parameter is false. From 0289cb2263eca5cd860bbb3eb8a143d38c93520a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 18 Jan 2024 12:08:40 +0100 Subject: [PATCH 05/14] Update jsonnet-lib with fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../dashboards/mimir-writes.json | 8 ++++---- .../mimir-mixin-compiled/dashboards/mimir-writes.json | 8 ++++---- operations/mimir-mixin/jsonnetfile.json | 4 ++-- operations/mimir-mixin/jsonnetfile.lock.json | 8 ++++---- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json index ae41503f87c..f17a3ca78b4 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json @@ -490,7 +490,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", 
job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -564,19 +564,19 @@ "steppedLine": false, "targets": [ { - "expr": "(histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or\n histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) by (le))) * 1e3\n", + "expr": "histogram_quantile(0.99, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) * 1e3", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "(histogram_quantile(0.50, 
sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or\n histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) by (le))) * 1e3\n", + "expr": "histogram_quantile(0.50, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.50, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) * 1e3", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(\n histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) * 1e3 /\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or 
rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", + "expr": "sum(\n (histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) * 1e3 /\n (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", "format": "time_series", "legendFormat": "Average", "refId": "C" diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json index 8de86d5c743..dc7fb977203 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json @@ -490,7 +490,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", 
route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -564,19 +564,19 @@ "steppedLine": false, "targets": [ { - "expr": "(histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or\n histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) by (le))) * 1e3\n", + "expr": "histogram_quantile(0.99, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", 
job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) * 1e3", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "(histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or\n histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) by (le))) * 1e3\n", + "expr": "histogram_quantile(0.50, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.50, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) * 1e3", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(\n histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", 
route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) * 1e3 /\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", + "expr": "sum(\n (histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) * 1e3 /\n (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", "format": "time_series", "legendFormat": "Average", "refId": "C" diff --git 
a/operations/mimir-mixin/jsonnetfile.json b/operations/mimir-mixin/jsonnetfile.json index 93f7869d434..8e07c2b6556 100644 --- a/operations/mimir-mixin/jsonnetfile.json +++ b/operations/mimir-mixin/jsonnetfile.json @@ -8,7 +8,7 @@ "subdir": "grafana-builder" } }, - "version": "9042b90e2c9151287effcddce739b2234c99b382" + "version": "190a5aed678d545a309853bdb37387918551d7f3" }, { "source": { @@ -17,7 +17,7 @@ "subdir": "mixin-utils" } }, - "version": "master" + "version": "190a5aed678d545a309853bdb37387918551d7f3" } ], "legacyImports": true diff --git a/operations/mimir-mixin/jsonnetfile.lock.json b/operations/mimir-mixin/jsonnetfile.lock.json index 004c81bcaec..24aa065ab61 100644 --- a/operations/mimir-mixin/jsonnetfile.lock.json +++ b/operations/mimir-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "9042b90e2c9151287effcddce739b2234c99b382", - "sum": "U0bOj0L36filr/yYqM0oaftUJFVCLPDUWQNO7SOIbXU=" + "version": "190a5aed678d545a309853bdb37387918551d7f3", + "sum": "YGEdY6q9zdOCMF2d2u0AaBMEmrTnkSnmff0gXRgjg58=" }, { "source": { @@ -18,8 +18,8 @@ "subdir": "mixin-utils" } }, - "version": "3d58bd591c278f3f342bc1e25399806c49ace104", - "sum": "vyT1akj0RbnIeb0L3cJ/HzLiOEm5lskwl/Xr34eHOZQ=" + "version": "190a5aed678d545a309853bdb37387918551d7f3", + "sum": "pIe5VZP3oZLK+yK8VdXzRlyUvXtOOaKyOhbb4bjrX4I=" } ], "legacyImports": false From d448d7d161e83e2e54022b243f2b8a3ddcd9afab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 18 Jan 2024 12:30:15 +0100 Subject: [PATCH 06/14] Update jsonnet-lib with fixes, tweaks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../dashboards/mimir-writes.json | 6 +++--- .../mimir-mixin-compiled/dashboards/mimir-writes.json | 6 +++--- operations/mimir-mixin/jsonnetfile.json | 4 ++-- operations/mimir-mixin/jsonnetfile.lock.json | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git 
a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json index f17a3ca78b4..4aeff8584ce 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json @@ -564,19 +564,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) * 1e3", + "expr": "(histogram_quantile(0.99, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.50, sum by (le) 
(rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) * 1e3", + "expr": "(histogram_quantile(0.50, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.50, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(\n (histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) * 1e3 /\n (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", + "expr": "1e3 * 
sum(histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) /\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))\n", "format": "time_series", "legendFormat": "Average", "refId": "C" diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json index dc7fb977203..46de705399a 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json @@ -564,19 +564,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", 
route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) * 1e3", + "expr": "(histogram_quantile(0.99, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.50, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) * 1e3", + "expr": "(histogram_quantile(0.50, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.50, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - 
"expr": "sum(\n (histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) * 1e3 /\n (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", + "expr": "1e3 * sum(histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) /\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", 
route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))\n", "format": "time_series", "legendFormat": "Average", "refId": "C" diff --git a/operations/mimir-mixin/jsonnetfile.json b/operations/mimir-mixin/jsonnetfile.json index 8e07c2b6556..db805dd084f 100644 --- a/operations/mimir-mixin/jsonnetfile.json +++ b/operations/mimir-mixin/jsonnetfile.json @@ -8,7 +8,7 @@ "subdir": "grafana-builder" } }, - "version": "190a5aed678d545a309853bdb37387918551d7f3" + "version": "e5a8012ea28654aeacb3929ac703fe7a7c61c212" }, { "source": { @@ -17,7 +17,7 @@ "subdir": "mixin-utils" } }, - "version": "190a5aed678d545a309853bdb37387918551d7f3" + "version": "e5a8012ea28654aeacb3929ac703fe7a7c61c212" } ], "legacyImports": true diff --git a/operations/mimir-mixin/jsonnetfile.lock.json b/operations/mimir-mixin/jsonnetfile.lock.json index 24aa065ab61..004e08a7a5e 100644 --- a/operations/mimir-mixin/jsonnetfile.lock.json +++ b/operations/mimir-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "190a5aed678d545a309853bdb37387918551d7f3", - "sum": "YGEdY6q9zdOCMF2d2u0AaBMEmrTnkSnmff0gXRgjg58=" + "version": "e5a8012ea28654aeacb3929ac703fe7a7c61c212", + "sum": "VAiHmfmqHKqeav+P7PAf8YVKmPGb5Q7fViQ1uLd1Xz8=" }, { "source": { @@ -18,7 +18,7 @@ "subdir": "mixin-utils" } }, - "version": "190a5aed678d545a309853bdb37387918551d7f3", + "version": "e5a8012ea28654aeacb3929ac703fe7a7c61c212", "sum": "pIe5VZP3oZLK+yK8VdXzRlyUvXtOOaKyOhbb4bjrX4I=" } ], From 3116458129bd44ba988ff2fcad33015c227aa976 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 18 Jan 2024 12:47:41 +0100 Subject: [PATCH 07/14] Update jsonnet-lib again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../dashboards/mimir-writes.json | 8 ++++---- .../mimir-mixin-compiled/dashboards/mimir-writes.json | 8 ++++---- 
operations/mimir-mixin/jsonnetfile.json | 4 ++-- operations/mimir-mixin/jsonnetfile.lock.json | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json index 4aeff8584ce..e94dc7742c6 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json @@ -490,7 +490,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ 
-566,17 +566,17 @@ { "expr": "(histogram_quantile(0.99, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3", "format": "time_series", - "legendFormat": "99th Percentile", + "legendFormat": "99th percentile", "refId": "A" }, { "expr": "(histogram_quantile(0.50, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.50, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3", "format": "time_series", - "legendFormat": "50th Percentile", + "legendFormat": "50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) /\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", 
job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))\n", + "expr": "1e3 * sum(histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) /\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", "format": "time_series", "legendFormat": "Average", "refId": "C" diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json index 46de705399a..eb52ad9d10f 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json @@ -490,7 +490,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n 
label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -566,17 +566,17 @@ { "expr": "(histogram_quantile(0.99, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3", 
"format": "time_series", - "legendFormat": "99th Percentile", + "legendFormat": "99th percentile", "refId": "A" }, { "expr": "(histogram_quantile(0.50, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.50, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3", "format": "time_series", - "legendFormat": "50th Percentile", + "legendFormat": "50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) /\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))\n", + "expr": "1e3 * sum(histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", 
route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) /\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", "format": "time_series", "legendFormat": "Average", "refId": "C" diff --git a/operations/mimir-mixin/jsonnetfile.json b/operations/mimir-mixin/jsonnetfile.json index db805dd084f..22a16908f13 100644 --- a/operations/mimir-mixin/jsonnetfile.json +++ b/operations/mimir-mixin/jsonnetfile.json @@ -8,7 +8,7 @@ "subdir": "grafana-builder" } }, - "version": "e5a8012ea28654aeacb3929ac703fe7a7c61c212" + "version": "256e4986fb94044b12dfb8f6a1ecfb82d0d2ed57" }, { "source": { @@ -17,7 +17,7 @@ "subdir": "mixin-utils" } }, - "version": "e5a8012ea28654aeacb3929ac703fe7a7c61c212" + "version": "256e4986fb94044b12dfb8f6a1ecfb82d0d2ed57" } ], "legacyImports": true diff --git a/operations/mimir-mixin/jsonnetfile.lock.json b/operations/mimir-mixin/jsonnetfile.lock.json index 004e08a7a5e..40f546709ea 100644 --- a/operations/mimir-mixin/jsonnetfile.lock.json +++ b/operations/mimir-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "e5a8012ea28654aeacb3929ac703fe7a7c61c212", - "sum": "VAiHmfmqHKqeav+P7PAf8YVKmPGb5Q7fViQ1uLd1Xz8=" + "version": "256e4986fb94044b12dfb8f6a1ecfb82d0d2ed57", + "sum": 
"LuE0qvWz5gOxRTtPKm7lrjRlZZRemTCQtGykVkL/Dls=" }, { "source": { @@ -18,7 +18,7 @@ "subdir": "mixin-utils" } }, - "version": "e5a8012ea28654aeacb3929ac703fe7a7c61c212", + "version": "256e4986fb94044b12dfb8f6a1ecfb82d0d2ed57", "sum": "pIe5VZP3oZLK+yK8VdXzRlyUvXtOOaKyOhbb4bjrX4I=" } ], From 418e679cfff8a28b7b545521f6068d1a47d36f72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 18 Jan 2024 13:42:14 +0100 Subject: [PATCH 08/14] Make per pod latency use utility function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../dashboards/mimir-writes.json | 2 +- operations/mimir-mixin-compiled/dashboards/mimir-writes.json | 2 +- operations/mimir-mixin/dashboards/writes.libsonnet | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json index e94dc7742c6..43abf346b91 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json @@ -658,7 +658,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, instance) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (instance) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le,instance) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", 
job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))", "format": "time_series", "legendFormat": "", "legendLink": null diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json index eb52ad9d10f..ed7653cded9 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json @@ -658,7 +658,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (pod) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le,pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))", "format": "time_series", "legendFormat": "", "legendLink": null diff --git a/operations/mimir-mixin/dashboards/writes.libsonnet b/operations/mimir-mixin/dashboards/writes.libsonnet index 61d84c54b97..7bcbb3e2e7e 100644 --- a/operations/mimir-mixin/dashboards/writes.libsonnet +++ b/operations/mimir-mixin/dashboards/writes.libsonnet @@ -116,7 +116,7 @@ local filename = 'mimir-writes.json'; .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway), $.queries.write_http_routes_regex], '' + utils.nativeClassicHistogramQuantile('0.99', $.queries.gateway.writeRequestsPerSecondMetric, $.queries.gateway.writeRequestsPerSecondSelector, [$._config.per_instance_label]), '' ) ) ) @@ -157,7 +157,7 @@ local filename = 'mimir-writes.json'; .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor), $.queries.write_http_routes_regex], '' + utils.nativeClassicHistogramQuantile('0.99', $.queries.distributor.writeRequestsPerSecondMetric, $.queries.distributor.writeRequestsPerSecondSelector, [$._config.per_instance_label]), '' ) ) ) From ef0ebe21f8e652c9b4b44f9c2fd97b244d8570fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 18 Jan 2024 13:50:27 +0100 Subject: [PATCH 09/14] Remove unused metric selector, update overview panel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../dashboards/mimir-overview.json | 2 +- .../dashboards/mimir-overview.json | 2 +- .../mimir-mixin/dashboards/dashboard-queries.libsonnet | 4 ++-- operations/mimir-mixin/dashboards/overview.libsonnet | 10 +++++++--- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json index 21dcae403b4..94d292946d3 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json +++ 
b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json @@ -215,7 +215,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json index 21dcae403b4..94d292946d3 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json @@ -215,7 +215,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", 
\"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index d9a5a69d3dc..21e1a11a2d2 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -25,7 +25,7 @@ query_http_routes_regex: '(prometheus|api_prom)_api_v1_query(_range)?', gateway: { - writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}' % variables, + //writeRequestsPerSecond: removed, use combination of writeRequestsPerSecondMetric and writeRequestsPerSecondSelector instead readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, writeRequestsPerSecondMetric: 'cortex_request_duration_seconds', @@ -59,7 +59,7 @@ }, distributor: { - writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}' % variables, + //writeRequestsPerSecond: removed, use combination of 
writeRequestsPerSecondMetric and writeRequestsPerSecondSelector instead writeRequestsPerSecondMetric: 'cortex_request_duration_seconds', writeRequestsPerSecondSelector: '{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}' % variables, samplesPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_samples:rate5m{%(distributorMatcher)s})' % variables, diff --git a/operations/mimir-mixin/dashboards/overview.libsonnet b/operations/mimir-mixin/dashboards/overview.libsonnet index aa44e0cdf0a..34b6209898b 100644 --- a/operations/mimir-mixin/dashboards/overview.libsonnet +++ b/operations/mimir-mixin/dashboards/overview.libsonnet @@ -114,11 +114,15 @@ local filename = 'mimir-overview.json'; ) .addPanel( $.panel(std.stripChars('Write requests / sec %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' ')) + - $.qpsPanel( + $.qpsPanelNativeHistogram( + if $._config.gateway_enabled then + $.queries.gateway.writeRequestsPerSecondMetric + else + $.queries.distributor.writeRequestsPerSecondMetric, if $._config.gateway_enabled then - $.queries.gateway.writeRequestsPerSecond + $.queries.gateway.writeRequestsPerSecondSelector else - $.queries.distributor.writeRequestsPerSecond + $.queries.distributor.writeRequestsPerSecondSelector ) ) .addPanel( From 1b40065d6daa5847a3afc7285cc90437a0cc543f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 18 Jan 2024 14:22:17 +0100 Subject: [PATCH 10/14] Rewrite failure rate queries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../dashboards/mimir-overview.json | 2 +- .../dashboards/mimir-overview.json | 2 +- .../dashboards/dashboard-queries.libsonnet | 39 +++++++++++++------ operations/mimir-mixin/jsonnetfile.json | 4 +- operations/mimir-mixin/jsonnetfile.lock.json | 6 +-- 5 files changed, 34 insertions(+), 19 deletions(-) diff --git
a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json index 94d292946d3..6d647604ced 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json @@ -81,7 +81,7 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", 
route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", "instant": false, "legendFormat": "Writes", "range": true diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json index 94d292946d3..6d647604ced 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json @@ -81,7 +81,7 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", 
route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", "instant": false, "legendFormat": "Writes", "range": true diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index 21e1a11a2d2..2fea653d923 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -1,3 +1,5 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + { // This object contains common queries used in the Mimir dashboards. 
// These queries are NOT intended to be configurable or overriddeable via jsonnet, @@ -25,43 +27,53 @@ query_http_routes_regex: '(prometheus|api_prom)_api_v1_query(_range)?', gateway: { + local p = self, //writeRequestsPerSecond: removed, use combination of writeRequestsPerSecondMetric and writeRequestsPerSecondSelector instead readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, writeRequestsPerSecondMetric: 'cortex_request_duration_seconds', - writeRequestsPerSecondSelector: '{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}' % variables, + writeRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"' % variables, readRequestsPerSecondMetric: 'cortex_request_duration_seconds', - readRequestsPerSecondSelector: '{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, + readRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, // Write failures rate as percentage of total requests. writeFailuresRate: ||| ( - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval])) + # gRPC errors are not tracked as 5xx but "error". + sum(%(countFailQuery)s) or # Handle the case no failure has been tracked yet. vector(0) ) / - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + sum(%(countQuery)s) + ||| % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"'), + countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector), + }, // Read failures rate as percentage of total requests. 
readFailuresRate: ||| ( - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval])) + # gRPC errors are not tracked as 5xx but "error". + sum(%(countFailQuery)s) or # Handle the case no failure has been tracked yet. vector(0) ) / - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + sum(%(countQuery)s) + ||| % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"'), + countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector), + }, }, distributor: { + local p = self, //writeRequestsPerSecond: removed, use combination of writeRequestsPerSecondMetric and writeRequestsPerSecondSelector instead writeRequestsPerSecondMetric: 'cortex_request_duration_seconds', - writeRequestsPerSecondSelector: '{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}' % variables, + writeRequestsPerSecondSelector: '%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"' % variables, samplesPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_samples:rate5m{%(distributorMatcher)s})' % variables, exemplarsPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_exemplars:rate5m{%(distributorMatcher)s})' % variables, @@ -69,14 +81,17 @@ writeFailuresRate: ||| ( # gRPC errors are not tracked as 5xx but "error". - sum(rate(cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s",status_code=~"5.*|error"}[$__rate_interval])) + sum(%(countFailQuery)s) or # Handle the case no failure has been tracked yet. 
vector(0) ) / - sum(rate(cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + sum(%(countQuery)s) + ||| % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"'), + countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector), + }, }, query_frontend: { diff --git a/operations/mimir-mixin/jsonnetfile.json b/operations/mimir-mixin/jsonnetfile.json index 22a16908f13..790eee25d1b 100644 --- a/operations/mimir-mixin/jsonnetfile.json +++ b/operations/mimir-mixin/jsonnetfile.json @@ -8,7 +8,7 @@ "subdir": "grafana-builder" } }, - "version": "256e4986fb94044b12dfb8f6a1ecfb82d0d2ed57" + "version": "197e35bdd28eb3f96a8c7ddce0edd94bd6dc59c0" }, { "source": { @@ -17,7 +17,7 @@ "subdir": "mixin-utils" } }, - "version": "256e4986fb94044b12dfb8f6a1ecfb82d0d2ed57" + "version": "197e35bdd28eb3f96a8c7ddce0edd94bd6dc59c0" } ], "legacyImports": true diff --git a/operations/mimir-mixin/jsonnetfile.lock.json b/operations/mimir-mixin/jsonnetfile.lock.json index 40f546709ea..88f24dec850 100644 --- a/operations/mimir-mixin/jsonnetfile.lock.json +++ b/operations/mimir-mixin/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "grafana-builder" } }, - "version": "256e4986fb94044b12dfb8f6a1ecfb82d0d2ed57", + "version": "197e35bdd28eb3f96a8c7ddce0edd94bd6dc59c0", "sum": "LuE0qvWz5gOxRTtPKm7lrjRlZZRemTCQtGykVkL/Dls=" }, { @@ -18,8 +18,8 @@ "subdir": "mixin-utils" } }, - "version": "256e4986fb94044b12dfb8f6a1ecfb82d0d2ed57", - "sum": "pIe5VZP3oZLK+yK8VdXzRlyUvXtOOaKyOhbb4bjrX4I=" + "version": "197e35bdd28eb3f96a8c7ddce0edd94bd6dc59c0", + "sum": "pI+bGWLbOjxVd+i943ECFtqJVQ9lB3/np9tO7h93q3E=" } ], "legacyImports": false From 9b630c966b5beaa31d451c12246b8fef2f8f9437 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Fri, 19 
Jan 2024 15:01:41 +0100 Subject: [PATCH 11/14] Update ingester panels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../dashboards/mimir-writes.json | 10 +++++----- .../mimir-mixin-compiled/dashboards/mimir-writes.json | 10 +++++----- operations/mimir-mixin/dashboards/writes.libsonnet | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json index 43abf346b91..c3db7ae10d9 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json @@ -722,7 +722,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -796,19 +796,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) 
(cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=\"/cortex.Ingester/Push\"})) * 1e3", + "expr": "(histogram_quantile(0.99, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])))) * 1e3", "format": "time_series", "legendFormat": "99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=\"/cortex.Ingester/Push\"})) * 1e3", + "expr": "(histogram_quantile(0.50, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]))) or histogram_quantile(0.50, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])))) * 1e3", "format": "time_series", "legendFormat": "50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=\"/cortex.Ingester/Push\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=\"/cortex.Ingester/Push\"})", + "expr": "1e3 * 
sum(histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])) /\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]))\n", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -890,7 +890,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, instance) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=\"/cortex.Ingester/Push\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (instance) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le,instance) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])))", "format": "time_series", "legendFormat": "", "legendLink": null diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json index ed7653cded9..c8abaa886ad 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json @@ -722,7 +722,7 @@ 
"steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -796,19 +796,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=\"/cortex.Ingester/Push\"})) * 1e3", + "expr": "(histogram_quantile(0.99, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])))) * 1e3", "format": "time_series", "legendFormat": "99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le) 
(cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=\"/cortex.Ingester/Push\"})) * 1e3", + "expr": "(histogram_quantile(0.50, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]))) or histogram_quantile(0.50, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])))) * 1e3", "format": "time_series", "legendFormat": "50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=\"/cortex.Ingester/Push\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=\"/cortex.Ingester/Push\"})", + "expr": "1e3 * sum(histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])) /\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]))\n", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -890,7 +890,7 
@@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum by(le, pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", route=\"/cortex.Ingester/Push\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by (pod) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le,pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])))", "format": "time_series", "legendFormat": "", "legendLink": null diff --git a/operations/mimir-mixin/dashboards/writes.libsonnet b/operations/mimir-mixin/dashboards/writes.libsonnet index 7bcbb3e2e7e..c8de4fcc8ea 100644 --- a/operations/mimir-mixin/dashboards/writes.libsonnet +++ b/operations/mimir-mixin/dashboards/writes.libsonnet @@ -99,7 +99,7 @@ local filename = 'mimir-writes.json'; .addPanelIf( $._config.gateway_enabled, $.panel('Requests / sec') + - $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"%s"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.gateway), $.queries.write_http_routes_regex], format='reqps') + $.statPanel('sum(%s)' % utils.nativeClassicHistogramCountRate($.queries.gateway.writeRequestsPerSecondMetric, $.queries.gateway.writeRequestsPerSecondSelector), format='reqps') ) ) .addRowIf( @@ -175,7 +175,7 @@ local filename = 'mimir-writes.json'; When ingester is not configured to use "early" request rejection, then rejected requests are also counted as "errors". 
||| ) + - $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) + + $.qpsPanelNativeHistogram('cortex_request_duration_seconds', '%s,route="/cortex.Ingester/Push"' % $.jobMatcher($._config.job_names.ingester)) + if $._config.show_rejected_requests_on_writes_dashboard then { targets: [ { @@ -194,12 +194,12 @@ local filename = 'mimir-writes.json'; ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')]) + $.latencyPanelNativeHistogram('cortex_request_duration_seconds', '%s,route="/cortex.Ingester/Push"' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' + utils.nativeClassicHistogramQuantile('0.99', 'cortex_request_duration_seconds', '%s,route="/cortex.Ingester/Push"' % $.jobMatcher($._config.job_names.ingester), [$._config.per_instance_label]), '' ) ) ) From 31607194f8c307e1c2e9eb23d9a96798900e046a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Tue, 6 Feb 2024 15:14:43 +0100 Subject: [PATCH 12/14] wip:wip --- .../dashboards/mimir-overview.json | 271 ++- .../dashboards/mimir-writes.json | 1556 +---------------- .../dashboards/mimir-overview.json | 271 ++- .../dashboards/mimir-writes.json | 1556 +---------------- .../dashboards/dashboard-queries.libsonnet | 12 +- .../dashboards/dashboard-utils.libsonnet | 4 +- .../mimir-mixin/dashboards/overview.libsonnet | 3 +- .../mimir-mixin/dashboards/writes.libsonnet | 294 ++-- operations/mimir-mixin/jsonnetfile.json | 16 +- 
operations/mimir-mixin/jsonnetfile.lock.json | 16 +- .../lib/grafana-builder/grafana.libsonnet | 683 ++++++++ .../lib/mixin-utils/utils.libsonnet | 229 +++ 12 files changed, 1547 insertions(+), 3364 deletions(-) create mode 100644 operations/mimir-mixin/lib/grafana-builder/grafana.libsonnet create mode 100644 operations/mimir-mixin/lib/mixin-utils/utils.libsonnet diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json index 6d647604ced..6099b8578ff 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json @@ -81,7 +81,7 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", + 
"expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))\n", "instant": false, "legendFormat": "Writes", "range": true @@ -174,70 +174,212 @@ "type": "text" }, { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "OK": "#7EB26D", - "cancel": "#A9A9A9", - "error": "#E24D42", - "success": "#7EB26D" - }, - "bars": false, - "dashLength": 10, - "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?1xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?2xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?3xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] 
+ }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?4xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?5xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?OK" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?cancel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#A9A9A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?error" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "A_classic" + }, + "properties": [ + { + "id": "custom.stacking", + "value": { + "group": "B", + "mode": "normal" + } + } + ] + } + ] + }, "fill": 10, "id": 5, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, "linewidth": 0, "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, "span": 3, "stack": true, - "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n 
label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($show_classic_histograms * +Inf)\n", + "format": "time_series", + "legendFormat": "historic_{{status}}", + "refId": "A_classic" } ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, "title": "Write requests / sec", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - 
"values": [ ] - }, + "type": "timeseries", "yaxes": [ { "format": "reqps", @@ -1404,6 +1546,35 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": true, + "text": "show", + "value": "1" + }, + "description": "When setting this option to 1, panels will query and show deprecated low precision histogram metrics.", + "hide": 0, + "includeAll": false, + "label": "Show historic data", + "multi": false, + "name": "show_classic_histograms", + "options": [ + { + "selected": false, + "text": "hide", + "value": "0" + }, + { + "selected": true, + "text": "show", + "value": "1" + } + ], + "query": "hide : 0,show : 1", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json index c3db7ae10d9..51c9d072d4c 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json @@ -443,1538 +443,6 @@ "title": "Headlines", "titleSize": "h6" }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "OK": "#7EB26D", - "cancel": "#A9A9A9", - "error": "#E24D42", - "success": "#7EB26D" - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "### Requests / sec\nThe rate of successful, failed and rejected requests to distributor.\nRejected requests are requests that distributor fails to handle because of distributor instance limits.\nWhen distributor is configured to use \"early\" request rejection, then rejected requests are NOT included in other metrics.\nWhen distributor is not configured to use \"early\" request rejection, then rejected requests are also counted as \"errors\".\n\n", - "fill": 10, - "id": 7, - 
"legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 4, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", - "format": "time_series", - "legendFormat": "{{status}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Requests / sec", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "reqps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - 
"lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "(histogram_quantile(0.99, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3", - "format": "time_series", - "legendFormat": "99th percentile", - "refId": "A" - }, - { - "expr": "(histogram_quantile(0.50, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.50, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3", - "format": "time_series", - "legendFormat": "50th percentile", - "refId": "B" - }, - { - "expr": "1e3 * sum(histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", 
job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) /\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", - "format": "time_series", - "legendFormat": "Average", - "refId": "C" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Latency", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "custom": { - "drawStyle": "line", - "fillOpacity": 0, - "lineWidth": 1, - "pointSize": 5, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - } - }, - "thresholds": { - "mode": "absolute", - "steps": [ ] - }, - "unit": "s" - }, - "overrides": [ ] - }, - "id": 9, - "links": [ ], - "options": { - "legend": { - "displayMode": "hidden", - "showLegend": false - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "span": 4, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum by (instance) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", 
job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le,instance) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))", - "format": "time_series", - "legendFormat": "", - "legendLink": null - } - ], - "title": "Per instance p99 latency", - "type": "timeseries" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Distributor", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "OK": "#7EB26D", - "cancel": "#A9A9A9", - "error": "#E24D42", - "success": "#7EB26D" - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "### Requests / sec\nThe rate of successful, failed and rejected requests to ingester.\nRejected requests are requests that ingester fails to handle because of ingester instance limits (ingester-max-inflight-push-requests, ingester-max-inflight-push-requests-bytes, ingester-max-ingestion-rate).\nWhen ingester is configured to use \"early\" request rejection, then rejected requests are NOT included in other metrics.\nWhen ingester is not configured to use \"early\" request rejection, then rejected requests are also counted as \"errors\".\n\n", - "fill": 10, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": 
false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 4, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", - "format": "time_series", - "legendFormat": "{{status}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Requests / sec", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "reqps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 11, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "(histogram_quantile(0.99, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", 
job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])))) * 1e3", - "format": "time_series", - "legendFormat": "99th percentile", - "refId": "A" - }, - { - "expr": "(histogram_quantile(0.50, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]))) or histogram_quantile(0.50, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])))) * 1e3", - "format": "time_series", - "legendFormat": "50th percentile", - "refId": "B" - }, - { - "expr": "1e3 * sum(histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])) /\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]))\n", - "format": "time_series", - "legendFormat": "Average", - "refId": "C" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Latency", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": 
"individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "custom": { - "drawStyle": "line", - "fillOpacity": 0, - "lineWidth": 1, - "pointSize": 5, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - } - }, - "thresholds": { - "mode": "absolute", - "steps": [ ] - }, - "unit": "s" - }, - "overrides": [ ] - }, - "id": 12, - "links": [ ], - "options": { - "legend": { - "displayMode": "hidden", - "showLegend": false - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "span": 4, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum by (instance) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le,instance) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])))", - "format": "time_series", - "legendFormat": "", - "legendLink": null - } - ], - "title": "Per instance p99 latency", - "type": "timeseries" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Ingester", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "### Replicas\nThe maximum and current number of distributor replicas.\nNote: The 
current number of replicas can still show 1 replica even when scaled to 0.\nBecause HPA never reports 0 replicas, the query will report 0 only if the HPA is not active.\n\n", - "fill": 1, - "id": 13, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/Max .+/", - "dashes": true, - "fill": 0 - }, - { - "alias": "/Current .+/", - "fill": 0 - }, - { - "alias": "/Min .+/", - "dashes": true, - "fill": 0 - } - ], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max by (scaletargetref_name) (\n kube_horizontalpodautoscaler_spec_max_replicas{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}\n # Add the scaletargetref_name label for readability\n + on (cluster, namespace, horizontalpodautoscaler) group_left (scaletargetref_name)\n 0*kube_horizontalpodautoscaler_info{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}\n)\n", - "format": "time_series", - "legendFormat": "Max {{ scaletargetref_name }}", - "legendLink": null - }, - { - "expr": "max by (scaletargetref_name) (\n kube_horizontalpodautoscaler_status_current_replicas{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}\n # HPA doesn't go to 0 replicas, so we multiply by 0 if the HPA is not active\n * on (cluster, namespace, horizontalpodautoscaler)\n kube_horizontalpodautoscaler_status_condition{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\", condition=\"ScalingActive\", status=\"true\"}\n # Add the scaletargetref_name label for readability\n + on (cluster, namespace, 
horizontalpodautoscaler) group_left (scaletargetref_name)\n 0*kube_horizontalpodautoscaler_info{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}\n)\n", - "format": "time_series", - "legendFormat": "Current {{ scaletargetref_name }}", - "legendLink": null - }, - { - "expr": "max by (scaletargetref_name) (\n kube_horizontalpodautoscaler_spec_min_replicas{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}\n # Add the scaletargetref_name label for readability\n + on (cluster, namespace, horizontalpodautoscaler) group_left (scaletargetref_name)\n 0*kube_horizontalpodautoscaler_info{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}\n)\n", - "format": "time_series", - "legendFormat": "Min {{ scaletargetref_name }}", - "legendLink": null - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Replicas", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "### Scaling metric (CPU): Desired replicas\nThis panel shows the scaling metric exposed by KEDA divided by the target/threshold used.\nIt should represent the desired number of replicas, ignoring the min/max constraints applied later.\n\n", - "fill": 1, - "id": 14, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - 
"links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", - "format": "time_series", - "legendFormat": "{{ scaler }}", - "legendLink": null - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Scaling metric (CPU): Desired replicas", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "### Scaling metric (memory): Desired replicas\nThis panel shows the scaling metric exposed by KEDA divided by the target/threshold used.\nIt should represent the desired number of replicas, ignoring the min/max constraints applied later.\n\n", - "fill": 1, - "id": 15, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - 
"total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", - "format": "time_series", - "legendFormat": "{{ scaler }}", - "legendLink": null - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Scaling metric (memory): Desired replicas", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. 
Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly.\n\n", - "fill": 1, - "id": 16, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", - "format": "time_series", - "legendFormat": "{{scaler}} failures", - "legendLink": null - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Autoscaler failures rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Distributor - autoscaling", - "titleSize": "h6" - }, - { - 
"collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "OK": "#7EB26D", - "cancel": "#A9A9A9", - "error": "#E24D42", - "success": "#7EB26D" - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 17, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_kv_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-hatracker\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", - "format": "time_series", - "legendFormat": "{{status}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Requests / sec", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "reqps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 18, - "legend": { - "avg": false, - "current": false, - "max": 
false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-hatracker\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-hatracker\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" - }, - { - "expr": "sum(rate(cortex_kv_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-hatracker\"}[$__rate_interval])) * 1e3 / sum(rate(cortex_kv_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-hatracker\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "Average", - "refId": "C" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Latency", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 
1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Distributor - key-value store for high-availability (HA) deduplication", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "OK": "#7EB26D", - "cancel": "#A9A9A9", - "error": "#E24D42", - "success": "#7EB26D" - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 19, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_kv_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-(lifecycler|ring)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", - "format": "time_series", - "legendFormat": "{{status}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Requests / sec", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "reqps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": 
null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 20, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-(lifecycler|ring)\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-(lifecycler|ring)\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" - }, - { - "expr": "sum(rate(cortex_kv_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-(lifecycler|ring)\"}[$__rate_interval])) * 1e3 / sum(rate(cortex_kv_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-(lifecycler|ring)\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "Average", - "refId": "C" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Latency", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": 
"individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Distributor - key-value store for distributors ring", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "OK": "#7EB26D", - "cancel": "#A9A9A9", - "error": "#E24D42", - "success": "#7EB26D" - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 21, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_kv_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", kv_name=~\"ingester-.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", - "format": "time_series", - "legendFormat": "{{status}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Requests / sec", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": 
"graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "reqps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 22, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", kv_name=~\"ingester-.*\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", kv_name=~\"ingester-.*\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" - }, - { - "expr": "sum(rate(cortex_kv_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", kv_name=~\"ingester-.*\"}[$__rate_interval])) * 1e3 / sum(rate(cortex_kv_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", kv_name=~\"ingester-.*\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": 
"Average", - "refId": "C" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Latency", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Ingester - key-value store for the ingesters ring", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - "failed": "#E24D42", - "successful": "#7EB26D" - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "### Uploaded blocks / sec\nThe rate of blocks being uploaded from the ingesters\nto object storage.\n\n", - "fill": 10, - "id": 23, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(cortex_ingester_shipper_uploads_total{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\"}[$__rate_interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "successful", - "legendLink": null - }, - { - "expr": 
"sum(rate(cortex_ingester_shipper_upload_failures_total{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "failed", - "legendLink": null - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Uploaded blocks / sec", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "### Upload latency\nThe average, median (50th percentile), and 99th percentile time\nthe ingesters take to upload blocks to object storage.\n\n", - "fill": 1, - "id": 24, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",component=\"ingester\",operation=\"upload\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, 
sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",component=\"ingester\",operation=\"upload\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" - }, - { - "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",component=\"ingester\",operation=\"upload\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",component=\"ingester\",operation=\"upload\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "Average", - "refId": "C" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Upload latency", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Ingester - shipper", - "titleSize": "h6" - }, { "collapse": false, "height": "250px", @@ -1990,7 +458,7 @@ "datasource": "$datasource", "description": "### Compactions per second\nIngesters maintain a local TSDB per-tenant on disk. 
Each TSDB maintains a head block for each\nactive time series; these blocks get periodically compacted (by default, every 2h).\nThis panel shows the rate of compaction operations across all TSDBs on all ingesters.\n\n", "fill": 10, - "id": 25, + "id": 7, "legend": { "avg": false, "current": false, @@ -2071,7 +539,7 @@ "datasource": "$datasource", "description": "### Compaction latency\nThe average, median (50th percentile), and 99th percentile time ingesters take to compact TSDB head blocks\non the local filesystem.\n\n", "fill": 1, - "id": 26, + "id": 8, "legend": { "avg": false, "current": false, @@ -2173,7 +641,7 @@ "datasource": "$datasource", "description": "### WAL truncations per second\nThe WAL is truncated each time a new TSDB block is written. This panel measures the rate of\ntruncations.\n\n", "fill": 10, - "id": 27, + "id": 9, "legend": { "avg": false, "current": false, @@ -2257,7 +725,7 @@ "datasource": "$datasource", "description": "### Checkpoints created per second\nCheckpoints are created as part of the WAL truncation process.\nThis metric measures the rate of checkpoint creation.\n\n", "fill": 10, - "id": 28, + "id": 10, "legend": { "avg": false, "current": false, @@ -2339,7 +807,7 @@ "unit": "s" } }, - "id": 29, + "id": 11, "links": [ ], "options": { "legend": { @@ -2372,7 +840,7 @@ "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 30, + "id": 12, "legend": { "avg": false, "current": false, @@ -2465,7 +933,7 @@ "datasource": "$datasource", "description": "### Distributor exemplars incoming rate\nThe rate of exemplars that have come in to the distributor, including rejected or deduped exemplars.\n\n", "fill": 1, - "id": 31, + "id": 13, "legend": { "avg": false, "current": false, @@ -2540,7 +1008,7 @@ "datasource": "$datasource", "description": "### Distributor exemplars received rate\nThe rate of received exemplars, excluding rejected and deduped exemplars.\nThis number can be sensibly lower than incoming rate because we dedupe the 
HA sent exemplars, and then reject based on time, see `cortex_discarded_exemplars_total` for specific reasons rates.\n\n", "fill": 1, - "id": 32, + "id": 14, "legend": { "avg": false, "current": false, @@ -2615,7 +1083,7 @@ "datasource": "$datasource", "description": "### Ingester ingested exemplars rate\nThe rate of exemplars ingested in the ingesters.\nEvery exemplar is sent to the replication factor number of ingesters, so the sum of rates from all ingesters is divided by the replication factor.\nThis ingested exemplars rate should match the distributor's received exemplars rate.\n\n", "fill": 1, - "id": 33, + "id": 15, "legend": { "avg": false, "current": false, @@ -2690,7 +1158,7 @@ "datasource": "$datasource", "description": "### Ingester appended exemplars rate\nThe rate of exemplars appended in the ingesters.\nThis can be lower than ingested exemplars rate since TSDB does not append the same exemplar twice, and those can be frequent.\n\n", "fill": 1, - "id": 34, + "id": 16, "legend": { "avg": false, "current": false, @@ -2776,7 +1244,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 35, + "id": 17, "legend": { "avg": false, "current": false, @@ -2850,7 +1318,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 36, + "id": 18, "legend": { "avg": false, "current": false, diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json index 6d647604ced..6099b8578ff 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json @@ -81,7 +81,7 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", 
route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))\n", "instant": false, "legendFormat": "Writes", "range": true @@ -174,70 +174,212 @@ "type": "text" }, { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "OK": "#7EB26D", - "cancel": "#A9A9A9", - "error": "#E24D42", - "success": "#7EB26D" - }, - 
"bars": false, - "dashLength": 10, - "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?1xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?2xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?3xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?4xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?5xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?OK" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?cancel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#A9A9A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?error" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": 
"(historic_)?success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "A_classic" + }, + "properties": [ + { + "id": "custom.stacking", + "value": { + "group": "B", + "mode": "normal" + } + } + ] + } + ] + }, "fill": 10, "id": 5, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, "linewidth": 0, "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, "span": 3, "stack": true, - "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", 
\"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($show_classic_histograms * +Inf)\n", + "format": "time_series", + "legendFormat": "historic_{{status}}", + "refId": "A_classic" } ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, "title": "Write requests / sec", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, + "type": "timeseries", "yaxes": [ { "format": "reqps", @@ -1404,6 +1546,35 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": true, + "text": "show", + "value": "1" + }, + "description": "When setting this option to 1, panels will query and show deprecated low precision histogram metrics.", + "hide": 0, + "includeAll": false, + "label": "Show historic data", + "multi": false, + "name": "show_classic_histograms", + "options": [ + { + "selected": false, + "text": "hide", + "value": "0" + }, + { + "selected": true, + "text": "show", + "value": "1" + } + ], + "query": "hide : 0,show : 1", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json index c8abaa886ad..51c9d072d4c 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json @@ -443,1538 +443,6 @@ "title": 
"Headlines", "titleSize": "h6" }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "OK": "#7EB26D", - "cancel": "#A9A9A9", - "error": "#E24D42", - "success": "#7EB26D" - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "### Requests / sec\nThe rate of successful, failed and rejected requests to distributor.\nRejected requests are requests that distributor fails to handle because of distributor instance limits.\nWhen distributor is configured to use \"early\" request rejection, then rejected requests are NOT included in other metrics.\nWhen distributor is not configured to use \"early\" request rejection, then rejected requests are also counted as \"errors\".\n\n", - "fill": 10, - "id": 7, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 4, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", 
\"([a-zA-Z]+)\"))\n", - "format": "time_series", - "legendFormat": "{{status}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Requests / sec", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "reqps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "(histogram_quantile(0.99, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3", - "format": "time_series", - "legendFormat": "99th percentile", - "refId": "A" - }, - { - "expr": "(histogram_quantile(0.50, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", 
job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.50, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3", - "format": "time_series", - "legendFormat": "50th percentile", - "refId": "B" - }, - { - "expr": "1e3 * sum(histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) /\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", - "format": "time_series", - "legendFormat": "Average", - "refId": "C" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Latency", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "ms", - "label": 
null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "custom": { - "drawStyle": "line", - "fillOpacity": 0, - "lineWidth": 1, - "pointSize": 5, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - } - }, - "thresholds": { - "mode": "absolute", - "steps": [ ] - }, - "unit": "s" - }, - "overrides": [ ] - }, - "id": 9, - "links": [ ], - "options": { - "legend": { - "displayMode": "hidden", - "showLegend": false - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "span": 4, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum by (pod) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le,pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))", - "format": "time_series", - "legendFormat": "", - "legendLink": null - } - ], - "title": "Per pod p99 latency", - "type": "timeseries" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Distributor", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "OK": "#7EB26D", - "cancel": "#A9A9A9", - "error": "#E24D42", - "success": "#7EB26D" - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": 
"$datasource", - "description": "### Requests / sec\nThe rate of successful, failed and rejected requests to ingester.\nRejected requests are requests that ingester fails to handle because of ingester instance limits (ingester-max-inflight-push-requests, ingester-max-inflight-push-requests-bytes, ingester-max-ingestion-rate).\nWhen ingester is configured to use \"early\" request rejection, then rejected requests are NOT included in other metrics.\nWhen ingester is not configured to use \"early\" request rejection, then rejected requests are also counted as \"errors\".\n\n", - "fill": 10, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 4, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", - "format": "time_series", - "legendFormat": "{{status}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Requests / sec", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - 
"format": "reqps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 11, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "(histogram_quantile(0.99, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])))) * 1e3", - "format": "time_series", - "legendFormat": "99th percentile", - "refId": "A" - }, - { - "expr": "(histogram_quantile(0.50, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]))) or histogram_quantile(0.50, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])))) * 1e3", - "format": "time_series", - "legendFormat": "50th percentile", - "refId": "B" - }, - { - "expr": "1e3 * sum(histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", 
job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])) /\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])) or rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]))\n", - "format": "time_series", - "legendFormat": "Average", - "refId": "C" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Latency", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "custom": { - "drawStyle": "line", - "fillOpacity": 0, - "lineWidth": 1, - "pointSize": 5, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - } - }, - "thresholds": { - "mode": "absolute", - "steps": [ ] - }, - "unit": "s" - }, - "overrides": [ ] - }, - "id": 12, - "links": [ ], - "options": { - "legend": { - "displayMode": "hidden", - "showLegend": false - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "span": 4, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum by (pod) (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", 
job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le,pod) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",route=\"/cortex.Ingester/Push\"}[$__rate_interval])))", - "format": "time_series", - "legendFormat": "", - "legendLink": null - } - ], - "title": "Per pod p99 latency", - "type": "timeseries" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Ingester", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "### Replicas\nThe maximum and current number of distributor replicas.\nNote: The current number of replicas can still show 1 replica even when scaled to 0.\nBecause HPA never reports 0 replicas, the query will report 0 only if the HPA is not active.\n\n", - "fill": 1, - "id": 13, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/Max .+/", - "dashes": true, - "fill": 0 - }, - { - "alias": "/Current .+/", - "fill": 0 - }, - { - "alias": "/Min .+/", - "dashes": true, - "fill": 0 - } - ], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max by (scaletargetref_name) (\n kube_horizontalpodautoscaler_spec_max_replicas{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}\n # Add the scaletargetref_name label for readability\n + on (cluster, namespace, 
horizontalpodautoscaler) group_left (scaletargetref_name)\n 0*kube_horizontalpodautoscaler_info{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}\n)\n", - "format": "time_series", - "legendFormat": "Max {{ scaletargetref_name }}", - "legendLink": null - }, - { - "expr": "max by (scaletargetref_name) (\n kube_horizontalpodautoscaler_status_current_replicas{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}\n # HPA doesn't go to 0 replicas, so we multiply by 0 if the HPA is not active\n * on (cluster, namespace, horizontalpodautoscaler)\n kube_horizontalpodautoscaler_status_condition{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\", condition=\"ScalingActive\", status=\"true\"}\n # Add the scaletargetref_name label for readability\n + on (cluster, namespace, horizontalpodautoscaler) group_left (scaletargetref_name)\n 0*kube_horizontalpodautoscaler_info{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}\n)\n", - "format": "time_series", - "legendFormat": "Current {{ scaletargetref_name }}", - "legendLink": null - }, - { - "expr": "max by (scaletargetref_name) (\n kube_horizontalpodautoscaler_spec_min_replicas{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}\n # Add the scaletargetref_name label for readability\n + on (cluster, namespace, horizontalpodautoscaler) group_left (scaletargetref_name)\n 0*kube_horizontalpodautoscaler_info{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}\n)\n", - "format": "time_series", - "legendFormat": "Min {{ scaletargetref_name }}", - "legendLink": null - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Replicas", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": 
"graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "### Scaling metric (CPU): Desired replicas\nThis panel shows the scaling metric exposed by KEDA divided by the target/threshold used.\nIt should represent the desired number of replicas, ignoring the min/max constraints applied later.\n\n", - "fill": 1, - "id": 14, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", - "format": "time_series", - "legendFormat": "{{ scaler }}", - "legendLink": null - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Scaling metric (CPU): Desired replicas", - "tooltip": { - 
"shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "### Scaling metric (memory): Desired replicas\nThis panel shows the scaling metric exposed by KEDA divided by the target/threshold used.\nIt should represent the desired number of replicas, ignoring the min/max constraints applied later.\n\n", - "fill": 1, - "id": 15, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", - "format": "time_series", - "legendFormat": "{{ scaler }}", - "legendLink": null - } - ], - "thresholds": [ ], - "timeFrom": null, - 
"timeShift": null, - "title": "Scaling metric (memory): Desired replicas", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly.\n\n", - "fill": 1, - "id": 16, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", - "format": "time_series", - "legendFormat": "{{scaler}} failures", - 
"legendLink": null - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Autoscaler failures rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Distributor - autoscaling", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "OK": "#7EB26D", - "cancel": "#A9A9A9", - "error": "#E24D42", - "success": "#7EB26D" - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 17, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_kv_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-hatracker\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", - "format": "time_series", - "legendFormat": "{{status}}", - "refId": "A" - } - ], 
- "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Requests / sec", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "reqps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 18, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-hatracker\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-hatracker\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" - }, - { - "expr": "sum(rate(cortex_kv_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-hatracker\"}[$__rate_interval])) * 1e3 / 
sum(rate(cortex_kv_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-hatracker\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "Average", - "refId": "C" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Latency", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Distributor - key-value store for high-availability (HA) deduplication", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "OK": "#7EB26D", - "cancel": "#A9A9A9", - "error": "#E24D42", - "success": "#7EB26D" - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 19, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_kv_request_duration_seconds_count{cluster=~\"$cluster\", 
job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-(lifecycler|ring)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", - "format": "time_series", - "legendFormat": "{{status}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Requests / sec", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "reqps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 20, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-(lifecycler|ring)\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", 
kv_name=~\"distributor-(lifecycler|ring)\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" - }, - { - "expr": "sum(rate(cortex_kv_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-(lifecycler|ring)\"}[$__rate_interval])) * 1e3 / sum(rate(cortex_kv_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", kv_name=~\"distributor-(lifecycler|ring)\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "Average", - "refId": "C" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Latency", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Distributor - key-value store for distributors ring", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "OK": "#7EB26D", - "cancel": "#A9A9A9", - "error": "#E24D42", - "success": "#7EB26D" - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 21, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ ], - "nullPointMode": "null as zero", - 
"percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_kv_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", kv_name=~\"ingester-.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", - "format": "time_series", - "legendFormat": "{{status}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Requests / sec", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "reqps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 22, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", kv_name=~\"ingester-.*\"}[$__rate_interval])) by (le)) * 1e3", - "format": 
"time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", kv_name=~\"ingester-.*\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" - }, - { - "expr": "sum(rate(cortex_kv_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", kv_name=~\"ingester-.*\"}[$__rate_interval])) * 1e3 / sum(rate(cortex_kv_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\", kv_name=~\"ingester-.*\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "Average", - "refId": "C" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Latency", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Ingester - key-value store for the ingesters ring", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - "failed": "#E24D42", - "successful": "#7EB26D" - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "### Uploaded blocks / sec\nThe rate of blocks being uploaded from the ingesters\nto object storage.\n\n", - "fill": 10, - "id": 23, - "legend": { - "avg": false, - "current": false, - 
"max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(cortex_ingester_shipper_uploads_total{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\"}[$__rate_interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "successful", - "legendLink": null - }, - { - "expr": "sum(rate(cortex_ingester_shipper_upload_failures_total{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "failed", - "legendLink": null - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Uploaded blocks / sec", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "### Upload latency\nThe average, median (50th percentile), and 99th percentile time\nthe ingesters take to upload blocks to object storage.\n\n", - "fill": 1, - "id": 24, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - 
"total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",component=\"ingester\",operation=\"upload\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",component=\"ingester\",operation=\"upload\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" - }, - { - "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",component=\"ingester\",operation=\"upload\"}[$__rate_interval])) * 1e3 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ingester.*|cortex|mimir|mimir-write.*))\",component=\"ingester\",operation=\"upload\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "Average", - "refId": "C" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Upload latency", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - 
"format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Ingester - shipper", - "titleSize": "h6" - }, { "collapse": false, "height": "250px", @@ -1990,7 +458,7 @@ "datasource": "$datasource", "description": "### Compactions per second\nIngesters maintain a local TSDB per-tenant on disk. Each TSDB maintains a head block for each\nactive time series; these blocks get periodically compacted (by default, every 2h).\nThis panel shows the rate of compaction operations across all TSDBs on all ingesters.\n\n", "fill": 10, - "id": 25, + "id": 7, "legend": { "avg": false, "current": false, @@ -2071,7 +539,7 @@ "datasource": "$datasource", "description": "### Compaction latency\nThe average, median (50th percentile), and 99th percentile time ingesters take to compact TSDB head blocks\non the local filesystem.\n\n", "fill": 1, - "id": 26, + "id": 8, "legend": { "avg": false, "current": false, @@ -2173,7 +641,7 @@ "datasource": "$datasource", "description": "### WAL truncations per second\nThe WAL is truncated each time a new TSDB block is written. 
This panel measures the rate of\ntruncations.\n\n", "fill": 10, - "id": 27, + "id": 9, "legend": { "avg": false, "current": false, @@ -2257,7 +725,7 @@ "datasource": "$datasource", "description": "### Checkpoints created per second\nCheckpoints are created as part of the WAL truncation process.\nThis metric measures the rate of checkpoint creation.\n\n", "fill": 10, - "id": 28, + "id": 10, "legend": { "avg": false, "current": false, @@ -2339,7 +807,7 @@ "unit": "s" } }, - "id": 29, + "id": 11, "links": [ ], "options": { "legend": { @@ -2372,7 +840,7 @@ "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 30, + "id": 12, "legend": { "avg": false, "current": false, @@ -2465,7 +933,7 @@ "datasource": "$datasource", "description": "### Distributor exemplars incoming rate\nThe rate of exemplars that have come in to the distributor, including rejected or deduped exemplars.\n\n", "fill": 1, - "id": 31, + "id": 13, "legend": { "avg": false, "current": false, @@ -2540,7 +1008,7 @@ "datasource": "$datasource", "description": "### Distributor exemplars received rate\nThe rate of received exemplars, excluding rejected and deduped exemplars.\nThis number can be sensibly lower than incoming rate because we dedupe the HA sent exemplars, and then reject based on time, see `cortex_discarded_exemplars_total` for specific reasons rates.\n\n", "fill": 1, - "id": 32, + "id": 14, "legend": { "avg": false, "current": false, @@ -2615,7 +1083,7 @@ "datasource": "$datasource", "description": "### Ingester ingested exemplars rate\nThe rate of exemplars ingested in the ingesters.\nEvery exemplar is sent to the replication factor number of ingesters, so the sum of rates from all ingesters is divided by the replication factor.\nThis ingested exemplars rate should match the distributor's received exemplars rate.\n\n", "fill": 1, - "id": 33, + "id": 15, "legend": { "avg": false, "current": false, @@ -2690,7 +1158,7 @@ "datasource": "$datasource", "description": "### Ingester appended 
exemplars rate\nThe rate of exemplars appended in the ingesters.\nThis can be lower than ingested exemplars rate since TSDB does not append the same exemplar twice, and those can be frequent.\n\n", "fill": 1, - "id": 34, + "id": 16, "legend": { "avg": false, "current": false, @@ -2776,7 +1244,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 35, + "id": 17, "legend": { "avg": false, "current": false, @@ -2850,7 +1318,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 36, + "id": 18, "legend": { "avg": false, "current": false, diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index 2fea653d923..8628b129ebb 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -48,8 +48,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; / sum(%(countQuery)s) ||| % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"'), - countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector), + countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"').native, + countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, }, // Read failures rate as percentage of total requests. 
@@ -64,8 +64,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; / sum(%(countQuery)s) ||| % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"'), - countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector), + countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"').native, + countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).native, }, }, @@ -89,8 +89,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; / sum(%(countQuery)s) ||| % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"'), - countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector), + countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"').native, + countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, }, }, diff --git a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index ecbdee30e11..f6d085567d7 100644 --- a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -179,8 +179,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; super.qpsPanel(selector, statusLabelName) + { yaxes: $.yaxes('reqps') }, - qpsPanelNativeHistogram(selector, statusLabelName='status_code'):: - super.qpsPanelNativeHistogram(selector, statusLabelName) + + // Forwards title, metricName, selector and statusLabelName unchanged to the inherited qpsPanelNativeHistogram and only overrides yaxes with the 'reqps' unit. + qpsPanelNativeHistogram(title, metricName, selector, statusLabelName='status_code'):: +
super.qpsPanelNativeHistogram(title, metricName, selector, statusLabelName) + { yaxes: $.yaxes('reqps') }, // hiddenLegendQueryPanel adds on to 'timeseriesPanel', not the deprecated 'panel'. diff --git a/operations/mimir-mixin/dashboards/overview.libsonnet b/operations/mimir-mixin/dashboards/overview.libsonnet index 34b6209898b..f009ed76693 100644 --- a/operations/mimir-mixin/dashboards/overview.libsonnet +++ b/operations/mimir-mixin/dashboards/overview.libsonnet @@ -32,6 +32,7 @@ local filename = 'mimir-overview.json'; ($.dashboard('Overview') + { uid: std.md5(filename) }) .addClusterSelectorTemplates() + .addShowHistoricDataVariable() .addRow( $.row('%(product)s cluster health' % $._config) @@ -113,8 +114,8 @@ local filename = 'mimir-overview.json'; ||| % helpers), ) .addPanel( - $.panel(std.stripChars('Write requests / sec %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' ')) + $.qpsPanelNativeHistogram( + std.stripChars('Write requests / sec %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' '), if $._config.gateway_enabled then $.queries.gateway.writeRequestsPerSecondMetric else diff --git a/operations/mimir-mixin/dashboards/writes.libsonnet b/operations/mimir-mixin/dashboards/writes.libsonnet index c8de4fcc8ea..d95067b579f 100644 --- a/operations/mimir-mixin/dashboards/writes.libsonnet +++ b/operations/mimir-mixin/dashboards/writes.libsonnet @@ -102,153 +102,153 @@ local filename = 'mimir-writes.json'; $.statPanel('sum(%s)' % utils.nativeClassicHistogramCountRate($.queries.gateway.writeRequestsPerSecondMetric, $.queries.gateway.writeRequestsPerSecondSelector), format='reqps') ) ) - .addRowIf( - $._config.gateway_enabled, - $.row('Gateway') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanelNativeHistogram($.queries.gateway.writeRequestsPerSecondMetric, $.queries.gateway.writeRequestsPerSecondSelector) - ) - .addPanel( - $.panel('Latency') + - $.latencyPanelNativeHistogram($.queries.gateway.writeRequestsPerSecondMetric, $.queries.gateway.writeRequestsPerSecondSelector) -
) - .addPanel( - $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + - $.hiddenLegendQueryPanel( - utils.nativeClassicHistogramQuantile('0.99', $.queries.gateway.writeRequestsPerSecondMetric, $.queries.gateway.writeRequestsPerSecondSelector, [$._config.per_instance_label]), '' - ) - ) - ) - .addRow( - $.row('Distributor') - .addPanel( - $.panel('Requests / sec') + - $.panelDescription( - 'Requests / sec', - ||| - The rate of successful, failed and rejected requests to distributor. - Rejected requests are requests that distributor fails to handle because of distributor instance limits. - When distributor is configured to use "early" request rejection, then rejected requests are NOT included in other metrics. - When distributor is not configured to use "early" request rejection, then rejected requests are also counted as "errors". - ||| - ) + - $.qpsPanelNativeHistogram($.queries.distributor.writeRequestsPerSecondMetric, $.queries.distributor.writeRequestsPerSecondSelector) + - if $._config.show_rejected_requests_on_writes_dashboard then { - targets: [ - { - legendLink: null, - expr: 'sum (rate(cortex_distributor_instance_rejected_requests_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.distributor)], - format: 'time_series', - intervalFactor: 2, - legendFormat: 'rejected', - refId: 'B', - }, - ] + super.targets, - aliasColors+: { - rejected: '#EAB839', - }, - } else {}, - ) - .addPanel( - $.panel('Latency') + - $.latencyPanelNativeHistogram($.queries.distributor.writeRequestsPerSecondMetric, $.queries.distributor.writeRequestsPerSecondSelector) - ) - .addPanel( - $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + - $.hiddenLegendQueryPanel( - utils.nativeClassicHistogramQuantile('0.99', $.queries.distributor.writeRequestsPerSecondMetric, $.queries.distributor.writeRequestsPerSecondSelector, [$._config.per_instance_label]), '' - ) - ) - ) - .addRowsIf(std.objectHasAll($._config.injectRows, 
'postDistributor'), $._config.injectRows.postDistributor($)) - .addRow( - $.row('Ingester') - .addPanel( - $.panel('Requests / sec') + - $.panelDescription( - 'Requests / sec', - ||| - The rate of successful, failed and rejected requests to ingester. - Rejected requests are requests that ingester fails to handle because of ingester instance limits (ingester-max-inflight-push-requests, ingester-max-inflight-push-requests-bytes, ingester-max-ingestion-rate). - When ingester is configured to use "early" request rejection, then rejected requests are NOT included in other metrics. - When ingester is not configured to use "early" request rejection, then rejected requests are also counted as "errors". - ||| - ) + - $.qpsPanelNativeHistogram('cortex_request_duration_seconds', '%s,route="/cortex.Ingester/Push"' % $.jobMatcher($._config.job_names.ingester)) + - if $._config.show_rejected_requests_on_writes_dashboard then { - targets: [ - { - legendLink: null, - expr: 'sum (rate(cortex_ingester_instance_rejected_requests_total{%s, reason=~"ingester_max_inflight_push_requests|ingester_max_ingestion_rate"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], - format: 'time_series', - intervalFactor: 2, - legendFormat: 'rejected', - refId: 'B', - }, - ] + super.targets, - aliasColors+: { - rejected: '#EAB839', - }, - } else {}, - ) - .addPanel( - $.panel('Latency') + - $.latencyPanelNativeHistogram('cortex_request_duration_seconds', '%s,route="/cortex.Ingester/Push"' % $.jobMatcher($._config.job_names.ingester)) - ) - .addPanel( - $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + - $.hiddenLegendQueryPanel( - utils.nativeClassicHistogramQuantile('0.99', 'cortex_request_duration_seconds', '%s,route="/cortex.Ingester/Push"' % $.jobMatcher($._config.job_names.ingester), [$._config.per_instance_label]), '' - ) - ) - ) - .addRowIf( - $._config.gateway_enabled && $._config.autoscaling.gateway.enabled, - 
$.cpuAndMemoryBasedAutoScalingRow('Gateway'), - ) - .addRowIf( - $._config.autoscaling.distributor.enabled, - $.cpuAndMemoryBasedAutoScalingRow('Distributor'), - ) - .addRow( - $.kvStoreRow('Distributor - key-value store for high-availability (HA) deduplication', 'distributor', 'distributor-hatracker') - ) - .addRow( - $.kvStoreRow('Distributor - key-value store for distributors ring', 'distributor', 'distributor-(lifecycler|ring)') - ) - .addRow( - $.kvStoreRow('Ingester - key-value store for the ingesters ring', 'ingester', 'ingester-.*') - ) - .addRow( - $.row('Ingester - shipper') - .addPanel( - $.panel('Uploaded blocks / sec') + - $.successFailurePanel( - 'sum(rate(cortex_ingester_shipper_uploads_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], - 'sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), - ) + - $.panelDescription( - 'Uploaded blocks / sec', - ||| - The rate of blocks being uploaded from the ingesters - to object storage. - ||| - ) + - $.stack, - ) - .addPanel( - $.panel('Upload latency') + - $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher($._config.job_names.ingester)) + - $.panelDescription( - 'Upload latency', - ||| - The average, median (50th percentile), and 99th percentile time - the ingesters take to upload blocks to object storage. 
- ||| - ), - ) - ) + // .addRowIf( + // $._config.gateway_enabled, + // $.row('Gateway') + // .addPanel( + // $.panel('Requests / sec') + + // $.qpsPanelNativeHistogram($.queries.gateway.writeRequestsPerSecondMetric, $.queries.gateway.writeRequestsPerSecondSelector) + // ) + // .addPanel( + // $.panel('Latency') + + // $.latencyPanelNativeHistogram($.queries.gateway.writeRequestsPerSecondMetric, $.queries.gateway.writeRequestsPerSecondSelector) + // ) + // .addPanel( + // $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + + // $.hiddenLegendQueryPanel( + // utils.nativeClassicHistogramQuantile('0.99', $.queries.gateway.writeRequestsPerSecondMetric, $.queries.gateway.writeRequestsPerSecondSelector, [$._config.per_instance_label]), '' + // ) + // ) + // ) + // .addRow( + // $.row('Distributor') + // .addPanel( + // $.panel('Requests / sec') + + // $.panelDescription( + // 'Requests / sec', + // ||| + // The rate of successful, failed and rejected requests to distributor. + // Rejected requests are requests that distributor fails to handle because of distributor instance limits. + // When distributor is configured to use "early" request rejection, then rejected requests are NOT included in other metrics. + // When distributor is not configured to use "early" request rejection, then rejected requests are also counted as "errors". 
+ // ||| + // ) + + // $.qpsPanelNativeHistogram($.queries.distributor.writeRequestsPerSecondMetric, $.queries.distributor.writeRequestsPerSecondSelector) + + // if $._config.show_rejected_requests_on_writes_dashboard then { + // targets: [ + // { + // legendLink: null, + // expr: 'sum (rate(cortex_distributor_instance_rejected_requests_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.distributor)], + // format: 'time_series', + // intervalFactor: 2, + // legendFormat: 'rejected', + // refId: 'B', + // }, + // ] + super.targets, + // aliasColors+: { + // rejected: '#EAB839', + // }, + // } else {}, + // ) + // .addPanel( + // $.panel('Latency') + + // $.latencyPanelNativeHistogram($.queries.distributor.writeRequestsPerSecondMetric, $.queries.distributor.writeRequestsPerSecondSelector) + // ) + // .addPanel( + // $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + + // $.hiddenLegendQueryPanel( + // utils.nativeClassicHistogramQuantile('0.99', $.queries.distributor.writeRequestsPerSecondMetric, $.queries.distributor.writeRequestsPerSecondSelector, [$._config.per_instance_label]), '' + // ) + // ) + // ) + // .addRowsIf(std.objectHasAll($._config.injectRows, 'postDistributor'), $._config.injectRows.postDistributor($)) + // .addRow( + // $.row('Ingester') + // .addPanel( + // $.panel('Requests / sec') + + // $.panelDescription( + // 'Requests / sec', + // ||| + // The rate of successful, failed and rejected requests to ingester. + // Rejected requests are requests that ingester fails to handle because of ingester instance limits (ingester-max-inflight-push-requests, ingester-max-inflight-push-requests-bytes, ingester-max-ingestion-rate). + // When ingester is configured to use "early" request rejection, then rejected requests are NOT included in other metrics. + // When ingester is not configured to use "early" request rejection, then rejected requests are also counted as "errors". 
+ // ||| + // ) + + // $.qpsPanelNativeHistogram('cortex_request_duration_seconds', '%s,route="/cortex.Ingester/Push"' % $.jobMatcher($._config.job_names.ingester)) + + // if $._config.show_rejected_requests_on_writes_dashboard then { + // targets: [ + // { + // legendLink: null, + // expr: 'sum (rate(cortex_ingester_instance_rejected_requests_total{%s, reason=~"ingester_max_inflight_push_requests|ingester_max_ingestion_rate"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + // format: 'time_series', + // intervalFactor: 2, + // legendFormat: 'rejected', + // refId: 'B', + // }, + // ] + super.targets, + // aliasColors+: { + // rejected: '#EAB839', + // }, + // } else {}, + // ) + // .addPanel( + // $.panel('Latency') + + // $.latencyPanelNativeHistogram('cortex_request_duration_seconds', '%s,route="/cortex.Ingester/Push"' % $.jobMatcher($._config.job_names.ingester)) + // ) + // .addPanel( + // $.timeseriesPanel('Per %s p99 latency' % $._config.per_instance_label) + + // $.hiddenLegendQueryPanel( + // utils.nativeClassicHistogramQuantile('0.99', 'cortex_request_duration_seconds', '%s,route="/cortex.Ingester/Push"' % $.jobMatcher($._config.job_names.ingester), [$._config.per_instance_label]), '' + // ) + // ) + // ) + // .addRowIf( + // $._config.gateway_enabled && $._config.autoscaling.gateway.enabled, + // $.cpuAndMemoryBasedAutoScalingRow('Gateway'), + // ) + // .addRowIf( + // $._config.autoscaling.distributor.enabled, + // $.cpuAndMemoryBasedAutoScalingRow('Distributor'), + // ) + // .addRow( + // $.kvStoreRow('Distributor - key-value store for high-availability (HA) deduplication', 'distributor', 'distributor-hatracker') + // ) + // .addRow( + // $.kvStoreRow('Distributor - key-value store for distributors ring', 'distributor', 'distributor-(lifecycler|ring)') + // ) + // .addRow( + // $.kvStoreRow('Ingester - key-value store for the ingesters ring', 'ingester', 'ingester-.*') + // ) + // .addRow( + // $.row('Ingester - shipper') + // 
.addPanel( + // $.panel('Uploaded blocks / sec') + + // $.successFailurePanel( + // 'sum(rate(cortex_ingester_shipper_uploads_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + // 'sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), + // ) + + // $.panelDescription( + // 'Uploaded blocks / sec', + // ||| + // The rate of blocks being uploaded from the ingesters + // to object storage. + // ||| + // ) + + // $.stack, + // ) + // .addPanel( + // $.panel('Upload latency') + + // $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher($._config.job_names.ingester)) + + // $.panelDescription( + // 'Upload latency', + // ||| + // The average, median (50th percentile), and 99th percentile time + // the ingesters take to upload blocks to object storage. 
+ // ||| + // ), + // ) + // ) .addRow( $.row('Ingester - TSDB head') .addPanel( diff --git a/operations/mimir-mixin/jsonnetfile.json b/operations/mimir-mixin/jsonnetfile.json index 790eee25d1b..9db181877f3 100644 --- a/operations/mimir-mixin/jsonnetfile.json +++ b/operations/mimir-mixin/jsonnetfile.json @@ -3,21 +3,17 @@ "dependencies": [ { "source": { - "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "grafana-builder" + "local": { + "directory": "./lib/mixin-utils" } - }, - "version": "197e35bdd28eb3f96a8c7ddce0edd94bd6dc59c0" + } }, { "source": { - "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "mixin-utils" + "local": { + "directory": "./lib/grafana-builder" } - }, - "version": "197e35bdd28eb3f96a8c7ddce0edd94bd6dc59c0" + } } ], "legacyImports": true diff --git a/operations/mimir-mixin/jsonnetfile.lock.json b/operations/mimir-mixin/jsonnetfile.lock.json index 88f24dec850..ed9dc3944f7 100644 --- a/operations/mimir-mixin/jsonnetfile.lock.json +++ b/operations/mimir-mixin/jsonnetfile.lock.json @@ -3,23 +3,19 @@ "dependencies": [ { "source": { - "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "grafana-builder" + "local": { + "directory": "./lib/grafana-builder" } }, - "version": "197e35bdd28eb3f96a8c7ddce0edd94bd6dc59c0", - "sum": "LuE0qvWz5gOxRTtPKm7lrjRlZZRemTCQtGykVkL/Dls=" + "version": "" }, { "source": { - "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "mixin-utils" + "local": { + "directory": "./lib/mixin-utils" } }, - "version": "197e35bdd28eb3f96a8c7ddce0edd94bd6dc59c0", - "sum": "pI+bGWLbOjxVd+i943ECFtqJVQ9lB3/np9tO7h93q3E=" + "version": "" } ], "legacyImports": false diff --git a/operations/mimir-mixin/lib/grafana-builder/grafana.libsonnet b/operations/mimir-mixin/lib/grafana-builder/grafana.libsonnet new file mode 100644 index 00000000000..7ba37976d9d --- /dev/null +++ 
b/operations/mimir-mixin/lib/grafana-builder/grafana.libsonnet @@ -0,0 +1,683 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +{ + dashboard(title, uid='', datasource='default', datasource_regex=''):: { + // Stuff that isn't materialised. + _nextPanel:: 1, + addRow(row):: self { + // automatically number panels in added rows. + local n = std.length(row.panels), + local nextPanel = super._nextPanel, + local panels = std.makeArray(n, function(i) + row.panels[i] { id: nextPanel + i }), + + _nextPanel: nextPanel + n, + rows+: [row { panels: panels }], + }, + + addTemplate(name, metric_name, label_name, hide=0, allValue=null, includeAll=false, sort=2):: self { + templating+: { + list+: [{ + allValue: allValue, + current: { + text: 'prod', + value: 'prod', + }, + datasource: '$datasource', + hide: hide, + includeAll: includeAll, + label: name, + multi: false, + name: name, + options: [], + query: 'label_values(%s, %s)' % [metric_name, label_name], + refresh: 1, + regex: '', + sort: sort, + tagValuesQuery: '', + tags: [], + tagsQuery: '', + type: 'query', + useTags: false, + }], + }, + }, + + addMultiTemplate(name, metric_name, label_name, hide=0, allValue='.+', sort=2):: self { + templating+: { + list+: [{ + allValue: allValue, + current: { + selected: true, + text: 'All', + value: '$__all', + }, + datasource: '$datasource', + hide: hide, + includeAll: true, + label: name, + multi: true, + name: name, + options: [], + query: 'label_values(%s, %s)' % [metric_name, label_name], + refresh: 1, + regex: '', + sort: sort, + tagValuesQuery: '', + tags: [], + tagsQuery: '', + type: 'query', + useTags: false, + }], + }, + }, + + addShowHistoricDataVariable():: self { + templating+: { + list+: [{ + current: { + selected: true, + text: 'show', + value: '1', + }, + description: 'When setting this option to 1, panels will query and show deprecated low precision histogram metrics.', + hide: 0, + includeAll: false, + label: 'Show historic data', + multi: false, + name: 
'show_classic_histograms', + query: 'hide : 0,show : 1', + options: [ + { + selected: false, + text: 'hide', + value: '0' + }, + { + selected: true, + text: 'show', + value: '1' + } + ], + skipUrlSync: false, + type: 'custom', + useTags: false, + }], + }, + }, + + dashboardLinkUrl(title, url):: self { + links+: [ + { + asDropdown: false, + icon: 'external link', + includeVars: true, + keepTime: true, + tags: [], + targetBlank: true, + title: title, + tooltip: '', + type: 'link', + url: url, + }, + ], + }, + + // Stuff that is materialised. + uid: uid, + annotations: { + list: [], + }, + hideControls: false, + links: [], + rows: [], + schemaVersion: 14, + style: 'dark', + tags: [], + editable: true, + gnetId: null, + graphTooltip: 0, + templating: { + list: [ + { + current: { + text: datasource, + value: datasource, + }, + hide: 0, + label: 'Data source', + name: 'datasource', + options: [], + query: 'prometheus', + refresh: 1, + regex: datasource_regex, + type: 'datasource', + }, + ], + }, + time: { + from: 'now-1h', + to: 'now', + }, + refresh: '10s', + timepicker: { + refresh_intervals: [ + '5s', + '10s', + '30s', + '1m', + '5m', + '15m', + '30m', + '1h', + '2h', + '1d', + ], + time_options: [ + '5m', + '15m', + '1h', + '6h', + '12h', + '24h', + '2d', + '7d', + '30d', + ], + }, + timezone: 'utc', + title: title, + version: 0, + }, + + row(title):: { + _panels:: [], + addPanel(panel):: self { + _panels+: [panel], + }, + + panels: + // Automatically distribute panels within a row. + local n = std.length(self._panels); + [ + p { span: std.floor(12 / n) } + for p in self._panels + ], + + collapse: false, + height: '250px', + repeat: null, + repeatIteration: null, + repeatRowId: null, + showTitle: true, + title: title, + titleSize: 'h6', + }, + + // "graph" type, now deprecated. 
+ panel(title):: { + aliasColors: {}, + bars: false, + dashLength: 10, + dashes: false, + datasource: '$datasource', + fill: 1, + legend: { + avg: false, + current: false, + max: false, + min: false, + show: true, + total: false, + values: false, + }, + lines: true, + linewidth: 1, + links: [], + nullPointMode: 'null as zero', + percentage: false, + pointradius: 5, + points: false, + renderer: 'flot', + seriesOverrides: [], + spaceLength: 10, + span: 6, + stack: false, + steppedLine: false, + targets: [], + thresholds: [], + timeFrom: null, + timeShift: null, + title: title, + tooltip: { + shared: true, + sort: 2, + value_type: 'individual', + }, + type: 'graph', + xaxis: { + buckets: null, + mode: 'time', + name: null, + show: true, + values: [], + }, + yaxes: $.yaxes('short'), + }, + + // "timeseries" panel, introduced with Grafana 7.4 and made standard in 8.0. + timeseriesPanel(title):: { + datasource: '$datasource', + fieldConfig: { + defaults: { + custom: { + drawStyle: 'line', + fillOpacity: 1, + lineWidth: 1, + pointSize: 5, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'none', + }, + }, + thresholds: { + mode: 'absolute', + steps: [], + }, + unit: 's', + }, + overrides: [], + }, + options: { + legend: { + showLegend: true, + }, + tooltip: { + mode: 'single', + sort: 'none', + }, + }, + links: [], + targets: [], + title: title, + type: 'timeseries', + }, + + queryPanel(queries, legends, legendLink=null):: { + + local qs = + if std.type(queries) == 'string' + then [queries] + else queries, + local ls = + if std.type(legends) == 'string' + then [legends] + else legends, + + local qsandls = if std.length(ls) == std.length(qs) + then std.makeArray(std.length(qs), function(x) { q: qs[x], l: ls[x] }) + else error 'length of queries is not equal to length of legends', + + targets+: [ + { + legendLink: legendLink, + expr: ql.q, + format: 'time_series', + legendFormat: ql.l, + } + for ql in qsandls + ], + }, + + statPanel(query, 
format='percentunit'):: { + type: 'singlestat', + thresholds: '70,80', + format: format, + targets: [ + { + expr: query, + format: 'time_series', + instant: true, + refId: 'A', + }, + ], + }, + + tablePanel(queries, labelStyles):: { + local qs = + if std.type(queries) == 'string' + then [queries] + else queries, + + local style(labelStyle) = + if std.type(labelStyle) == 'string' + then { + alias: labelStyle, + colorMode: null, + colors: [], + dateFormat: 'YYYY-MM-DD HH:mm:ss', + decimals: 2, + thresholds: [], + type: 'string', + unit: 'short', + } + else { + alias: labelStyle.alias, + colorMode: null, + colors: [], + dateFormat: 'YYYY-MM-DD HH:mm:ss', + decimals: if std.objectHas(labelStyle, 'decimals') then labelStyle.decimals else 2, + thresholds: [], + type: if std.objectHas(labelStyle, 'type') then labelStyle.type else 'number', + unit: if std.objectHas(labelStyle, 'unit') then labelStyle.unit else 'short', + link: std.objectHas(labelStyle, 'link'), + linkTargetBlank: if std.objectHas(labelStyle, 'linkTargetBlank') then labelStyle.linkTargetBlank else false, + linkTooltip: if std.objectHas(labelStyle, 'linkTooltip') then labelStyle.linkTooltip else 'Drill down', + linkUrl: if std.objectHas(labelStyle, 'link') then labelStyle.link else '', + }, + + _styles:: { + // By default hide time. 
+ Time: { + alias: 'Time', + dateFormat: 'YYYY-MM-DD HH:mm:ss', + type: 'hidden', + }, + } + { + [label]: style(labelStyles[label]) + for label in std.objectFields(labelStyles) + }, + + styles: [ + self._styles[pattern] { pattern: pattern } + for pattern in std.objectFields(self._styles) + ] + [style('') + { pattern: '/.*/' }], + + transform: 'table', + type: 'table', + targets: [ + { + expr: qs[i], + format: 'table', + instant: true, + legendFormat: '', + refId: std.char(65 + i), + } + for i in std.range(0, std.length(qs) - 1) + ], + }, + + textPanel(title, markdown):: { + type: 'text', + title: title, + options: { + content: markdown, + mode: 'markdown', + }, + transparent: true, + datasource: null, + timeFrom: null, + timeShift: null, + fieldConfig: { + defaults: { + custom: {}, + }, + overrides: [], + }, + }, + + stack:: { + stack: true, + fill: 10, + linewidth: 0, + }, + + yaxes(args):: + local format = if std.type(args) == 'string' then args else null; + local options = if std.type(args) == 'object' then args else {}; + [ + { + format: format, + label: null, + logBase: 1, + max: null, + min: 0, + show: true, + } + options, + { + format: 'short', + label: null, + logBase: 1, + max: null, + min: null, + show: false, + }, + ], + + httpStatusColors:: { + '1xx': '#EAB839', + '2xx': '#7EB26D', + '3xx': '#6ED0E0', + '4xx': '#EF843C', + '5xx': '#E24D42', + OK: '#7EB26D', + success: '#7EB26D', + 'error': '#E24D42', + cancel: '#A9A9A9', + }, + + qpsPanel(selector, statusLabelName='status_code'):: { + aliasColors: $.httpStatusColors, + targets: [ + { + expr: + ||| + sum by (status) ( + label_replace(label_replace(rate(%s[$__rate_interval]), + "status", "${1}xx", "%s", "([0-9]).."), + "status", "${1}", "%s", "([a-zA-Z]+)")) + ||| % [selector, statusLabelName, statusLabelName], + format: 'time_series', + legendFormat: '{{status}}', + refId: 'A', + }, + ], + } + $.stack, + + // Assumes that the metricName is for a histogram (as opposed to qpsPanel above) + // Assumes that 
there is a dashboard variable named show_classic_histograms, values are 0 or 1 + qpsPanelNativeHistogram(title, metricName, selector, statusLabelName='status_code'):: $.timeseriesPanel(title) { + fieldConfig+: { + defaults+: { + custom+: { + lineWidth: 0, + fillOpacity: 100, // Get solid fill. + stacking: { + mode: 'normal', // This will be overridden for classic series to hide those behind. + group: 'A' + }, + }, + unit: 'reqps', + min: 0, + }, + overrides+: [{ + matcher: { + id: 'byRegexp', + options: '(historic_)?' + status, + }, + properties: [ + { + id: 'color', + value: { + mode: 'fixed', + fixedColor: $.httpStatusColors[status], + }, + }, + ], + } for status in std.objectFieldsAll($.httpStatusColors)] + [ + // Make the classic histogram query results be in the backround stacked. + { + matcher: { + id: 'byFrameRefID', + options: 'A_classic', + }, + properties: [ + { + id: 'custom.stacking', + value: { + mode: 'normal', + group: 'B', + }, + }, + ], + }, + ], + }, + targets: [ + { + expr: + ||| + sum by (status) ( + label_replace(label_replace(%(metricQuery)s, + "status", "${1}xx", "%(label)s", "([0-9]).."), + "status", "${1}", "%(label)s", "([a-zA-Z]+)")) + ||| % { + metricQuery: utils.nativeClassicHistogramCountRate(metricName, selector).native, + label: statusLabelName, + }, + format: 'time_series', + legendFormat: '{{status}}', + refId: 'A', + }, + { + expr: + ||| + sum by (status) ( + label_replace(label_replace(%(metricQuery)s, + "status", "${1}xx", "%(label)s", "([0-9]).."), + "status", "${1}", "%(label)s", "([a-zA-Z]+)")) + < ($show_classic_histograms * +Inf) + ||| % { + metricQuery: utils.nativeClassicHistogramCountRate(metricName, selector).classic, + label: statusLabelName, + }, + format: 'time_series', + legendFormat: 'historic_{{status}}', + refId: 'A_classic', + }, + ], + } + $.stack, + + latencyPanel(metricName, selector, multiplier='1e3'):: { + nullPointMode: 'null as zero', + targets: [ + { + expr: 'histogram_quantile(0.99, 
sum(rate(%s_bucket%s[$__rate_interval])) by (le)) * %s' % [metricName, selector, multiplier], + format: 'time_series', + legendFormat: '99th Percentile', + refId: 'A', + }, + { + expr: 'histogram_quantile(0.50, sum(rate(%s_bucket%s[$__rate_interval])) by (le)) * %s' % [metricName, selector, multiplier], + format: 'time_series', + legendFormat: '50th Percentile', + refId: 'B', + }, + { + expr: 'sum(rate(%s_sum%s[$__rate_interval])) * %s / sum(rate(%s_count%s[$__rate_interval]))' % [metricName, selector, multiplier, metricName, selector], + format: 'time_series', + legendFormat: 'Average', + refId: 'C', + }, + ], + yaxes: $.yaxes('ms'), + }, + + // Assumes that there is a dashboard variable named show_classic_histograms, values are 0 or 1 + latencyPanelNativeHistogram(metricName, selector, multiplier='1e3'):: { + nullPointMode: 'null as zero', + targets: [ + { + expr: '(%(metricQuery)s) * %(multiplier)s' % { + metricQuery: utils.nativeClassicHistogramQuantile('0.99', metricName, selector).native, + multiplier: multiplier, + }, + format: 'time_series', + legendFormat: '99th percentile', + refId: 'A', + }, + { + expr: '(%(metricQuery)s) * %(multiplier)s < ($show_classic_histograms * +Inf)' % { + metricQuery: utils.nativeClassicHistogramQuantile('0.99', metricName, selector).classic, + multiplier: multiplier, + }, + format: 'time_series', + legendFormat: '99th percentile', + refId: 'A_classic', + }, + { + expr: '(%(metricQuery)s) * %(multiplier)s' % { + metricQuery: utils.nativeClassicHistogramQuantile('0.50', metricName, selector).native, + multiplier: multiplier, + }, + format: 'time_series', + legendFormat: '50th percentile', + refId: 'B', + }, + { + expr: '(%(metricQuery)s) * %(multiplier)s < ($show_classic_histograms * +Inf)' % { + metricQuery: utils.nativeClassicHistogramQuantile('0.50', metricName, selector).classic, + multiplier: multiplier, + }, + format: 'time_series', + legendFormat: '50th percentile', + refId: 'B_classic', + }, + { + expr: + ||| + 
%(multiplier)s * sum(%(sumMetricQuery)s) / + sum(%(countMetricQuery)s) + ||| % { + sumMetricQuery: utils.nativeClassicHistogramSumRate(metricName, selector).native, + countMetricQuery: utils.nativeClassicHistogramCountRate(metricName, selector).native, + multiplier: multiplier, + }, + format: 'time_series', + legendFormat: 'Average', + refId: 'C', + }, + { + expr: + ||| + %(multiplier)s * sum(%(sumMetricQuery)s) / + sum(%(countMetricQuery)s) < ($show_classic_histograms * +Inf) + ||| % { + sumMetricQuery: utils.nativeClassicHistogramSumRate(metricName, selector).classic, + countMetricQuery: utils.nativeClassicHistogramCountRate(metricName, selector).classic, + multiplier: multiplier, + }, + format: 'time_series', + legendFormat: 'Average', + refId: 'C_classic', + }, + ], + yaxes: $.yaxes('ms'), + }, + + selector:: { + eq(label, value):: { label: label, op: '=', value: value }, + neq(label, value):: { label: label, op: '!=', value: value }, + re(label, value):: { label: label, op: '=~', value: value }, + nre(label, value):: { label: label, op: '!~', value: value }, + }, + + toPrometheusSelector(selector):: + local pairs = [ + '%(label)s%(op)s"%(value)s"' % matcher + for matcher in selector + ]; + '{%s}' % std.join(', ', pairs), +} diff --git a/operations/mimir-mixin/lib/mixin-utils/utils.libsonnet b/operations/mimir-mixin/lib/mixin-utils/utils.libsonnet new file mode 100644 index 00000000000..24ffece8416 --- /dev/null +++ b/operations/mimir-mixin/lib/mixin-utils/utils.libsonnet @@ -0,0 +1,229 @@ +local g = import 'grafana-builder/grafana.libsonnet'; + +{ + // The classicNativeHistogramQuantile function is used to calculate histogram quantiles from native histograms or classic histograms. + // Metric name should be provided without _bucket suffix. 
+ nativeClassicHistogramQuantile(percentile, metric, selector, sum_by=[], rate_interval='$__rate_interval'):: + local classicSumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(',', ['le'] + sum_by) } else ' by (le) '; + local nativeSumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(',', sum_by) } else ' '; + { + classic: 'histogram_quantile(%(percentile)s, sum%(classicSumBy)s(rate(%(metric)s_bucket{%(selector)s}[%(rateInterval)s])))' % { + classicSumBy: classicSumBy, + metric: metric, + percentile: percentile, + rateInterval: rate_interval, + selector: selector, + }, + native: 'histogram_quantile(%(percentile)s, sum%(nativeSumBy)s(rate(%(metric)s{%(selector)s}[%(rateInterval)s])))' % { + metric: metric, + nativeSumBy: nativeSumBy, + percentile: percentile, + rateInterval: rate_interval, + selector: selector, + }, + }, + + // The nativeClassicHistogramSumRate function returns the rate-of-sum query for a histogram as an object with `classic` and `native` variants. + // Metric name should be provided without _sum suffix, and selector without the surrounding braces. + nativeClassicHistogramSumRate(metric, selector, rate_interval='$__rate_interval'):: + { + classic: 'rate(%(metric)s_sum{%(selector)s}[%(rateInterval)s])' % { + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + native: 'histogram_sum(rate(%(metric)s{%(selector)s}[%(rateInterval)s]))' % { + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + }, + + + // The nativeClassicHistogramCountRate function returns the rate-of-count query for a histogram as an object with `classic` and `native` variants. + // Metric name should be provided without _count suffix, and selector without the surrounding braces. 
+ nativeClassicHistogramCountRate(metric, selector, rate_interval='$__rate_interval'):: + { + classic: 'rate(%(metric)s_count{%(selector)s}[%(rateInterval)s])' % { + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + native: 'histogram_count(rate(%(metric)s{%(selector)s}[%(rateInterval)s]))' % { + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + }, + + histogramRules(metric, labels, interval='1m'):: // Recording rules (99/50 quantile, avg, and bucket/sum/count sum_rate) for a classic histogram; `interval` is the rate window used by every rule. + local vars = { + metric: metric, + labels_underscore: std.join('_', labels), + labels_comma: std.join(', ', labels), + interval: interval, + }; + [ + { + record: '%(labels_underscore)s:%(metric)s:99quantile' % vars, + expr: 'histogram_quantile(0.99, sum(rate(%(metric)s_bucket[%(interval)s])) by (le, %(labels_comma)s))' % vars, + }, + { + record: '%(labels_underscore)s:%(metric)s:50quantile' % vars, + expr: 'histogram_quantile(0.50, sum(rate(%(metric)s_bucket[%(interval)s])) by (le, %(labels_comma)s))' % vars, + }, + { + record: '%(labels_underscore)s:%(metric)s:avg' % vars, + expr: 'sum(rate(%(metric)s_sum[%(interval)s])) by (%(labels_comma)s) / sum(rate(%(metric)s_count[%(interval)s])) by (%(labels_comma)s)' % vars, + }, + { + record: '%(labels_underscore)s:%(metric)s_bucket:sum_rate' % vars, + expr: 'sum(rate(%(metric)s_bucket[%(interval)s])) by (le, %(labels_comma)s)' % vars, + }, + { + record: '%(labels_underscore)s:%(metric)s_sum:sum_rate' % vars, + expr: 'sum(rate(%(metric)s_sum[%(interval)s])) by (%(labels_comma)s)' % vars, + }, + { + record: '%(labels_underscore)s:%(metric)s_count:sum_rate' % vars, + expr: 'sum(rate(%(metric)s_count[%(interval)s])) by (%(labels_comma)s)' % vars, + }, + ], + + + // latencyRecordingRulePanel - build a latency panel for a recording rule. 
+ // - metric: the base metric name (middle part of recording rule name) + // - selectors: list of selectors which will be added to first part of + // recording rule name, and to the query selector itself. 
+ // - extra_selectors (optional): list of selectors which will be added to the + // query selector, but not to the beginning of the recording rule name. + // Useful for external labels. + // - multiplier (optional): assumes results are in seconds, will multiply + // by 1e3 to get ms. Can be turned off. + // - sum_by (optional): additional labels to use in the sum by clause, will also be used in the legend + latencyRecordingRulePanel(metric, selectors, extra_selectors=[], multiplier='1e3', sum_by=[]):: + local labels = std.join('_', [matcher.label for matcher in selectors]); + local selectorStr = $.toPrometheusSelector(selectors + extra_selectors); + local sb = ['le']; + local legend = std.join('', ['{{ %(lb)s }} ' % lb for lb in sum_by]); + local sumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(',', sum_by) } else ''; + local sumByHisto = std.join(',', sb + sum_by); + { + nullPointMode: 'null as zero', + yaxes: g.yaxes('ms'), + targets: [ + { + expr: 'histogram_quantile(0.99, sum by (%(sumBy)s) (%(labels)s:%(metric)s_bucket:sum_rate%(selector)s)) * %(multiplier)s' % { + labels: labels, + metric: metric, + selector: selectorStr, + multiplier: multiplier, + sumBy: sumByHisto, + }, + format: 'time_series', + legendFormat: '%(legend)s99th percentile' % legend, + refId: 'A', + }, + { + expr: 'histogram_quantile(0.50, sum by (%(sumBy)s) (%(labels)s:%(metric)s_bucket:sum_rate%(selector)s)) * %(multiplier)s' % { + labels: labels, + metric: metric, + selector: selectorStr, + multiplier: multiplier, + sumBy: sumByHisto, + }, + format: 'time_series', + legendFormat: '%(legend)s50th percentile' % legend, + refId: 'B', + }, + { + expr: '%(multiplier)s * sum(%(labels)s:%(metric)s_sum:sum_rate%(selector)s)%(sumBy)s / sum(%(labels)s:%(metric)s_count:sum_rate%(selector)s)%(sumBy)s' % { + labels: labels, + metric: metric, + selector: selectorStr, + multiplier: multiplier, + sumBy: sumBy, + }, + format: 'time_series', + legendFormat: '%(legend)sAverage' % 
legend, + refId: 'C', + }, + ], + }, + + selector:: { + eq(label, value):: { label: label, op: '=', value: value }, + neq(label, value):: { label: label, op: '!=', value: value }, + re(label, value):: { label: label, op: '=~', value: value }, + nre(label, value):: { label: label, op: '!~', value: value }, + + // Use with latencyRecordingRulePanel to get the label in the metric name + // but not in the selector. + noop(label):: { label: label, op: 'nop' }, + }, + + toPrometheusSelector(selector):: + local pairs = [ + '%(label)s%(op)s"%(value)s"' % matcher + for matcher in std.filter(function(matcher) matcher.op != 'nop', selector) + ]; + '{%s}' % std.join(', ', pairs), + + // withRunbookURL - Add/Override the runbook_url annotations for all alerts inside a list of rule groups. + // - url_format: an URL format for the runbook, the alert name will be substituted in the URL. + // - groups: the list of rule groups containing alerts. + withRunbookURL(url_format, groups):: + local update_rule(rule) = + if std.objectHas(rule, 'alert') + then rule { + annotations+: { + runbook_url: url_format % rule.alert, + }, + } + else rule; + [ + group { + rules: [ + update_rule(alert) + for alert in group.rules + ], + } + for group in groups + ], + + removeRuleGroup(ruleName):: { + local removeRuleGroup(rule) = if rule.name == ruleName then null else rule, + local currentRuleGroups = super.groups, + groups: std.prune(std.map(removeRuleGroup, currentRuleGroups)), + }, + + removeAlertRuleGroup(ruleName):: { + prometheusAlerts+:: $.removeRuleGroup(ruleName), + }, + + removeRecordingRuleGroup(ruleName):: { + prometheusRules+:: $.removeRuleGroup(ruleName), + }, + + overrideAlerts(overrides):: { + local overrideRule(rule) = + if 'alert' in rule && std.objectHas(overrides, rule.alert) + then rule + overrides[rule.alert] + else rule, + local overrideInGroup(group) = group { rules: std.map(overrideRule, super.rules) }, + prometheusAlerts+:: { + groups: std.map(overrideInGroup, super.groups), + 
}, + }, + + removeAlerts(alerts):: { + local removeRule(rule) = + if 'alert' in rule && std.objectHas(alerts, rule.alert) + then {} + else rule, + local removeInGroup(group) = group { rules: std.map(removeRule, super.rules) }, + prometheusAlerts+:: { + groups: std.prune(std.map(removeInGroup, super.groups)), + }, + }, +} From 00ba335ba3f3fb00c37fa9f25d474ca04748e259 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Wed, 14 Feb 2024 10:05:20 +0100 Subject: [PATCH 13/14] Update status panel and reads panel on overview MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../dashboards/mimir-overview.json | 262 ++++++++++++++---- .../dashboards/mimir-overview.json | 262 ++++++++++++++---- .../dashboards/dashboard-queries.libsonnet | 32 ++- .../mimir-mixin/dashboards/overview.libsonnet | 22 +- 4 files changed, 458 insertions(+), 120 deletions(-) diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json index 6099b8578ff..b907653f3a4 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json @@ -91,11 +91,31 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n 
sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n < ($show_classic_histograms * +Inf)", + "instant": false, + "legendFormat": "Writes historic", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n", "instant": false, "legendFormat": "Reads", "range": true }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n 
< ($show_classic_histograms * +Inf)", + "instant": false, + "legendFormat": "Reads historic", + "range": true + }, { "datasource": { "uid": "$datasource" @@ -589,70 +609,212 @@ "type": "text" }, { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "OK": "#7EB26D", - "cancel": "#A9A9A9", - "error": "#E24D42", - "success": "#7EB26D" - }, - "bars": false, - "dashLength": 10, - "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?1xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?2xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?3xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?4xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?5xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?OK" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + 
"id": "byRegexp", + "options": "(historic_)?cancel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#A9A9A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?error" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "A_classic" + }, + "properties": [ + { + "id": "custom.stacking", + "value": { + "group": "B", + "mode": "normal" + } + } + ] + } + ] + }, "fill": 10, "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, "linewidth": 0, "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, "span": 3, "stack": true, - "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", 
\"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($show_classic_histograms * +Inf)\n", + "format": "time_series", + "legendFormat": "historic_{{status}}", + "refId": "A_classic" } ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, "title": "Read requests / sec", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, + "type": "timeseries", "yaxes": [ { "format": "reqps", diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json index 6099b8578ff..b907653f3a4 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json @@ -91,11 +91,31 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n 
sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n < ($show_classic_histograms * +Inf)", + "instant": false, + "legendFormat": "Writes historic", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n", "instant": false, "legendFormat": "Reads", "range": true }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n 
< ($show_classic_histograms * +Inf)", + "instant": false, + "legendFormat": "Reads historic", + "range": true + }, { "datasource": { "uid": "$datasource" @@ -589,70 +609,212 @@ "type": "text" }, { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "OK": "#7EB26D", - "cancel": "#A9A9A9", - "error": "#E24D42", - "success": "#7EB26D" - }, - "bars": false, - "dashLength": 10, - "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?1xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?2xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?3xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?4xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?5xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?OK" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + 
"id": "byRegexp", + "options": "(historic_)?cancel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#A9A9A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?error" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(historic_)?success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "A_classic" + }, + "properties": [ + { + "id": "custom.stacking", + "value": { + "group": "B", + "mode": "normal" + } + } + ] + } + ] + }, "fill": 10, "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, "linewidth": 0, "links": [ ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, "span": 3, "stack": true, - "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])),\n \"status\", \"${1}xx\", \"status_code\", 
\"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" + }, + { + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n < ($show_classic_histograms * +Inf)\n", + "format": "time_series", + "legendFormat": "historic_{{status}}", + "refId": "A_classic" } ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, "title": "Read requests / sec", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, + "type": "timeseries", "yaxes": [ { "format": "reqps", diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index 8628b129ebb..2beba73417d 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -37,7 +37,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; readRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, // Write failures rate as percentage of total requests. - writeFailuresRate: ||| + writeFailuresRate(sampleType='native'):: ||| ( # gRPC errors are not tracked as 5xx but "error". 
sum(%(countFailQuery)s) @@ -48,12 +48,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; / sum(%(countQuery)s) ||| % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"').native, - countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, + countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"')[sampleType], + countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector)[sampleType], }, // Read failures rate as percentage of total requests. - readFailuresRate: ||| + readFailuresRate(sampleType='native'):: ||| ( # gRPC errors are not tracked as 5xx but "error". sum(%(countFailQuery)s) @@ -64,8 +64,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; / sum(%(countQuery)s) ||| % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"').native, - countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).native, + countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"')[sampleType], + countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector)[sampleType], }, }, @@ -78,7 +78,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; exemplarsPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_exemplars:rate5m{%(distributorMatcher)s})' % variables, // Write failures rate as percentage of total requests. - writeFailuresRate: ||| + writeFailuresRate(sampleType='native'):: ||| ( # gRPC errors are not tracked as 5xx but "error". 
sum(%(countFailQuery)s) @@ -89,13 +89,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; / sum(%(countQuery)s) ||| % { - countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"').native, - countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, + countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"')[sampleType], + countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector)[sampleType], }, }, query_frontend: { + local p = self, readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, + readRequestsPerSecondMetric: 'cortex_request_duration_seconds', + readRequestsPerSecondSelector: '%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, instantQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query"}[$__rate_interval]))' % variables, rangeQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query_range"}[$__rate_interval]))' % variables, labelNamesQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_labels"}[$__rate_interval]))' % variables, @@ -107,16 +110,19 @@ local utils = import 'mixin-utils/utils.libsonnet'; otherQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_.*",route!~".*(query|query_range|label.*|series|read|metadata|query_exemplars)"}[$__rate_interval]))' % variables, // Read failures rate as percentage of total requests. 
- readFailuresRate: ||| + readFailuresRate(sampleType='native'):: ||| ( - sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval])) + sum(%(countFailQuery)s) or # Handle the case no failure has been tracked yet. vector(0) ) / - sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + sum(%(countQuery)s) + ||| % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"')[sampleType], + countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector)[sampleType], + }, }, ruler: { diff --git a/operations/mimir-mixin/dashboards/overview.libsonnet b/operations/mimir-mixin/dashboards/overview.libsonnet index f009ed76693..63dac610764 100644 --- a/operations/mimir-mixin/dashboards/overview.libsonnet +++ b/operations/mimir-mixin/dashboards/overview.libsonnet @@ -53,9 +53,13 @@ local filename = 'mimir-overview.json'; 'Status', [ // Write failures. - if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate else $.queries.distributor.writeFailuresRate, + if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate('native') else $.queries.distributor.writeFailuresRate('native'), + // Write failures but from classic histograms. + '%s < ($show_classic_histograms * +Inf)' % if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate('classic') else $.queries.distributor.writeFailuresRate('classic'), // Read failures. - if $._config.gateway_enabled then $.queries.gateway.readFailuresRate else $.queries.query_frontend.readFailuresRate, + if $._config.gateway_enabled then $.queries.gateway.readFailuresRate('native') else $.queries.query_frontend.readFailuresRate('native'), + // Read failures but from classic histograms. 
+ '%s < ($show_classic_histograms * +Inf)' % if $._config.gateway_enabled then $.queries.gateway.readFailuresRate('classic') else $.queries.query_frontend.readFailuresRate('classic'), // Rule evaluation failures. $.queries.ruler.evaluations.failuresRate, // Alerting notifications. @@ -84,7 +88,7 @@ local filename = 'mimir-overview.json'; // Object storage failures. $.queries.storage.failuresRate, ], - ['Writes', 'Reads', 'Rule evaluations', 'Alerting notifications', 'Object storage'] + ['Writes', 'Writes historic', 'Reads', 'Reads historic', 'Rule evaluations', 'Alerting notifications', 'Object storage'] ) ) .addPanel( @@ -162,12 +166,16 @@ local filename = 'mimir-overview.json'; ||| % helpers), ) .addPanel( - $.panel(std.stripChars('Read requests / sec %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' ')) + - $.qpsPanel( + $.qpsPanelNativeHistogram( + std.stripChars('Read requests / sec %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' '), + if $._config.gateway_enabled then + $.queries.gateway.readRequestsPerSecondMetric + else + $.queries.query_frontend.readRequestsPerSecondMetric, if $._config.gateway_enabled then - $.queries.gateway.readRequestsPerSecond + $.queries.gateway.readRequestsPerSecondSelector else - $.queries.query_frontend.readRequestsPerSecond + $.queries.query_frontend.readRequestsPerSecondSelector ) ) .addPanel( From 88c37bc6ab2ee9ef224a606e7ed81ce5e2d100c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Wed, 14 Feb 2024 11:53:05 +0100 Subject: [PATCH 14/14] Update writes latency panel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- .../dashboards/mimir-overview.json | 96 ++++++++++--------- .../dashboards/mimir-overview.json | 96 ++++++++++--------- .../mimir-mixin/dashboards/overview.libsonnet | 11 ++- .../lib/grafana-builder/grafana.libsonnet | 8 +- 4 files changed, 118 insertions(+), 93 deletions(-) diff --git 
a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json index b907653f3a4..0ff515bce82 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json @@ -420,72 +420,82 @@ ] }, { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "s" + }, + "overrides": [ ] }, - "lines": true, - "linewidth": 1, + "id": 6, "links": [ ], "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, "span": 3, - "stack": false, - "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3", + "expr": "(histogram_quantile(0.99, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3", "format": 
"time_series", "legendFormat": "99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3", + "expr": "(histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3 < ($show_classic_histograms * +Inf)", + "format": "time_series", + "legendFormat": "Historic 99th percentile", + "refId": "A_classic" + }, + { + "expr": "(histogram_quantile(0.50, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3", "format": "time_series", "legendFormat": "50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})", + "expr": "(histogram_quantile(0.50, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) 
* 1e3 < ($show_classic_histograms * +Inf)", + "format": "time_series", + "legendFormat": "Historic 50th percentile", + "refId": "B_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) /\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))\n", "format": "time_series", "legendFormat": "Average", "refId": "C" + }, + { + "expr": "1e3 * sum(rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) /\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) < ($show_classic_histograms * +Inf)\n", + "format": "time_series", + "legendFormat": "Historic average", + "refId": "C_classic" } ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, "title": "Write latency", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, + "type": "timeseries", "yaxes": [ { "format": "ms", diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json index b907653f3a4..0ff515bce82 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json 
+++ b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json @@ -420,72 +420,82 @@ ] }, { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, "datasource": "$datasource", - "fill": 1, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 1, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "s" + }, + "overrides": [ ] }, - "lines": true, - "linewidth": 1, + "id": 6, "links": [ ], "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ ], - "spaceLength": 10, + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, "span": 3, - "stack": false, - "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3", + "expr": "(histogram_quantile(0.99, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3", "format": "time_series", "legendFormat": "99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", 
route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})) * 1e3", + "expr": "(histogram_quantile(0.99, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3 < ($show_classic_histograms * +Inf)", + "format": "time_series", + "legendFormat": "Historic 99th percentile", + "refId": "A_classic" + }, + { + "expr": "(histogram_quantile(0.50, sum (rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3", "format": "time_series", "legendFormat": "50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:cortex_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}) / sum(cluster_job_route:cortex_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"})", + "expr": "(histogram_quantile(0.50, sum by (le) (rate(cortex_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))) * 1e3 < ($show_classic_histograms * +Inf)", + "format": "time_series", + "legendFormat": "Historic 50th percentile", + "refId": "B_classic" + }, + { + "expr": "1e3 * sum(histogram_sum(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", 
job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))) /\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))\n", "format": "time_series", "legendFormat": "Average", "refId": "C" + }, + { + "expr": "1e3 * sum(rate(cortex_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) /\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])) < ($show_classic_histograms * +Inf)\n", + "format": "time_series", + "legendFormat": "Historic average", + "refId": "C_classic" } ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, "title": "Write latency", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, + "type": "timeseries", "yaxes": [ { "format": "ms", diff --git a/operations/mimir-mixin/dashboards/overview.libsonnet b/operations/mimir-mixin/dashboards/overview.libsonnet index 63dac610764..ed2e158ea7f 100644 --- a/operations/mimir-mixin/dashboards/overview.libsonnet +++ b/operations/mimir-mixin/dashboards/overview.libsonnet @@ -131,11 +131,16 @@ local filename = 'mimir-overview.json'; ) ) .addPanel( - $.panel(std.stripChars('Write latency %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' ')) + ( + $.latencyPanelNativeHistogram( + 
std.stripChars('Write latency %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' '), if $._config.gateway_enabled then - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.write_http_routes_regex)]) + $.queries.gateway.writeRequestsPerSecondMetric + else + $.queries.distributor.writeRequestsPerSecondMetric, + if $._config.gateway_enabled then + $.queries.gateway.writeRequestsPerSecondSelector else - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|%s' % $.queries.write_http_routes_regex)]) + $.queries.distributor.writeRequestsPerSecondSelector ) ) .addPanel( diff --git a/operations/mimir-mixin/lib/grafana-builder/grafana.libsonnet b/operations/mimir-mixin/lib/grafana-builder/grafana.libsonnet index 7ba37976d9d..caae24952e0 100644 --- a/operations/mimir-mixin/lib/grafana-builder/grafana.libsonnet +++ b/operations/mimir-mixin/lib/grafana-builder/grafana.libsonnet @@ -596,7 +596,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, // Assumes that there is a dashboard variable named show_classic_histograms, values are 0 or 1 - latencyPanelNativeHistogram(metricName, selector, multiplier='1e3'):: { + latencyPanelNativeHistogram(title, metricName, selector, multiplier='1e3'):: $.timeseriesPanel(title) { nullPointMode: 'null as zero', targets: [ { @@ -614,7 +614,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; multiplier: multiplier, }, format: 'time_series', - legendFormat: '99th percentile', + legendFormat: 'Historic 99th percentile', refId: 'A_classic', }, { @@ -632,7 +632,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; multiplier: multiplier, }, format: 'time_series', - legendFormat: '50th percentile', + legendFormat: 'Historic 50th percentile', refId: 'B_classic', }, { @@ -660,7 +660,7 @@ local utils = import 
'mixin-utils/utils.libsonnet'; multiplier: multiplier, }, format: 'time_series', - legendFormat: 'Average', + legendFormat: 'Historic average', refId: 'C_classic', }, ],