From 77f08d82cc628307b4a374b1349998f8b6700750 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Thu, 9 May 2024 11:25:02 +0200 Subject: [PATCH] fix: mimir/loki and tempo recording rules to be able to work with Mimir (#1158) --- CHANGELOG.md | 1 + .../recording-rules/loki-mixins.rules.yml | 40 ++-- .../recording-rules/mimir-mixins.rules.yml | 218 +++++++++--------- .../recording-rules/tempo-mixins.rules.yml | 12 +- 4 files changed, 136 insertions(+), 135 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ec7d48308..03b078eca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Remove cilium entry from KAAS SLOs. +- Fix Loki/Mimir and Tempo mixins according to `pint` recommendations - Fix cilium related alerts for mimir. - Fix etcd alerts for Mimir. - Add missing labels for apiserver alerts. diff --git a/helm/prometheus-rules/templates/recording-rules/loki-mixins.rules.yml b/helm/prometheus-rules/templates/recording-rules/loki-mixins.rules.yml index fddb7a9d6..5b15f4f39 100644 --- a/helm/prometheus-rules/templates/recording-rules/loki-mixins.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/loki-mixins.rules.yml @@ -10,52 +10,52 @@ spec: - name: loki_rules rules: - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:loki_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) - by (cluster, job) + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster_id, 
provider, installation, pipeline, job) / sum(rate(loki_request_duration_seconds_count[1m])) + by (cluster_id, provider, installation, pipeline, job) record: cluster_job:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster_id, provider, installation, pipeline, job) record: cluster_job:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:loki_request_duration_seconds_count:sum_rate - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) + by (le, cluster_id, provider, installation, pipeline, job, route)) record: cluster_job_route:loki_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) + by (le, cluster_id, provider, installation, pipeline, job, route)) record: cluster_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, job, route) + / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster_id, provider, installation, pipeline, job, route) record: cluster_job_route:loki_request_duration_seconds:avg - - expr: 
sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster_id, provider, installation, pipeline, job, route) record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, job, route) record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster_id, provider, installation, pipeline, job, route) record: cluster_job_route:loki_request_duration_seconds_count:sum_rate - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) + by (le, cluster_id, provider, installation, pipeline, namespace, job, route)) record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) + by (le, cluster_id, provider, installation, pipeline, namespace, job, route)) record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, namespace, + job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster_id, provider, installation, pipeline, + namespace, job, route) record: cluster_namespace_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster_id, provider, installation, pipeline, namespace, job, 
route) record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, namespace, job, route) record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster_id, provider, installation, pipeline, namespace, job, route) record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate diff --git a/helm/prometheus-rules/templates/recording-rules/mimir-mixins.rules.yml b/helm/prometheus-rules/templates/recording-rules/mimir-mixins.rules.yml index b84038a46..d999f1df1 100644 --- a/helm/prometheus-rules/templates/recording-rules/mimir-mixins.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/mimir-mixins.rules.yml @@ -11,66 +11,66 @@ spec: - name: mimir_api_1 rules: - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_request_duration_seconds:50quantile - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) - by (cluster, job) + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, job) / sum(rate(cortex_request_duration_seconds_count[1m])) + by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_request_duration_seconds:avg - - 
expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job) + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job) + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_request_duration_seconds_count:sum_rate - name: mimir_api_2 rules: - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) + by (le, cluster_id, provider, installation, pipeline, job, route)) record: cluster_job_route:cortex_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) + by (le, cluster_id, provider, installation, pipeline, job, route)) record: cluster_job_route:cortex_request_duration_seconds:50quantile - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) - / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, job, route) + / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster_id, provider, installation, pipeline, job, route) record: cluster_job_route:cortex_request_duration_seconds:avg - - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster_id, provider, 
installation, pipeline, job, route) record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, job, route) record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster_id, provider, installation, pipeline, job, route) record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate - name: mimir_api_3 rules: - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) + by (le, cluster_id, provider, installation, pipeline, namespace, job, route)) record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) + by (le, cluster_id, provider, installation, pipeline, namespace, job, route)) record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, namespace, + job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster_id, provider, installation, pipeline, + namespace, job, route) record: cluster_namespace_job_route:cortex_request_duration_seconds:avg - - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster_id, provider, installation, pipeline, namespace, job, route) record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate - - expr: 
sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, namespace, job, route) record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster_id, provider, installation, pipeline, namespace, job, route) record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate - name: mimir_querier_api rules: - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_querier_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_querier_request_duration_seconds:50quantile - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, @@ -86,14 +86,14 @@ spec: job) record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) + by (le, cluster_id, provider, installation, pipeline, job, route)) record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) + by (le, cluster_id, provider, installation, pipeline, job, route)) record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile - - expr: 
sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by - (cluster, job, route) + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, + job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by + (cluster_id, provider, installation, pipeline, job, route) record: cluster_job_route:cortex_querier_request_duration_seconds:avg - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job, route) @@ -105,14 +105,14 @@ spec: job, route) record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) + by (le, cluster_id, provider, installation, pipeline, namespace, job, route)) record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) + by (le, cluster_id, provider, installation, pipeline, namespace, job, route)) record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) - by (cluster, namespace, job, route) + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, + namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) + by (cluster_id, provider, installation, pipeline, namespace, job, route) record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) @@ -126,14 +126,14 @@ spec: - name: mimir_cache rules: - expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) - by (le, cluster, job, method)) + by (le, cluster_id, provider, installation, pipeline, job, method)) record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, 
sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) - by (le, cluster, job, method)) + by (le, cluster_id, provider, installation, pipeline, job, method)) record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile - - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, - job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m])) - by (cluster, job, method) + - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, + job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m])) + by (cluster_id, provider, installation, pipeline, job, method) record: cluster_job_method:cortex_memcache_request_duration_seconds:avg - expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method) @@ -145,36 +145,36 @@ spec: job, method) record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_cache_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_cache_request_duration_seconds:50quantile - - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) - / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job) + - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, job) + / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_cache_request_duration_seconds:avg - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job) record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by 
(cluster, job) + - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job) record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) - by (le, cluster, job, method)) + by (le, cluster_id, provider, installation, pipeline, job, method)) record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) - by (le, cluster, job, method)) + by (le, cluster_id, provider, installation, pipeline, job, method)) record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile - - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, - method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, - job, method) + - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, job, + method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster_id, provider, installation, pipeline, + job, method) record: cluster_job_method:cortex_cache_request_duration_seconds:avg - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method) record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, + - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, job, method) record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, @@ -183,50 +183,50 @@ spec: - name: mimir_storage rules: - expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) - by (le, cluster, 
job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_kv_request_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_kv_request_duration_seconds:50quantile - - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) - / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) + - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, job) + / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_kv_request_duration_seconds:avg - expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job) record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) + - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) + - expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate - name: mimir_queries rules: - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_query_frontend_retries:99quantile - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: 
cluster_job:cortex_query_frontend_retries:50quantile - - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m])) - by (cluster, job) + - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster_id, provider, installation, pipeline, job) / sum(rate(cortex_query_frontend_retries_count[1m])) + by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_query_frontend_retries:avg - - expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job) + - expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate - - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) + - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_query_frontend_retries_sum:sum_rate - - expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job) + - expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_query_frontend_retries_count:sum_rate - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, - job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by - (cluster, job) + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, + job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by + (cluster_id, provider, 
installation, pipeline, job) record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, - cluster, job) + cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job) @@ -237,82 +237,82 @@ spec: - name: mimir_ingester_queries rules: - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_ingester_queried_series:99quantile - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_ingester_queried_series:50quantile - - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) - by (cluster, job) + - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster_id, provider, installation, pipeline, job) / sum(rate(cortex_ingester_queried_series_count[1m])) + by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_ingester_queried_series:avg - - expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job) + - expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate - - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) + - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_ingester_queried_series_sum:sum_rate - - expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, 
job) + - expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_ingester_queried_series_count:sum_rate - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_ingester_queried_samples:99quantile - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_ingester_queried_samples:50quantile - - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) - by (cluster, job) + - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster_id, provider, installation, pipeline, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) + by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_ingester_queried_samples:avg - - expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job) + - expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate - - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) + - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate - - expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job) + - expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_ingester_queried_samples_count:sum_rate - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) - by 
(le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_ingester_queried_exemplars:99quantile - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) - by (le, cluster, job)) + by (le, cluster_id, provider, installation, pipeline, job)) record: cluster_job:cortex_ingester_queried_exemplars:50quantile - - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) / - sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) + - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster_id, provider, installation, pipeline, job) / + sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_ingester_queried_exemplars:avg - expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, job) record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate - - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) + - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate - - expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) + - expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster_id, provider, installation, pipeline, job) record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate - name: mimir_received_samples rules: - expr: | - sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m])) + sum by (cluster_id, provider, installation, pipeline, namespace, job) (rate(cortex_distributor_received_samples_total[5m])) record: cluster_namespace_job:cortex_distributor_received_samples:rate5m - name: mimir_exemplars_in rules: - expr: | - sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m])) + 
sum by (cluster_id, provider, installation, pipeline, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m])) record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m - name: mimir_received_exemplars rules: - expr: | - sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m])) + sum by (cluster_id, provider, installation, pipeline, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m])) record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m - name: mimir_exemplars_ingested rules: - expr: | - sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m])) + sum by (cluster_id, provider, installation, pipeline, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m])) record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m - name: mimir_exemplars_appended rules: - expr: | - sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m])) + sum by (cluster_id, provider, installation, pipeline, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m])) record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m - name: mimir_scaling_rules rules: - expr: | # Convenience rule to get the number of replicas for both a deployment and a statefulset. # Multi-zone deployments are grouped together removing the "zone-X" suffix. 
- sum by (cluster, namespace, deployment) ( + sum by (cluster_id, provider, installation, pipeline, namespace, deployment) ( label_replace( kube_deployment_spec_replicas, # The question mark in "(.*?)" is used to make it non-greedy, otherwise it @@ -321,14 +321,14 @@ spec: ) ) or - sum by (cluster, namespace, deployment) ( + sum by (cluster_id, provider, installation, pipeline, namespace, deployment) ( label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?") ) record: cluster_namespace_deployment:actual_replicas:count - expr: | ceil( quantile_over_time(0.99, - sum by (cluster, namespace) ( + sum by (cluster_id, provider, installation, pipeline, namespace) ( cluster_namespace_job:cortex_distributor_received_samples:rate5m )[24h:] ) @@ -340,7 +340,7 @@ spec: record: cluster_namespace_deployment_reason:required_replicas:count - expr: | ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) + sum by (cluster_id, provider, installation, pipeline, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) * 0.59999999999999998 / 240000 ) labels: @@ -350,7 +350,7 @@ spec: - expr: | ceil( quantile_over_time(0.99, - sum by (cluster, namespace) ( + sum by (cluster_id, provider, installation, pipeline, namespace) ( cluster_namespace_job:cortex_distributor_received_samples:rate5m )[24h:] ) @@ -363,7 +363,7 @@ spec: - expr: | ceil( quantile_over_time(0.99, - sum by(cluster, namespace) ( + sum by(cluster_id, provider, installation, pipeline, namespace) ( cortex_ingester_memory_series )[24h:] ) @@ -375,7 +375,7 @@ spec: record: cluster_namespace_deployment_reason:required_replicas:count - expr: | ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"}) + sum by (cluster_id, provider, installation, pipeline, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"}) * 3 * 0.59999999999999998 / 1500000 ) labels: @@ -384,7 +384,7 @@ 
spec: record: cluster_namespace_deployment_reason:required_replicas:count - expr: | ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) + sum by (cluster_id, provider, installation, pipeline, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) * 0.59999999999999998 / 80000 ) labels: @@ -393,11 +393,11 @@ spec: record: cluster_namespace_deployment_reason:required_replicas:count - expr: | ceil( - (sum by (cluster, namespace) ( + (sum by (cluster_id, provider, installation, pipeline, namespace) ( cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"} ) / 4) / - avg by (cluster, namespace) ( + avg by (cluster_id, provider, installation, pipeline, namespace) ( memcached_limit_bytes{job=~".+/memcached"} ) ) @@ -406,7 +406,7 @@ spec: reason: active_series record: cluster_namespace_deployment_reason:required_replicas:count - expr: | - sum by (cluster, namespace, deployment) ( + sum by (cluster_id, provider, installation, pipeline, namespace, deployment) ( label_replace( label_replace( node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate, @@ -429,7 +429,7 @@ spec: # This is the old expression, compatible with kube-state-metrics < v2.0.0, # where kube_pod_container_resource_requests_cpu_cores was removed: ( - sum by (cluster, namespace, deployment) ( + sum by (cluster_id, provider, installation, pipeline, namespace, deployment) ( label_replace( label_replace( kube_pod_container_resource_requests_cpu_cores, @@ -445,7 +445,7 @@ spec: # This expression is compatible with kube-state-metrics >= v1.4.0, # where kube_pod_container_resource_requests was introduced. ( - sum by (cluster, namespace, deployment) ( + sum by (cluster_id, provider, installation, pipeline, namespace, deployment) ( label_replace( label_replace( kube_pod_container_resource_requests{resource="cpu"}, @@ -475,7 +475,7 @@ spec: - expr: | # Convenience rule to get the Memory utilization for both a deployment and a statefulset. 
# Multi-zone deployments are grouped together removing the "zone-X" suffix. - sum by (cluster, namespace, deployment) ( + sum by (cluster_id, provider, installation, pipeline, namespace, deployment) ( label_replace( label_replace( container_memory_usage_bytes{image!=""}, @@ -498,7 +498,7 @@ spec: # This is the old expression, compatible with kube-state-metrics < v2.0.0, # where kube_pod_container_resource_requests_memory_bytes was removed: ( - sum by (cluster, namespace, deployment) ( + sum by (cluster_id, provider, installation, pipeline, namespace, deployment) ( label_replace( label_replace( kube_pod_container_resource_requests_memory_bytes, @@ -514,7 +514,7 @@ spec: # This expression is compatible with kube-state-metrics >= v1.4.0, # where kube_pod_container_resource_requests was introduced. ( - sum by (cluster, namespace, deployment) ( + sum by (cluster_id, provider, installation, pipeline, namespace, deployment) ( label_replace( label_replace( kube_pod_container_resource_requests{resource="memory"}, @@ -544,38 +544,38 @@ spec: - name: mimir_alertmanager_rules rules: - expr: | - sum by (cluster, job, pod) (cortex_alertmanager_alerts) + sum by (cluster_id, provider, installation, pipeline, job, pod) (cortex_alertmanager_alerts) record: cluster_job_pod:cortex_alertmanager_alerts:sum - expr: | - sum by (cluster, job, pod) (cortex_alertmanager_silences) + sum by (cluster_id, provider, installation, pipeline, job, pod) (cortex_alertmanager_silences) record: cluster_job_pod:cortex_alertmanager_silences:sum - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m])) + sum by (cluster_id, provider, installation, pipeline, job) (rate(cortex_alertmanager_alerts_received_total[5m])) record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m])) + sum by (cluster_id, provider, installation, pipeline, job) 
(rate(cortex_alertmanager_alerts_invalid_total[5m])) record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m - expr: | - sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) + sum by (cluster_id, provider, installation, pipeline, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m - expr: | - sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) + sum by (cluster_id, provider, installation, pipeline, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m])) + sum by (cluster_id, provider, installation, pipeline, job) (rate(cortex_alertmanager_state_replication_total[5m])) record: cluster_job:cortex_alertmanager_state_replication_total:rate5m - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) + sum by (cluster_id, provider, installation, pipeline, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m])) + sum by (cluster_id, provider, installation, pipeline, job) (rate(cortex_alertmanager_partial_state_merges_total[5m])) record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) + sum by (cluster_id, provider, installation, pipeline, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m - name: mimir_ingester_rules rules: - expr: | - sum by(cluster, namespace, 
pod) (rate(cortex_ingester_ingested_samples_total[1m])) + sum by(cluster_id, provider, installation, pipeline, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m])) record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m {{- end }} diff --git a/helm/prometheus-rules/templates/recording-rules/tempo-mixins.rules.yml b/helm/prometheus-rules/templates/recording-rules/tempo-mixins.rules.yml index 966175ae2..32c8d38db 100644 --- a/helm/prometheus-rules/templates/recording-rules/tempo-mixins.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/tempo-mixins.rules.yml @@ -9,15 +9,15 @@ spec: groups: - name: tempo_rules rules: - - expr: "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))" + - expr: "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster_id, provider, installation, pipeline, namespace, job, route))" record: "cluster_namespace_job_route:tempo_request_duration_seconds:99quantile" - - expr: "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))" + - expr: "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster_id, provider, installation, pipeline, namespace, job, route))" record: "cluster_namespace_job_route:tempo_request_duration_seconds:50quantile" - - expr: "sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)" + - expr: "sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster_id, provider, installation, pipeline, namespace, job, route)" record: "cluster_namespace_job_route:tempo_request_duration_seconds:avg" - - expr: 
"sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)" + - expr: "sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster_id, provider, installation, pipeline, namespace, job, route)" record: "cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate" - - expr: "sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)" + - expr: "sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster_id, provider, installation, pipeline, namespace, job, route)" record: "cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate" - - expr: "sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)" + - expr: "sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster_id, provider, installation, pipeline, namespace, job, route)" record: "cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate"