From 3aa58300251c24ac562d5ccfc669f7a6b57c20af Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 28 Jul 2021 14:06:00 +0200 Subject: [PATCH 1/3] Fix scaling dashboard to work on multi-zone ingesters Signed-off-by: Marco Pracucci --- CHANGELOG.md | 1 + cortex-mixin/recording_rules.libsonnet | 63 +++++++++++++++++++------- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4525e2ea..dfc25038 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335 +* [BUGFIX] Fixed scaling dashboard to work correctly when a Cortex service deployment spans multiple zones (each zone's deployment or statefulset name is expected to end with the `-zone-[a-z]` suffix). #365 ## 1.9.0 / 2021-05-18 diff --git a/cortex-mixin/recording_rules.libsonnet b/cortex-mixin/recording_rules.libsonnet index 00c7d701..c7034cd8 100644 --- a/cortex-mixin/recording_rules.libsonnet +++ b/cortex-mixin/recording_rules.libsonnet @@ -69,12 +69,21 @@ local utils = import 'mixin-utils/utils.libsonnet'; rules: [ { // Convenience rule to get the number of replicas for both a deployment and a statefulset. + // Multi-zone deployments are grouped together removing the "zone-X" suffix. record: 'cluster_namespace_deployment:actual_replicas:count', expr: ||| - sum by (cluster, namespace, deployment) (kube_deployment_spec_replicas) - or sum by (cluster, namespace, deployment) ( - label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*)") + label_replace( + kube_deployment_spec_replicas, + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" 
+ ) + ) + or + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*)"), + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) ) |||, }, @@ -188,7 +197,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; expr: ||| ceil( (sum by (cluster, namespace) ( - cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester"} + cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"} ) / 4) / avg by (cluster, namespace) ( @@ -199,18 +208,23 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, { // Convenience rule to get the CPU utilization for both a deployment and a statefulset. + // Multi-zone deployments are grouped together removing the "zone-X" suffix. record: 'cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate', expr: ||| sum by (cluster, namespace, deployment) ( label_replace( - node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + label_replace( + node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) |||, }, { // Convenience rule to get the CPU request for both a deployment and a statefulset. + // Multi-zone deployments are grouped together removing the "zone-X" suffix. 
record: 'cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum', expr: ||| # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 @@ -223,8 +237,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; ( sum by (cluster, namespace, deployment) ( label_replace( - kube_pod_container_resource_requests_cpu_cores, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + label_replace( + kube_pod_container_resource_requests_cpu_cores, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) ) @@ -234,8 +251,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; ( sum by (cluster, namespace, deployment) ( label_replace( - kube_pod_container_resource_requests{resource="cpu"}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + label_replace( + kube_pod_container_resource_requests{resource="cpu"}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) ) @@ -261,18 +281,23 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, { // Convenience rule to get the Memory utilization for both a deployment and a statefulset. + // Multi-zone deployments are grouped together removing the "zone-X" suffix. record: 'cluster_namespace_deployment:container_memory_usage_bytes:sum', expr: ||| sum by (cluster, namespace, deployment) ( label_replace( - container_memory_usage_bytes, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + label_replace( + container_memory_usage_bytes, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) |||, }, { // Convenience rule to get the Memory request for both a deployment and a statefulset. 
+ // Multi-zone deployments are grouped together removing the "zone-X" suffix. record: 'cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum', expr: ||| # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 @@ -285,8 +310,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; ( sum by (cluster, namespace, deployment) ( label_replace( - kube_pod_container_resource_requests_memory_bytes, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + label_replace( + kube_pod_container_resource_requests_memory_bytes, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) ) @@ -296,8 +324,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; ( sum by (cluster, namespace, deployment) ( label_replace( - kube_pod_container_resource_requests{resource="memory"}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + label_replace( + kube_pod_container_resource_requests{resource="memory"}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" 
) ) ) From 65ac104f37ae53dc6bce8ff8b4fb0b4e436d2baf Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 28 Jul 2021 14:58:04 +0200 Subject: [PATCH 2/3] Simplified cluster_namespace_deployment:actual_replicas:count recording rule Signed-off-by: Marco Pracucci --- cortex-mixin/recording_rules.libsonnet | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cortex-mixin/recording_rules.libsonnet b/cortex-mixin/recording_rules.libsonnet index c7034cd8..e9606486 100644 --- a/cortex-mixin/recording_rules.libsonnet +++ b/cortex-mixin/recording_rules.libsonnet @@ -80,10 +80,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) or sum by (cluster, namespace, deployment) ( - label_replace( - label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*)"), - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" - ) + label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?") ) |||, }, From 28561cbd1b8cbf1fd5a4b5adca7aa7f60c4e6b9b Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 28 Jul 2021 16:08:52 +0200 Subject: [PATCH 3/3] Added a comment to explain '.*?' Signed-off-by: Marco Pracucci --- cortex-mixin/recording_rules.libsonnet | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/cortex-mixin/recording_rules.libsonnet b/cortex-mixin/recording_rules.libsonnet index e9606486..433fa8e6 100644 --- a/cortex-mixin/recording_rules.libsonnet +++ b/cortex-mixin/recording_rules.libsonnet @@ -75,6 +75,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; sum by (cluster, namespace, deployment) ( label_replace( kube_deployment_spec_replicas, + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" 
) ) @@ -214,6 +216,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) @@ -238,6 +242,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; kube_pod_container_resource_requests_cpu_cores, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) @@ -252,6 +258,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; kube_pod_container_resource_requests{resource="cpu"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) @@ -287,6 +295,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; container_memory_usage_bytes, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) @@ -311,6 +321,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; kube_pod_container_resource_requests_memory_bytes, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. 
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) @@ -325,6 +337,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; kube_pod_container_resource_requests{resource="memory"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) )