From 6c3f34f2dde46ccba7630da9c0c0bac5a2e5956e Mon Sep 17 00:00:00 2001 From: Stephan Hesselmann Date: Tue, 3 Oct 2023 15:11:36 +0200 Subject: [PATCH 1/2] fix(sli): allow `Unknown` status code in SLI error rate (#149) --- resources/prometheus/prometheus-rules.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 0c9c2533..86d62a4b 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -351,9 +351,10 @@ spec: # The error rate over the last 10 minutes must be smaller than 35% to count as available. # GRPC + # TODO(ROX-19917): Re-add `grpc_code="Unknown"` to the list of server errors. - expr: | sum by (namespace, rhacs_instance_id, rhacs_org_id, rhacs_org_name, rhacs_cluster_name, rhacs_environment) - (rate(grpc_server_handled_total{namespace=~"rhacs-.*", job="central", grpc_type="unary", grpc_service!="v1.PingService", grpc_code!~"DeadlineExceeded|Internal|Unavailable|Unknown"}[10m])) + (rate(grpc_server_handled_total{namespace=~"rhacs-.*", job="central", grpc_type="unary", grpc_service!="v1.PingService", grpc_code!~"DeadlineExceeded|Internal|Unavailable"}[10m])) record: central:grpc_server_handled:server_available_code:rate10m - expr: | From 94d5c7259837a713e121bc74f9212025d30bb00d Mon Sep 17 00:00:00 2001 From: Marcin Owsiany Date: Tue, 10 Oct 2023 13:43:48 +0200 Subject: [PATCH 2/2] Ignore openshift-logging and improve setup (#140) * Improve instructions and Makefile * Ignore openshift-logging. * make generate --- README.md | 4 + resources/mixins/kubernetes/Makefile | 8 +- .../mixins/kubernetes/generated/alerts.yml | 146 +++++++++--------- resources/mixins/kubernetes/mixin.libsonnet | 2 +- .../prometheus/kubernetes-mixin-alerts.yaml | 146 +++++++++--------- 5 files changed, 157 insertions(+), 149 deletions(-) diff --git a/README.md b/README.md index cf1f7249..aec9924a 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,10 @@ To make changes to the rhacs dashboards: * Run `make generate` to generate the corresponding resources for the Grafana operator. To make changes to Kubernetes mixin resources: + +First, make sure you have `go-jsonnet` and `jsonnet-bundler` installed. + +Then: * Update `resources/mixins/kubernetes/mixin.libsonnet`. * Run `make generate` to generate the corresponding mixin resources. diff --git a/resources/mixins/kubernetes/Makefile b/resources/mixins/kubernetes/Makefile index 3b053837..218fd04d 100644 --- a/resources/mixins/kubernetes/Makefile +++ b/resources/mixins/kubernetes/Makefile @@ -1,5 +1,9 @@ .PHONY: update -update: update-jb generate +update: install update-jb generate + +.PHONY: install +install: + @jb install github.com/kubernetes-monitoring/kubernetes-mixin .PHONY: update-jb update-jb: @@ -16,4 +20,4 @@ dashboards: templates/dashboards/* @scripts/generate-dashboards.sh .PHONY: generate -generate: alerts dashboards +generate: install alerts dashboards diff --git a/resources/mixins/kubernetes/generated/alerts.yml b/resources/mixins/kubernetes/generated/alerts.yml index ec2e39be..7a270a49 100644 --- a/resources/mixins/kubernetes/generated/alerts.yml +++ b/resources/mixins/kubernetes/generated/alerts.yml @@ -7,7 +7,7 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping" "summary": "Pod is crash looping." "expr": | - max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[5m]) >= 1 + max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[5m]) >= 1 "for": "15m" "labels": "severity": "warning" @@ -20,7 +20,7 @@ "expr": | sum by (namespace, pod, cluster) ( max by(namespace, pod, cluster) ( - kube_pod_status_phase{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", phase=~"Pending|Unknown|Failed"} + kube_pod_status_phase{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", phase=~"Pending|Unknown|Failed"} ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) ( 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) ) @@ -35,9 +35,9 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch" "summary": "Deployment generation mismatch due to possible roll-back" "expr": | - kube_deployment_status_observed_generation{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_deployment_status_observed_generation{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != - kube_deployment_metadata_generation{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_deployment_metadata_generation{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} "for": "15m" "labels": "severity": "warning" @@ -49,11 +49,11 @@ "summary": "Deployment has not matched the expected number of replicas." "expr": | ( - kube_deployment_spec_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_deployment_spec_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > - kube_deployment_status_replicas_available{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_deployment_status_replicas_available{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} ) and ( - changes(kube_deployment_status_replicas_updated{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[10m]) + changes(kube_deployment_status_replicas_updated{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[10m]) == 0 ) @@ -67,7 +67,7 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentrolloutstuck" "summary": "Deployment rollout is not progressing." "expr": | - kube_deployment_status_condition{condition="Progressing", status="false",namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_deployment_status_condition{condition="Progressing", status="false",namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != 0 "for": "15m" "labels": @@ -80,11 +80,11 @@ "summary": "StatefulSet has not matched the expected number of replicas." "expr": | ( - kube_statefulset_status_replicas_ready{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_statefulset_status_replicas_ready{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != - kube_statefulset_status_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_statefulset_status_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} ) and ( - changes(kube_statefulset_status_replicas_updated{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[10m]) + changes(kube_statefulset_status_replicas_updated{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[10m]) == 0 ) @@ -98,9 +98,9 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch" "summary": "StatefulSet generation mismatch due to possible roll-back" "expr": | - kube_statefulset_status_observed_generation{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_statefulset_status_observed_generation{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != - kube_statefulset_metadata_generation{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_statefulset_metadata_generation{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} "for": "15m" "labels": "severity": "warning" @@ -113,18 +113,18 @@ "expr": | ( max without (revision) ( - kube_statefulset_status_current_revision{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_statefulset_status_current_revision{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} unless - kube_statefulset_status_update_revision{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_statefulset_status_update_revision{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} ) * ( - kube_statefulset_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_statefulset_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != - kube_statefulset_status_replicas_updated{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_statefulset_status_replicas_updated{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} ) ) and ( - changes(kube_statefulset_status_replicas_updated{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[5m]) + changes(kube_statefulset_status_replicas_updated{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[5m]) == 0 ) @@ -140,24 +140,24 @@ "expr": | ( ( - kube_daemonset_status_current_number_scheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_daemonset_status_current_number_scheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != - kube_daemonset_status_desired_number_scheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_daemonset_status_desired_number_scheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} ) or ( - kube_daemonset_status_number_misscheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_daemonset_status_number_misscheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != 0 ) or ( - kube_daemonset_status_updated_number_scheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_daemonset_status_updated_number_scheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != - kube_daemonset_status_desired_number_scheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_daemonset_status_desired_number_scheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} ) or ( - kube_daemonset_status_number_available{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_daemonset_status_number_available{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != - kube_daemonset_status_desired_number_scheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_daemonset_status_desired_number_scheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} ) ) and ( - changes(kube_daemonset_status_updated_number_scheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[5m]) + changes(kube_daemonset_status_updated_number_scheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[5m]) == 0 ) @@ -171,7 +171,7 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting" "summary": "Pod container waiting longer than 1 hour" "expr": | - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}) > 0 + sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}) > 0 "for": "1h" "labels": "severity": "warning" @@ -182,9 +182,9 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled" "summary": "DaemonSet pods are not scheduled." "expr": | - kube_daemonset_status_desired_number_scheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_daemonset_status_desired_number_scheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} - - kube_daemonset_status_current_number_scheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0 + kube_daemonset_status_current_number_scheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0 "for": "10m" "labels": "severity": "warning" @@ -195,7 +195,7 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled" "summary": "DaemonSet pods are misscheduled." "expr": | - kube_daemonset_status_number_misscheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0 + kube_daemonset_status_number_misscheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0 "for": "15m" "labels": "severity": "warning" @@ -206,9 +206,9 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobnotcompleted" "summary": "Job did not complete in time" "expr": | - time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} and - kube_job_status_active{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0) > 43200 + kube_job_status_active{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0) > 43200 "labels": "severity": "warning" "source": "mixin/kubernetes" @@ -218,7 +218,7 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed" "summary": "Job failed to complete." "expr": | - kube_job_failed{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0 + kube_job_failed{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0 "for": "15m" "labels": "severity": "warning" @@ -229,19 +229,19 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch" "summary": "HPA has not matched desired number of replicas." "expr": | - (kube_horizontalpodautoscaler_status_desired_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + (kube_horizontalpodautoscaler_status_desired_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != - kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}) + kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}) and - (kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + (kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > - kube_horizontalpodautoscaler_spec_min_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}) + kube_horizontalpodautoscaler_spec_min_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}) and - (kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + (kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} < - kube_horizontalpodautoscaler_spec_max_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}) + kube_horizontalpodautoscaler_spec_max_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}) and - changes(kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[15m]) == 0 + changes(kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[15m]) == 0 "for": "15m" "labels": "severity": "warning" @@ -252,9 +252,9 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout" "summary": "HPA is running at max replicas" "expr": | - kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} == - kube_horizontalpodautoscaler_spec_max_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_horizontalpodautoscaler_spec_max_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} "for": "15m" "labels": "severity": "info" @@ -293,7 +293,7 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit" "summary": "Cluster has overcommitted CPU resource requests." "expr": | - sum(min without(resource) (kube_resourcequota{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) + sum(min without(resource) (kube_resourcequota{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) / sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) > 1.5 @@ -307,7 +307,7 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit" "summary": "Cluster has overcommitted memory resource requests." "expr": | - sum(min without(resource) (kube_resourcequota{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) + sum(min without(resource) (kube_resourcequota{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) / sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) > 1.5 @@ -321,9 +321,9 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull" "summary": "Namespace quota is going to be full." "expr": | - kube_resourcequota{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="used"} + kube_resourcequota{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="used"} / ignoring(instance, job, type) - (kube_resourcequota{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard"} > 0) + (kube_resourcequota{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard"} > 0) > 0.9 < 1 "for": "15m" "labels": @@ -335,9 +335,9 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused" "summary": "Namespace quota is fully used." "expr": | - kube_resourcequota{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="used"} + kube_resourcequota{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="used"} / ignoring(instance, job, type) - (kube_resourcequota{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard"} > 0) + (kube_resourcequota{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard"} > 0) == 1 "for": "15m" "labels": @@ -349,9 +349,9 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded" "summary": "Namespace quota has exceeded the limits." "expr": | - kube_resourcequota{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="used"} + kube_resourcequota{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="used"} / ignoring(instance, job, type) - (kube_resourcequota{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard"} > 0) + (kube_resourcequota{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard"} > 0) > 1 "for": "15m" "labels": @@ -380,16 +380,16 @@ "summary": "PersistentVolume is filling up." "expr": | ( - kubelet_volume_stats_available_bytes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} + kubelet_volume_stats_available_bytes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} / - kubelet_volume_stats_capacity_bytes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} + kubelet_volume_stats_capacity_bytes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} ) < 0.03 and - kubelet_volume_stats_used_bytes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} > 0 + kubelet_volume_stats_used_bytes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} > 0 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*", access_mode="ReadOnlyMany"} == 1 + kube_persistentvolumeclaim_access_mode{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*", access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",label_excluded_from_alerts="true"} == 1 + kube_persistentvolumeclaim_labels{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",label_excluded_from_alerts="true"} == 1 "for": "1m" "labels": "severity": "critical" @@ -401,18 +401,18 @@ "summary": "PersistentVolume is filling up." "expr": | ( - kubelet_volume_stats_available_bytes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} + kubelet_volume_stats_available_bytes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} / - kubelet_volume_stats_capacity_bytes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} + kubelet_volume_stats_capacity_bytes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} ) < 0.15 and - kubelet_volume_stats_used_bytes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} > 0 + kubelet_volume_stats_used_bytes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} > 0 and - predict_linear(kubelet_volume_stats_available_bytes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"}[6h], 4 * 24 * 3600) < 0 + predict_linear(kubelet_volume_stats_available_bytes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*", access_mode="ReadOnlyMany"} == 1 + kube_persistentvolumeclaim_access_mode{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*", access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",label_excluded_from_alerts="true"} == 1 + kube_persistentvolumeclaim_labels{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",label_excluded_from_alerts="true"} == 1 "for": "1h" "labels": "severity": "warning" @@ -424,16 +424,16 @@ "summary": "PersistentVolumeInodes are filling up." "expr": | ( - kubelet_volume_stats_inodes_free{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} + kubelet_volume_stats_inodes_free{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} / - kubelet_volume_stats_inodes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} + kubelet_volume_stats_inodes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} ) < 0.03 and - kubelet_volume_stats_inodes_used{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} > 0 + kubelet_volume_stats_inodes_used{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} > 0 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*", access_mode="ReadOnlyMany"} == 1 + kube_persistentvolumeclaim_access_mode{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*", access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",label_excluded_from_alerts="true"} == 1 + kube_persistentvolumeclaim_labels{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",label_excluded_from_alerts="true"} == 1 "for": "1m" "labels": "severity": "critical" @@ -445,18 +445,18 @@ "summary": "PersistentVolumeInodes are filling up." "expr": | ( - kubelet_volume_stats_inodes_free{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} + kubelet_volume_stats_inodes_free{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} / - kubelet_volume_stats_inodes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} + kubelet_volume_stats_inodes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} ) < 0.15 and - kubelet_volume_stats_inodes_used{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} > 0 + kubelet_volume_stats_inodes_used{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} > 0 and - predict_linear(kubelet_volume_stats_inodes_free{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"}[6h], 4 * 24 * 3600) < 0 + predict_linear(kubelet_volume_stats_inodes_free{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*", access_mode="ReadOnlyMany"} == 1 + kube_persistentvolumeclaim_access_mode{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*", access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",label_excluded_from_alerts="true"} == 1 + kube_persistentvolumeclaim_labels{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",label_excluded_from_alerts="true"} == 1 "for": "1h" "labels": "severity": "warning" @@ -467,7 +467,7 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors" "summary": "PersistentVolume is having issues with provisioning." "expr": | - kube_persistentvolume_status_phase{phase=~"Failed|Pending",namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0 + kube_persistentvolume_status_phase{phase=~"Failed|Pending",namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0 "for": "5m" "labels": "severity": "critical" diff --git a/resources/mixins/kubernetes/mixin.libsonnet b/resources/mixins/kubernetes/mixin.libsonnet index 80c5df2c..52bd2495 100644 --- a/resources/mixins/kubernetes/mixin.libsonnet +++ b/resources/mixins/kubernetes/mixin.libsonnet @@ -10,7 +10,7 @@ kubernetes { kubeApiserverSelector: 'job="api"', kubeProxySelector: 'job="machine-config-daemon"', kubeSchedulerSelector: 'job="scheduler"', - namespaceSelector: 'namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*"', + namespaceSelector: 'namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*"', }, } + { // Customize alert labels. diff --git a/resources/prometheus/kubernetes-mixin-alerts.yaml b/resources/prometheus/kubernetes-mixin-alerts.yaml index 79d5c7c4..fedcccfe 100644 --- a/resources/prometheus/kubernetes-mixin-alerts.yaml +++ b/resources/prometheus/kubernetes-mixin-alerts.yaml @@ -14,7 +14,7 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping" "summary": "Pod is crash looping." "expr": | - max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[5m]) >= 1 + max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[5m]) >= 1 "for": "15m" "labels": "severity": "warning" @@ -27,7 +27,7 @@ spec: "expr": | sum by (namespace, pod, cluster) ( max by(namespace, pod, cluster) ( - kube_pod_status_phase{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", phase=~"Pending|Unknown|Failed"} + kube_pod_status_phase{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", phase=~"Pending|Unknown|Failed"} ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) ( 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) ) @@ -42,9 +42,9 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch" "summary": "Deployment generation mismatch due to possible roll-back" "expr": | - kube_deployment_status_observed_generation{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_deployment_status_observed_generation{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != - kube_deployment_metadata_generation{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_deployment_metadata_generation{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} "for": "15m" "labels": "severity": "warning" @@ -56,11 +56,11 @@ spec: "summary": "Deployment has not matched the expected number of replicas." "expr": | ( - kube_deployment_spec_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_deployment_spec_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > - kube_deployment_status_replicas_available{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_deployment_status_replicas_available{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} ) and ( - changes(kube_deployment_status_replicas_updated{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[10m]) + changes(kube_deployment_status_replicas_updated{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[10m]) == 0 ) @@ -74,7 +74,7 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentrolloutstuck" "summary": "Deployment rollout is not progressing." "expr": | - kube_deployment_status_condition{condition="Progressing", status="false",namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_deployment_status_condition{condition="Progressing", status="false",namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != 0 "for": "15m" "labels": @@ -87,11 +87,11 @@ spec: "summary": "StatefulSet has not matched the expected number of replicas." "expr": | ( - kube_statefulset_status_replicas_ready{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_statefulset_status_replicas_ready{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != - kube_statefulset_status_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_statefulset_status_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} ) and ( - changes(kube_statefulset_status_replicas_updated{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[10m]) + changes(kube_statefulset_status_replicas_updated{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[10m]) == 0 ) @@ -105,9 +105,9 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch" "summary": "StatefulSet generation mismatch due to possible roll-back" "expr": | - kube_statefulset_status_observed_generation{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_statefulset_status_observed_generation{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != - kube_statefulset_metadata_generation{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_statefulset_metadata_generation{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} "for": "15m" "labels": "severity": "warning" @@ -120,18 +120,18 @@ spec: "expr": | ( max without (revision) ( - kube_statefulset_status_current_revision{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_statefulset_status_current_revision{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} unless - kube_statefulset_status_update_revision{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_statefulset_status_update_revision{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} ) * ( - kube_statefulset_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_statefulset_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != - kube_statefulset_status_replicas_updated{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_statefulset_status_replicas_updated{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} ) ) and ( - changes(kube_statefulset_status_replicas_updated{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[5m]) + changes(kube_statefulset_status_replicas_updated{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[5m]) == 0 ) @@ -147,24 +147,24 @@ spec: "expr": | ( ( - kube_daemonset_status_current_number_scheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_daemonset_status_current_number_scheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != - kube_daemonset_status_desired_number_scheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_daemonset_status_desired_number_scheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} ) or ( - kube_daemonset_status_number_misscheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_daemonset_status_number_misscheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != 0 ) or ( - kube_daemonset_status_updated_number_scheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_daemonset_status_updated_number_scheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != - kube_daemonset_status_desired_number_scheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_daemonset_status_desired_number_scheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} ) or ( - kube_daemonset_status_number_available{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_daemonset_status_number_available{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != - kube_daemonset_status_desired_number_scheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_daemonset_status_desired_number_scheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} ) ) and ( - changes(kube_daemonset_status_updated_number_scheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[5m]) + changes(kube_daemonset_status_updated_number_scheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[5m]) == 0 ) @@ -178,7 +178,7 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting" "summary": "Pod container waiting longer than 1 hour" "expr": | - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}) > 0 + sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}) > 0 "for": "1h" "labels": "severity": "warning" @@ -189,9 +189,9 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled" "summary": "DaemonSet pods are not scheduled." "expr": | - kube_daemonset_status_desired_number_scheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_daemonset_status_desired_number_scheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} - - kube_daemonset_status_current_number_scheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0 + kube_daemonset_status_current_number_scheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0 "for": "10m" "labels": "severity": "warning" @@ -202,7 +202,7 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled" "summary": "DaemonSet pods are misscheduled." "expr": | - kube_daemonset_status_number_misscheduled{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0 + kube_daemonset_status_number_misscheduled{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0 "for": "15m" "labels": "severity": "warning" @@ -213,9 +213,9 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobnotcompleted" "summary": "Job did not complete in time" "expr": | - time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} and - kube_job_status_active{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0) > 43200 + kube_job_status_active{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0) > 43200 "labels": "severity": "warning" "source": "mixin/kubernetes" @@ -225,7 +225,7 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed" "summary": "Job failed to complete." "expr": | - kube_job_failed{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0 + kube_job_failed{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0 "for": "15m" "labels": "severity": "warning" @@ -236,19 +236,19 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch" "summary": "HPA has not matched desired number of replicas." "expr": | - (kube_horizontalpodautoscaler_status_desired_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + (kube_horizontalpodautoscaler_status_desired_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} != - kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}) + kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}) and - (kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + (kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > - kube_horizontalpodautoscaler_spec_min_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}) + kube_horizontalpodautoscaler_spec_min_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}) and - (kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + (kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} < - kube_horizontalpodautoscaler_spec_max_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}) + kube_horizontalpodautoscaler_spec_max_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}) and - changes(kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[15m]) == 0 + changes(kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"}[15m]) == 0 "for": "15m" "labels": "severity": "warning" @@ -259,9 +259,9 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout" "summary": "HPA is running at max replicas" "expr": | - kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_horizontalpodautoscaler_status_current_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} == - kube_horizontalpodautoscaler_spec_max_replicas{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} + kube_horizontalpodautoscaler_spec_max_replicas{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} "for": "15m" "labels": "severity": "info" @@ -300,7 +300,7 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit" "summary": "Cluster has overcommitted CPU resource requests." "expr": | - sum(min without(resource) (kube_resourcequota{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) + sum(min without(resource) (kube_resourcequota{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) / sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) > 1.5 @@ -314,7 +314,7 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit" "summary": "Cluster has overcommitted memory resource requests." "expr": | - sum(min without(resource) (kube_resourcequota{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) + sum(min without(resource) (kube_resourcequota{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) / sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) > 1.5 @@ -328,9 +328,9 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull" "summary": "Namespace quota is going to be full." "expr": | - kube_resourcequota{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="used"} + kube_resourcequota{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="used"} / ignoring(instance, job, type) - (kube_resourcequota{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard"} > 0) + (kube_resourcequota{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard"} > 0) > 0.9 < 1 "for": "15m" "labels": @@ -342,9 +342,9 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused" "summary": "Namespace quota is fully used." "expr": | - kube_resourcequota{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="used"} + kube_resourcequota{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="used"} / ignoring(instance, job, type) - (kube_resourcequota{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard"} > 0) + (kube_resourcequota{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard"} > 0) == 1 "for": "15m" "labels": @@ -356,9 +356,9 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded" "summary": "Namespace quota has exceeded the limits." "expr": | - kube_resourcequota{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="used"} + kube_resourcequota{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="used"} / ignoring(instance, job, type) - (kube_resourcequota{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard"} > 0) + (kube_resourcequota{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics", type="hard"} > 0) > 1 "for": "15m" "labels": @@ -387,16 +387,16 @@ spec: "summary": "PersistentVolume is filling up." "expr": | ( - kubelet_volume_stats_available_bytes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} + kubelet_volume_stats_available_bytes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} / - kubelet_volume_stats_capacity_bytes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} + kubelet_volume_stats_capacity_bytes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} ) < 0.03 and - kubelet_volume_stats_used_bytes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} > 0 + kubelet_volume_stats_used_bytes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} > 0 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*", access_mode="ReadOnlyMany"} == 1 + kube_persistentvolumeclaim_access_mode{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*", access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",label_excluded_from_alerts="true"} == 1 + kube_persistentvolumeclaim_labels{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",label_excluded_from_alerts="true"} == 1 "for": "1m" "labels": "severity": "critical" @@ -408,18 +408,18 @@ spec: "summary": "PersistentVolume is filling up." "expr": | ( - kubelet_volume_stats_available_bytes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} + kubelet_volume_stats_available_bytes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} / - kubelet_volume_stats_capacity_bytes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} + kubelet_volume_stats_capacity_bytes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} ) < 0.15 and - kubelet_volume_stats_used_bytes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} > 0 + kubelet_volume_stats_used_bytes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} > 0 and - predict_linear(kubelet_volume_stats_available_bytes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"}[6h], 4 * 24 * 3600) < 0 + predict_linear(kubelet_volume_stats_available_bytes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*", access_mode="ReadOnlyMany"} == 1 + kube_persistentvolumeclaim_access_mode{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*", access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",label_excluded_from_alerts="true"} == 1 + kube_persistentvolumeclaim_labels{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",label_excluded_from_alerts="true"} == 1 "for": "1h" "labels": "severity": "warning" @@ -431,16 +431,16 @@ spec: "summary": "PersistentVolumeInodes are filling up." "expr": | ( - kubelet_volume_stats_inodes_free{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} + kubelet_volume_stats_inodes_free{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} / - kubelet_volume_stats_inodes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} + kubelet_volume_stats_inodes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} ) < 0.03 and - kubelet_volume_stats_inodes_used{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} > 0 + kubelet_volume_stats_inodes_used{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} > 0 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*", access_mode="ReadOnlyMany"} == 1 + kube_persistentvolumeclaim_access_mode{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*", access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",label_excluded_from_alerts="true"} == 1 + kube_persistentvolumeclaim_labels{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",label_excluded_from_alerts="true"} == 1 "for": "1m" "labels": "severity": "critical" @@ -452,18 +452,18 @@ spec: "summary": "PersistentVolumeInodes are filling up." "expr": | ( - kubelet_volume_stats_inodes_free{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} + kubelet_volume_stats_inodes_free{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} / - kubelet_volume_stats_inodes{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} + kubelet_volume_stats_inodes{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} ) < 0.15 and - kubelet_volume_stats_inodes_used{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} > 0 + kubelet_volume_stats_inodes_used{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"} > 0 and - predict_linear(kubelet_volume_stats_inodes_free{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"}[6h], 4 * 24 * 3600) < 0 + predict_linear(kubelet_volume_stats_inodes_free{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kubelet"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*", access_mode="ReadOnlyMany"} == 1 + kube_persistentvolumeclaim_access_mode{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*", access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",label_excluded_from_alerts="true"} == 1 + kube_persistentvolumeclaim_labels{namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",label_excluded_from_alerts="true"} == 1 "for": "1h" "labels": "severity": "warning" @@ -474,7 +474,7 @@ spec: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors" "summary": "PersistentVolume is having issues with provisioning." "expr": | - kube_persistentvolume_status_phase{phase=~"Failed|Pending",namespace!~"openshift-kube.*|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0 + kube_persistentvolume_status_phase{phase=~"Failed|Pending",namespace!~"openshift-kube.*|openshift-logging|openshift-marketplace|openshift-deployment.*|kube.*",job="kube-state-metrics"} > 0 "for": "5m" "labels": "severity": "critical"