diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b53d4268..71b844d4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,41 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Removed + +- Remove the `mimir.enabled` property and rely on the MC flavor instead, as all CAPI MCs now run Mimir. + +## [4.24.1] - 2024-11-12 + +### Fixed + +- Fix `MonitoringAgentDown` to page when both prometheus-agent and alloy-metrics jobs are missing. + +## [4.24.0] - 2024-11-12 + +### Added + +- Add a set of sensible alerts to monitor alloy. + - `AlloySlowComponentEvaluations` and `AlloyUnhealthyComponents` to report about alloy component state. + - `LoggingAgentDown` to be alerted when the logging agent is down. + - `LogForwardingErrors` to be alerted when the `loki.write` component is failing. + - `LogReceivingErrors` to be alerted when the `loki.source.api` components of the gateway are failing. + - `MonitoringAgentDown` to be alerted when the monitoring agent is down. + - `MonitoringAgentShardsNotSatisfied` to be alerted when the monitoring agent is missing any number of desired shards. + +### Changed + +- Update `DeploymentNotSatisfiedAtlas` to take into account the following components: + - `observability-operator` + - `alloy-rules` + - `observability-gateway` +- Move all `grafana-cloud` related alerts to their own file. +- Move all alloy related alerts to the alloy alert file. +- Rename and move the following alerts as they are not specific to Prometheus: + - `PrometheusCriticalJobScrapingFailure` => `CriticalJobScrapingFailure` + - `PrometheusJobScrapingFailure` => `JobScrapingFailure` + - `PrometheusFailsToCommunicateWithRemoteStorageAPI` => `MetricForwardingErrors` + ## [4.23.0] - 2024-10-30 ### Changed @@ -19,6 +54,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Fixes the statefulset.rules name as it is currently replacing the deployment.rules alerts. 
+- Extends AppCR-related alerts with cancelation for CAPI clusters with unavailable control plane. ## [4.22.0] - 2024-10-29 @@ -3190,7 +3226,9 @@ Fix `PromtailRequestsErrors` alerts as promtail retries after some backoff so ac - Add existing rules from https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2 -[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v4.23.0...HEAD +[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v4.24.1...HEAD +[4.24.1]: https://github.com/giantswarm/prometheus-rules/compare/v4.24.0...v4.24.1 +[4.24.0]: https://github.com/giantswarm/prometheus-rules/compare/v4.23.0...v4.24.0 [4.23.0]: https://github.com/giantswarm/prometheus-rules/compare/v4.22.0...v4.23.0 [4.22.0]: https://github.com/giantswarm/prometheus-rules/compare/v4.21.1...v4.22.0 [4.21.1]: https://github.com/giantswarm/prometheus-rules/compare/v4.21.0...v4.21.1 diff --git a/README.md b/README.md index a704e0a98..c6e50b285 100644 --- a/README.md +++ b/README.md @@ -168,11 +168,11 @@ There are 2 kinds of tests on rules: ``` [...] 
### Testing platform/atlas/alerting-rules/prometheus-operator.rules.yml - ### promtool check rules /home/marie/github-repo/prometheus-rules/test/hack/output/generated/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-operator.rules.yml + ### promtool check rules /home/marie/github-repo/prometheus-rules/test/hack/output/generated/capi/capa/platform/atlas/alerting-rules/prometheus-operator.rules.yml ### Skipping platform/atlas/alerting-rules/prometheus-operator.rules.yml: listed in test/conf/promtool_ignore ### Testing platform/atlas/alerting-rules/prometheus.rules.yml - ### promtool check rules /home/marie/github-repo/prometheus-rules/test/hack/output/generated/capi/capa-mimir/platform/atlas/alerting-rules/prometheus.rules.yml - ### promtool test rules prometheus.rules.test.yml - capi/capa-mimir + ### promtool check rules /home/marie/github-repo/prometheus-rules/test/hack/output/generated/capi/capa/platform/atlas/alerting-rules/prometheus.rules.yml + ### promtool test rules prometheus.rules.test.yml - capi/capa [...] 09:06:29 promtool: end (Elapsed time: 1s) Congratulations! 
Prometheus rules have been promtool checked and tested diff --git a/helm/prometheus-rules/Chart.yaml b/helm/prometheus-rules/Chart.yaml index 14a958308..e56b51a35 100644 --- a/helm/prometheus-rules/Chart.yaml +++ b/helm/prometheus-rules/Chart.yaml @@ -5,7 +5,7 @@ home: https://github.com/giantswarm/prometheus-rules icon: https://s.giantswarm.io/app-icons/1/png/default-app-light.png name: prometheus-rules appVersion: '0.1.0' -version: '4.23.0' +version: '4.24.1' annotations: application.giantswarm.io/team: "atlas" config.giantswarm.io/version: 1.x.x diff --git a/helm/prometheus-rules/templates/alloy-rules-configmap.yaml b/helm/prometheus-rules/templates/alloy-rules-configmap.yaml index 5bb93b84f..54d8d51f4 100644 --- a/helm/prometheus-rules/templates/alloy-rules-configmap.yaml +++ b/helm/prometheus-rules/templates/alloy-rules-configmap.yaml @@ -1,4 +1,4 @@ -{{- if .Values.mimir.enabled }} +{{- if eq .Values.managementCluster.provider.flavor "capi" }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/helm/prometheus-rules/templates/alloy-rules.yaml b/helm/prometheus-rules/templates/alloy-rules.yaml index ef23d1911..0132c9899 100644 --- a/helm/prometheus-rules/templates/alloy-rules.yaml +++ b/helm/prometheus-rules/templates/alloy-rules.yaml @@ -1,4 +1,4 @@ -{{- if .Values.mimir.enabled }} +{{- if eq .Values.managementCluster.provider.flavor "capi" }} apiVersion: application.giantswarm.io/v1alpha1 kind: App metadata: diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml index 2f5e080f6..24863fe14 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml @@ -5,9 +5,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: aws-load-balancer-controller.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml index 104e18863..6d2ace5c3 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: node.aws.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml index 1306de635..db06f9b0b 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml @@ -5,7 +5,7 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" {{- end }} name: aws.workload-cluster.rules diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml index 1e9cdb2e7..32d0848fb 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml @@ -6,9 +6,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: capa.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml index 993ca2c07..e1fd083d4 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml @@ -3,9 +3,9 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: irsa.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml index e0877f4f5..d69bcdc10 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: apiserver.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml index d23245c87..f26e64816 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: apiserver.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/capi.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/capi.management-cluster.rules.yml index aed92be3a..aba6ac4d5 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/capi.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/capi.management-cluster.rules.yml @@ -4,9 +4,9 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4}} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: capi.management-cluster.rules namespace: {{.Values.namespace}} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml index c113c46d6..db0538d2a 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: certificate.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml index 70def5eee..86027745b 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: certificate.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml index c47475cb5..c44e1e9e9 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml @@ -5,9 +5,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: cluster-autoscaler.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.management-cluster.rules.yml index b28bdeceb..790646a89 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: etcd.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.workload-cluster.rules.yml index 222edb370..44aa8e9fc 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: etcd.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcdbackup.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcdbackup.rules.yml index 7dea38eeb..4291a1a72 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcdbackup.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcdbackup.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: etcdbackup.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.nodes.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.nodes.rules.yml index 735a771dc..984fa7070 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.nodes.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.nodes.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: inhibit.nodes.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml index 17865dc57..6f8fa87c2 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml index d67f64279..5ab9ac304 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: node.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.workload-cluster.rules.yml index 0507246f3..6a30a570e 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: node.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/pods.core.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/pods.core.rules.yml index 61dced935..0bd99a509 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/pods.core.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/pods.core.rules.yml @@ -4,7 +4,7 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" {{- end }} name: pods.core.rules diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.management-cluster.rules.yml index 9f27fb3c1..591515777 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: core.storage.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml index 72b7d6e06..a19a9035d 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: core.storage.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/loki-ruler-datasource-configmap.yaml b/helm/prometheus-rules/templates/loki-ruler-datasource-configmap.yaml index 988bce7e7..60d9a16e1 100644 --- a/helm/prometheus-rules/templates/loki-ruler-datasource-configmap.yaml +++ b/helm/prometheus-rules/templates/loki-ruler-datasource-configmap.yaml @@ -1,4 +1,4 @@ -{{- if .Values.mimir.enabled }} +{{- if eq .Values.managementCluster.provider.flavor "capi" }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml new file mode 100644 index 000000000..80c5361a5 --- /dev/null +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -0,0 +1,193 @@ +# This file describes common alloy alerting rules +# For alerts regarding the monitoring pipeline and the logging pipeline, please go to the respective files (logging-pipeline.rules.yml and monitoring-pipeline.rules.yml). +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + {{- include "labels.common" . | nindent 4 }} + name: alloy.rules + namespace: {{ .Values.namespace }} +spec: + groups: + # List of alerts on the state of the alloy components. + # Alerts are coming from https://github.com/grafana/alloy/blob/ed52746567d2469a6a97a592ac5aec807646b327/operations/alloy-mixin/alerts/controller.libsonnet + # We added the aggregations and our internal labels. 
+ - name: alloy.controller + rules: + - alert: AlloySlowComponentEvaluations + annotations: + dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller + description: '{{`Component evaluations are taking too long under job {{ $labels.job }}, component_id {{ $labels.component_id }}.`}}' + opsrecipe: alloy/ + summary: Component evaluations are taking too long. + expr: sum by (cluster_id, installation, provider, pipeline, namespace, job, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 + for: 15m + labels: + area: platform + severity: notify + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + - alert: AlloyUnhealthyComponents + annotations: + dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller + description: '{{`Unhealthy components detected under job {{ $labels.job }}`}}' + opsrecipe: alloy/ + summary: Unhealthy components detected. + expr: sum by (cluster_id, installation, provider, pipeline, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 + for: 15m + labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + - name: alloy.rules + rules: + - alert: AlloyForPrometheusRulesDown + annotations: + description: 'Alloy sending PrometheusRules to Loki and Mimir ruler is down.' 
+ opsrecipe: prometheus-rules/ + expr: count(up{job="alloy-rules", namespace="monitoring"} == 0) by (cluster_id, installation, provider, pipeline) > 0 + for: 1h + labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + - name: alloy.logs + rules: + # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready) + # and joins those pods with the containers that are not running + - alert: LoggingAgentDown + annotations: + dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview + description: '{{`Scraping of all logging-agent pods to check if one failed every 30 minutes.`}}' + opsrecipe: alloy/ + expr: |- + kube_pod_info{pod=~"alloy-logs.*"} + * on(cluster_id, pod) + group_left () + up{job="alloy-logs", container="alloy"} == 0 + for: 30m + labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + - name: alloy.metrics + rules: + # This alert pages if monitoring-agent fails to send samples to its remote write endpoint. + - alert: MonitoringAgentDown + annotations: + description: '{{`Monitoring agent fails to send samples.`}}' + summary: Monitoring agent fails to send samples to remote write endpoint. 
+ opsrecipe: alloy/#monitoring-agent-down + dashboard: promRW001/prometheus-remote-write + expr: |- + count( + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) by (cluster_id, installation, pipeline, provider) > 0 + unless on (cluster_id) ( + count(up{job=~"alloy-metrics|prometheus-agent"} > 0) by (cluster_id) + ) + for: 20m + labels: + area: platform + severity: page + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_has_no_workers: "true" + ## Same as MonitoringAgentDown, but triggers inhibition earlier and does not page. + - alert: InhibitionMonitoringAgentDown + annotations: + description: '{{`Monitoring agent fails to send samples.`}}' + summary: Monitoring agent fails to send samples to remote write endpoint. + opsrecipe: alloy/#monitoring-agent-down + dashboard: promRW001/prometheus-remote-write + expr: |- + count( + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) by (cluster_id, installation, pipeline, provider) > 0 + unless on (cluster_id) ( + count(up{job=~"alloy-metrics|prometheus-agent"} > 0) by (cluster_id) + ) + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + ## This alert pages if any of the monitoring-agent shards is not running. + - alert: MonitoringAgentShardsNotSatisfied + annotations: + description: '{{`At least one of the monitoring agent shard is missing.`}}' + summary: Monitoring agent is missing shards. 
+ opsrecipe: alloy/#monitoring-agent-down + expr: |- + kube_statefulset_status_replicas{statefulset="alloy-metrics"} + - kube_statefulset_status_replicas_ready{statefulset="alloy-metrics"} + > 0 + for: 40m + labels: + area: platform + severity: page + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + ## Same as MonitoringAgentShardsNotSatisfied but triggers inhibition earlier, and does not page. + - alert: InhibitionMonitoringAgentShardsNotSatisfied + annotations: + description: '{{`At least one of the monitoring agent shard is missing.`}}' + summary: Monitoring agent is missing shards. + opsrecipe: alloy/#monitoring-agent-down + expr: |- + kube_statefulset_status_replicas{statefulset="alloy-metrics"} + - kube_statefulset_status_replicas_ready{statefulset="alloy-metrics"} + > 0 + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml index 6d62a35bc..be6a9f5a2 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: deployment.management-cluster.rules namespace: {{ .Values.namespace }} spec: @@ -17,7 +17,7 @@ spec: annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' opsrecipe: deployment-not-satisfied/ - expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*|mimir.*|loki.*|tempo.*|pyroscope.*|object-storage.*|logging-operator.*|silence-operator.*|sloth.*"} > 0 + expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alloy-rules.*|alertmanager.*|grafana.*|logging-operator.*|loki.*|mimir.*|oauth2-proxy.*|object-storage.*|observability-gateway.*|observability-operator.*|prometheus.*|promxy.*|tempo.*|pyroscope.*|silence-operator.*|sloth.*"} > 0 for: 30m labels: area: platform @@ -95,7 +95,7 @@ spec: team: phoenix topic: managementcluster {{- if eq .Values.managementCluster.provider.flavor "vintage" }} - ## TODO Remove when all vintage clusters are gone + ## TODO(@giantswarm/team-atlas) Remove when all vintage clusters are gone - alert: AWSManagementClusterDeploymentScaledDownToZero annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} on AWS has been scaled down to zero for prolonged period of time.`}}' diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml index fa9087331..ca7422b1d 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml @@ -4,9 +4,9 @@ 
metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end}} + {{- end }} name: deployment.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml similarity index 72% rename from helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml index 40d76d3d2..2022f4fde 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml @@ -1,13 +1,35 @@ -{{- if .Values.mimir.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4 }} - name: mimir-to-grafana-cloud-exporter.rules - namespace: {{ .Values.namespace }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} + cluster_type: "management_cluster" + {{- end }} + name: grafana-cloud.rules + namespace: {{ .Values.namespace }} spec: groups: + - name: grafana-cloud + rules: + ## Pages Atlas when prometheus fails to send samples to cortex + - alert: PrometheusMissingGrafanaCloud + annotations: + description: 'Prometheus is not sending data to Grafana Cloud.' 
+ opsrecipe: prometheus-grafanacloud/ + {{- if eq .Values.managementCluster.provider.flavor "capi" }} + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) + {{- else }} + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) + {{- end }} + for: 1h + labels: + area: platform + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + {{- if eq .Values.managementCluster.provider.flavor "capi" }} - name: mimir-to-grafana-cloud-exporter rules: - alert: MimirToGrafanaCloudExporterDown @@ -73,4 +95,4 @@ spec: severity: page team: atlas topic: observability -{{- end }} + {{- end }} diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml index 39fb4a0a0..977840aa1 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml @@ -3,9 +3,9 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: grafana.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml index 6c90a4e2c..7fa5beeb9 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml @@ -14,7 +14,7 @@ spec: annotations: description: '{{`KubeStateMetrics is down.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: |- label_replace(up{job="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{job="kube-state-metrics",instance=~".*:8080"} == 1) {{- else }} @@ -85,12 +85,11 @@ spec: severity: page team: atlas topic: observability - - alert: KubeConfigMapCreatedMetricMissing annotations: description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_configmap_created{}) {{- else }} expr: |- @@ -118,7 +117,7 @@ spec: annotations: description: '{{`kube_daemonset_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_daemonset_created{}) {{- else }} expr: |- @@ -146,7 +145,7 @@ spec: annotations: description: '{{`kube_deployment_created metric is missing for cluster {{ 
$labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_deployment_created{}) {{- else }} expr: |- @@ -174,7 +173,7 @@ spec: annotations: description: '{{`kube_endpoint_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_endpoint_created{}) {{- else }} expr: |- @@ -202,7 +201,7 @@ spec: annotations: description: '{{`kube_namespace_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_namespace_created{}) {{- else }} expr: |- @@ -230,7 +229,7 @@ spec: annotations: description: '{{`kube_node_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_node_created{}) {{- else }} expr: |- @@ -258,7 +257,7 @@ spec: annotations: description: '{{`kube_pod_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_pod_created{}) {{- else }} expr: |- @@ -286,7 +285,7 @@ spec: annotations: description: '{{`kube_replicaset_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_replicaset_created{}) {{- else }} expr: |- @@ -314,7 +313,7 @@ spec: annotations: description: '{{`kube_secret_created metric is 
missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_secret_created{}) {{- else }} expr: |- @@ -342,7 +341,7 @@ spec: annotations: description: '{{`kube_service_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_service_created{}) {{- else }} expr: |- diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yaml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yaml new file mode 100644 index 000000000..c45f70f42 --- /dev/null +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yaml @@ -0,0 +1,88 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + {{- include "labels.common" . | nindent 4 }} + name: logging-pipeline.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: logging-pipeline + rules: + # Any alloy component that uses the loki.write component can throw such errors. 
+ # This includes alloy-logs and the observability-gateway + - alert: LogForwardingErrors + annotations: + dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview + description: '{{`More that 10% of the requests to Loki are failing.`}}' + opsrecipe: logging-pipeline/ + expr: |- + ( + 100 + * + ( + ( + sum by (cluster_id, installation, provider, pipeline, namespace, job, instance) ( + rate ( + loki_write_request_duration_seconds_count{status_code!~"2.."}[5m:] + ) + ) + ) + / + ( + sum by (cluster_id, installation, provider, pipeline, namespace, job, instance) ( + rate ( + loki_write_request_duration_seconds_count[5m:] + ) + ) + ) + ) + ) + > 10 + for: 15m + labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + # This alert pages when the loki source api component of the observability gateway is throwing errors + - alert: LogReceivingErrors + annotations: + dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview + description: '{{`More that 10% of the loki requests to the observability gateway are failing.`}}' + opsrecipe: logging-pipeline/ + expr: |- + ( + 100 + * + ( + ( + sum by (cluster_id, installation, provider, pipeline, namespace, job, instance, route) ( + rate ( + loki_source_api_request_duration_seconds_count{route=~"(loki_)?api_v1_push", status_code!~"2.."}[5m:] + ) + ) + ) + / + ( + sum by (cluster_id, installation, provider, pipeline, namespace, job, instance, route) ( + rate ( + loki_source_api_request_duration_seconds_count{route=~"(loki_)?api_v1_push"}[5m:] + ) + ) + ) + ) + ) + > 10 + for: 15m + labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" diff --git 
a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index 7f88bd547..a6d96078e 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -1,4 +1,4 @@ -{{- if .Values.mimir.enabled }} +{{- if eq .Values.managementCluster.provider.flavor "capi" }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -61,21 +61,6 @@ spec: severity: page team: atlas topic: observability - - alert: AlloyForPrometheusRulesDown - annotations: - description: 'Alloy sending PrometheusRules to Mimir ruler is down.' - opsrecipe: prometheus-rules/ - expr: count(up{job="alloy-rules", namespace="mimir"} == 0) by (cluster_id, installation, provider, pipeline) > 0 - for: 1h - labels: - area: platform - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - alert: MimirRulerEventsFailed annotations: dashboard: 631e15d5d85afb2ca8e35d62984eeaa0/mimir-ruler diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml new file mode 100644 index 000000000..e666ea277 --- /dev/null +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml @@ -0,0 +1,80 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + {{- include "labels.common" . 
| nindent 4 }} + name: monitoring-pipeline.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: monitoring-pipeline + rules: + - alert: MetricForwardingErrors + annotations: + description: '{{`Monitoring agent can''t communicate with Remote Storage API at {{ $labels.url }}.`}}' + opsrecipe: monitoring-pipeline/ + dashboard: promRW001/prometheus-remote-write + expr: |- + rate(prometheus_remote_storage_samples_failed_total[10m]) > 0.1 + or rate(prometheus_remote_storage_samples_total[10m]) == 0 + or rate(prometheus_remote_storage_metadata_retried_total[10m]) > 0 + for: 1h + labels: + area: platform + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + - alert: JobScrapingFailure + annotations: + dashboard: servicemonitors-details/servicemonitors-details + description: '{{`Monitoring agents for cluster {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}}' + summary: Monitoring agent failed to scrape all targets in a job. + opsrecipe: monitoring-job-scraping-failure/ + expr: |- + ( + count(up == 0) by (job, installation, cluster_id, provider, pipeline) + / + count(up) by (job, installation, cluster_id, provider, pipeline) + ) >= 1 + for: 1d + labels: + area: platform + severity: notify + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + - alert: CriticalJobScrapingFailure + annotations: + dashboard: servicemonitors-details/servicemonitors-details + description: '{{`Monitoring agents for cluster {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}}' + summary: Monitoring agent failed to scrape all targets in a job. 
+ opsrecipe: monitoring-job-scraping-failure/ + ## We ignore bastion hosts node exporters + expr: |- + ( + count( + ( + up{job=~".*(apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics).*"} + or + up{job="kubelet", metrics_path="/metrics"} + ) == 0 + ) by (job, installation, cluster_id, provider, pipeline) + / + count( + up{job=~".*(apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics).*"} + or + up{job="kubelet", metrics_path="/metrics"} + ) by (job, installation, cluster_id, provider, pipeline) + ) >= 1 + for: 3d + labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml index af4c7d434..73c749b42 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml @@ -1,7 +1,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} name: prometheus-agent.rules @@ -10,14 +9,14 @@ spec: groups: - name: prometheus-agent rules: - ## Page Atlas if prometheus agent fails to send samples to MC prometheus. + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} + ## This alert pages if prometheus-agent fails to send samples to its remote write endpoint. - alert: PrometheusAgentFailing annotations: description: '{{`Prometheus agent remote write is failing.`}}' summary: Prometheus agent fails to send samples to remote write endpoint. 
opsrecipe: prometheus-agent/ dashboard: promRW001/prometheus-remote-write - {{- if not .Values.mimir.enabled }} expr: |- max_over_time( sum by (cluster_type, cluster_id, installation, instance, service) @@ -27,20 +26,6 @@ spec: absent(up{instance="prometheus-agent"}) == 1 )[5m:] ) - {{- else }} - expr: |- - ( - label_replace( - capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, - "cluster_id", - "$1", - "name", - "(.*)" - ) == 1 - ) unless on (cluster_id) ( - count(up{job="prometheus-agent"} > 0) by (cluster_id) - ) - {{- end }} for: 20m labels: area: platform @@ -59,7 +44,6 @@ spec: summary: Prometheus agent fails to send samples to remote write endpoint. opsrecipe: prometheus-agent/ dashboard: promRW001/prometheus-remote-write - {{- if not .Values.mimir.enabled }} expr: |- max_over_time( sum by (cluster_type, cluster_id, installation, instance, service) @@ -69,20 +53,6 @@ spec: absent(up{instance="prometheus-agent"}) == 1 )[5m:] ) - {{- else }} - expr: |- - ( - label_replace( - capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, - "cluster_id", - "$1", - "name", - "(.*)" - ) == 1 - ) unless on (cluster_id) ( - count(up{job="prometheus-agent"} > 0) by (cluster_id) - ) - {{- end }} for: 2m labels: area: platform @@ -93,7 +63,8 @@ spec: cancel_if_cluster_is_not_running_monitoring_agent: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" - ## Page Atlas if prometheus agent is missing shards to send samples to MC prometheus. + {{- end }} + ## This alert pages if one of the prometheus-agent shards is not running.
- alert: PrometheusAgentShardsMissing annotations: description: '{{`Prometheus agent is missing shards.`}}' diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml index 98865562f..ff81b5e41 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml @@ -9,7 +9,7 @@ spec: groups: - name: observability rules: - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} - alert: "Heartbeat" expr: up{job=~".*prometheus/prometheus.*",instance!="prometheus-agent"} == 1 labels: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml index b31713f90..7b48759a8 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml @@ -1,7 +1,7 @@ +# TODO(@giantswarm/team-atlas): revisit once vintage is gone apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} name: prometheus.rules @@ -27,36 +27,6 @@ spec: severity: page team: atlas topic: observability - ## Pages Atlas when prometheus fails to send samples to cortex - - alert: PrometheusMissingGrafanaCloud - annotations: - description: 'Prometheus is not sending data to Grafana Cloud.' 
- opsrecipe: prometheus-grafanacloud/ - {{- if .Values.mimir.enabled }} - expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) - {{- else }} - expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) - {{- end }} - for: 1h - labels: - area: platform - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - - alert: PrometheusFailsToCommunicateWithRemoteStorageAPI - annotations: - description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}' - opsrecipe: prometheus-cant-communicate-with-remote-storage-api/ - dashboard: promRW001/prometheus-remote-write - expr: rate(prometheus_remote_storage_samples_failed_total[10m]) > 0.1 or rate(prometheus_remote_storage_samples_total[10m]) == 0 or rate(prometheus_remote_storage_metadata_retried_total[10m]) > 0 - for: 1h - labels: - area: platform - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - alert: PrometheusRuleFailures annotations: description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to evaluate rule(s) {{ printf "%.2f" $value }} time(s).`}} @@ -70,48 +40,3 @@ spec: team: atlas topic: observability cancel_if_outside_working_hours: "true" - - alert: PrometheusJobScrapingFailure - annotations: - description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}} - summary: Prometheus fails to scrape all targets in a job. 
- opsrecipe: prometheus-job-scraping-failure/ - expr: (count(up == 0) BY (job, installation, cluster_id, provider, pipeline) / count(up) BY (job, installation, cluster_id, provider, pipeline)) == 1 - for: 1d - labels: - area: platform - severity: notify - team: atlas - topic: observability - cancel_if_outside_working_hours: "true" - - alert: PrometheusCriticalJobScrapingFailure - annotations: - description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}} - summary: Prometheus fails to scrape all targets in a job. - opsrecipe: prometheus-job-scraping-failure/ - ## We ignore bastion hosts node exporters - expr: |- - ( - count( - ( - up{job=~"apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics"} - or - up{job="kubelet", metrics_path="/metrics"} - ) == 0 - ) BY (job, installation, cluster_id, provider, pipeline) - / - count( - up{job=~"apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics"} - or - up{job="kubelet", metrics_path="/metrics"} - ) BY (job, installation, cluster_id, provider, pipeline) - ) == 1 - for: 3d - labels: - area: platform - severity: page - team: atlas - topic: observability - cancel_if_outside_working_hours: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml index f48d135ab..422a9c9b1 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml @@ -9,16 +9,17 @@ spec: groups: - name: promtail rules: + # This alert lists the existing promtail pods (to extract the node label and inhibit if the node is not ready) + # and join 
the pods with the not running containers - alert: PromtailDown annotations: description: '{{`Scraping of all promtail pods to check if one failed every 30 minutes.`}}' opsrecipe: promtail/ expr: |- - # List promtail pods to be able to get the node label and join with the node status to not alert if the node is not ready kube_pod_info{pod=~"promtail.*"} * on(cluster_id, pod) group_left () - up{container="promtail"} == 0 # List promtail containers that are not running + up{container="promtail"} == 0 for: 30m labels: area: platform diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml index 1c546f359..439a96426 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: statefulset.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml index 7b0798d5d..8490e4a79 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: observability.storage.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml b/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml index 3ebe08974..20bee678b 100644 --- a/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml @@ -344,7 +344,7 @@ spec: rules: - expr: sum(ALERTS{alertstate="firing"}) by (alertname, cluster_id, cluster_type, customer, installation, pipeline, provider, region, area, severity, team, topic) record: aggregation:prometheus:alerts - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} # Metric container_memory_working_set_bytes comes from the cAdvisor component scraped on management clusters which is then scraped by the management cluster prometheus. # This means the cluster_id label on this metric will be the cluster_id of the management cluster for all the series, not the workload cluster id. # As we want to record the memory usage of the prometheis per cluster, we need to extract the cluster id from the prometheus pod name (i.e. pod=prometheus-xyz-ordinal => cluster_id=xyz). 
@@ -353,7 +353,7 @@ spec: - expr: sum(label_replace(container_memory_working_set_bytes{container='prometheus', namespace=~'.*-prometheus'}, "cluster_id", "$2", "pod", "(prometheus-)(.+)(-.+)")) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region) record: aggregation:prometheus:memory_usage {{- end }} - {{- if .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "capi" }} - name: mimir.grafana-cloud.recording rules: - expr: sum(container_memory_working_set_bytes{namespace='mimir', cluster_type="management_cluster", container=~'.+'}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region) diff --git a/helm/prometheus-rules/templates/platform/atlas/recording-rules/mimir-mixins.rules.yml b/helm/prometheus-rules/templates/platform/atlas/recording-rules/mimir-mixins.rules.yml index d41a406b5..7d0247b6a 100644 --- a/helm/prometheus-rules/templates/platform/atlas/recording-rules/mimir-mixins.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/recording-rules/mimir-mixins.rules.yml @@ -1,4 +1,4 @@ -{{- if .Values.mimir.enabled }} +{{- if eq .Values.managementCluster.provider.flavor "capi" }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: diff --git a/helm/prometheus-rules/templates/platform/atlas/recording-rules/monitoring.resource-usage-estimation.rules.yaml b/helm/prometheus-rules/templates/platform/atlas/recording-rules/monitoring.resource-usage-estimation.rules.yaml index 81a946f09..c402ff83d 100644 --- a/helm/prometheus-rules/templates/platform/atlas/recording-rules/monitoring.resource-usage-estimation.rules.yaml +++ b/helm/prometheus-rules/templates/platform/atlas/recording-rules/monitoring.resource-usage-estimation.rules.yaml @@ -9,7 +9,7 @@ spec: groups: - name: monitoring.resource-usage-estimation.recording rules: - {{- if .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "capi" }} - expr: 
(sum(scrape_samples_post_metric_relabeling) by (cluster_id, job) / on(cluster_id) group_left sum(cortex_ingester_active_series{container="ingester"}) by (cluster_id)) * on(cluster_id) group_left sum(container_memory_usage_bytes{container="ingester", namespace="mimir"}) by (cluster_id) record: giantswarm:observability:monitoring:resource_usage_estimation:memory_usage_bytes - expr: (sum(scrape_samples_post_metric_relabeling) by (cluster_id, job) / on(cluster_id) group_left sum(cortex_ingester_active_series{container="ingester"}) by (cluster_id)) * on(cluster_id) group_left sum(container_memory_working_set_bytes{container="ingester", namespace="mimir"}) by (cluster_id) diff --git a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/external-dns.rules.yml b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/external-dns.rules.yml index 6ba5a7fa2..d7557af5e 100644 --- a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/external-dns.rules.yml +++ b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/external-dns.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: external-dns.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml index 33c535c1a..fc7af2fc6 100644 --- a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml +++ b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: app.rules namespace: {{ .Values.namespace }} spec: @@ -21,6 +21,7 @@ spec: for: 30m labels: area: platform + cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -55,6 +56,7 @@ spec: for: 30m labels: area: platform + cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -75,6 +77,7 @@ spec: for: 30m labels: area: platform + cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" diff --git a/helm/prometheus-rules/templates/platform/shield/alerting-rules/dex.rules.yml b/helm/prometheus-rules/templates/platform/shield/alerting-rules/dex.rules.yml index 453478048..2905ee3df 100644 --- a/helm/prometheus-rules/templates/platform/shield/alerting-rules/dex.rules.yml +++ b/helm/prometheus-rules/templates/platform/shield/alerting-rules/dex.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: dex.rules namespace: {{ .Values.namespace }} spec: @@ -41,7 +41,7 @@ spec: annotations: description: '{{`dex-operator did not register a dex-app in giantswarm namespace.`}}' opsrecipe: dex-operator/ - {{- if .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "capi" }} expr: absent(dex_operator_idp_secret_expiry_time{app_namespace="giantswarm", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) {{- else }} expr: absent(dex_operator_idp_secret_expiry_time{app_namespace="giantswarm", cluster_type="management_cluster"}) == 1 diff --git a/helm/prometheus-rules/templates/platform/shield/alerting-rules/falco.rules.yml b/helm/prometheus-rules/templates/platform/shield/alerting-rules/falco.rules.yml index 61cd126f2..20349e01e 100644 --- a/helm/prometheus-rules/templates/platform/shield/alerting-rules/falco.rules.yml +++ b/helm/prometheus-rules/templates/platform/shield/alerting-rules/falco.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: falco.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/values.schema.json b/helm/prometheus-rules/values.schema.json index 780796c8c..414afa24d 100644 --- a/helm/prometheus-rules/values.schema.json +++ b/helm/prometheus-rules/values.schema.json @@ -30,14 +30,6 @@ } } }, - "mimir": { - "type": "object", - "properties": { - "enabled": { - "type": "boolean" - } - } - }, "name": { "type": "string" }, diff --git a/helm/prometheus-rules/values.yaml b/helm/prometheus-rules/values.yaml index 409130af9..0388578cf 100644 --- a/helm/prometheus-rules/values.yaml +++ b/helm/prometheus-rules/values.yaml @@ -10,9 +10,6 @@ managementCluster: flavor: "" region: "" -mimir: - enabled: false - Installation: V1: Guest: diff --git a/mimir/update.sh b/mimir/update.sh index 05ddd4bbc..7980f939a 100755 --- a/mimir/update.sh +++ b/mimir/update.sh @@ -36,7 +36,7 @@ spec:\ groups:' "$OUTPUT_FILE" # Add the mimir enabled helm conditional blocks -sed -i '1i{{- if .Values.mimir.enabled }}' "$OUTPUT_FILE" +sed -i '1i{{- if eq .Values.managementCluster.provider.flavor "capi" }}' "$OUTPUT_FILE" sed -i -e '$a{{- end }}' "$OUTPUT_FILE" sed -i 's/cluster_id,/cluster_id, installation, pipeline, provider,/g' "$OUTPUT_FILE" diff --git a/test/conf/providers b/test/conf/providers index c22316aea..5425cc445 100644 --- a/test/conf/providers +++ b/test/conf/providers @@ -1,4 +1,3 @@ vintage/aws capi/capz capi/capa -capi/capa-mimir diff --git a/test/hack/bin/run-pint.sh b/test/hack/bin/run-pint.sh index a5aa0150e..84520c5ed 100755 --- a/test/hack/bin/run-pint.sh +++ b/test/hack/bin/run-pint.sh @@ -15,9 +15,11 @@ main () { PINT_CONFIG="${1:-test/conf/pint/pint-config.hcl}" if [[ "${2:-}" != "" ]]; then - mapfile -t PINT_FILES_LIST < <(grep -lr "team:.*${PINT_TEAM_FILTER}" 
"test/hack/output/generated/capi/capa-mimir/" | grep -v ".test.yml") + mapfile -t PINT_FILES_LIST < <(grep -lr "team:.*${PINT_TEAM_FILTER}" "test/hack/output/generated/capi/capa/" | grep -v ".test.yml") + mapfile -t -O "${#PINT_FILES_LIST[@]}" PINT_FILES_LIST < <(grep -lr "team:.*${PINT_TEAM_FILTER}" "test/hack/output/generated/capi/capz/" | grep -v ".test.yml") else - mapfile -t PINT_FILES_LIST < <(find test/hack/output/generated/capi/capa-mimir/ -name "*.rules.yml") + mapfile -t PINT_FILES_LIST < <(find test/hack/output/generated/capi/capa/ -name "*.rules.yml") + mapfile -t -O "${#PINT_FILES_LIST[@]}" PINT_FILES_LIST < <(find test/hack/output/generated/capi/capz/ -name "*.rules.yml") fi test/hack/bin/pint -c "$PINT_CONFIG" lint "${PINT_FILES_LIST[@]}" diff --git a/test/hack/bin/template-chart.sh b/test/hack/bin/template-chart.sh index 57dd769d6..5f9278a97 100755 --- a/test/hack/bin/template-chart.sh +++ b/test/hack/bin/template-chart.sh @@ -13,7 +13,6 @@ main() { echo "Templating chart for provider: $provider" [[ $provider =~ ([a-z]+)/([a-z]+)([-]*[a-z]*) ]] - [[ "${BASH_REMATCH[3]}" == "-mimir" ]] && mimir_enabled=true || mimir_enabled=false helm template \ "$GIT_WORKDIR"/helm/prometheus-rules \ @@ -21,7 +20,6 @@ main() { --set="managementCluster.provider.kind=${BASH_REMATCH[2]}" \ --set="managementCluster.name=myinstall" \ --set="managementCluster.pipeline=stable" \ - --set="mimir.enabled=$mimir_enabled" \ --output-dir "$GIT_WORKDIR"/test/hack/output/helm-chart/"$provider" # Remove useless files for tests diff --git a/test/hack/checkLabels/go.mod b/test/hack/checkLabels/go.mod index 24415136b..d953a5d81 100644 --- a/test/hack/checkLabels/go.mod +++ b/test/hack/checkLabels/go.mod @@ -2,12 +2,12 @@ module checkLabels go 1.23 -toolchain go1.23.2 +toolchain go1.23.3 require ( // Try to keep version in sync with our prometheus rule CRD version. 
// see https://github.com/giantswarm/prometheus-operator-crd/blob/master/helm/prometheus-operator-crd/Chart.yaml#L11 - github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.78.0 + github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.78.1 sigs.k8s.io/yaml v1.4.0 ) diff --git a/test/hack/checkLabels/go.sum b/test/hack/checkLabels/go.sum index 9cca146d5..ce45a0cf9 100644 --- a/test/hack/checkLabels/go.sum +++ b/test/hack/checkLabels/go.sum @@ -557,6 +557,8 @@ github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.77.2 h github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.77.2/go.mod h1:D0KY8md81DQKdaR/cXwnhoWB3MYYyc/UjvqE8GFkIvA= github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.78.0 h1:b2L36QF60oB8Ty97UOCOnN2VnRbT6eaxzYda9kmk9zE= github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.78.0/go.mod h1:SvsRXw4m1F2vk7HquU5h475bFpke27mIUswfyw9u3ug= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.78.1 h1:Fm9Z+FabnB+6EoGq15j+pyLmaK6hYrYOpBlTzOLTQ+E= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.78.1/go.mod h1:SvsRXw4m1F2vk7HquU5h475bFpke27mIUswfyw9u3ug= github.com/prometheus/alertmanager v0.22.2 h1:JrDZalSEMb2/2bqGAhls6ZnvOxbC5jMIu29JV+uWTC0= github.com/prometheus/alertmanager v0.22.2/go.mod h1:rYinOWxFuCnNssc3iOjn2oMTlhLaPcUuqV5yk5JKUAE= github.com/prometheus/alertmanager v0.25.0 h1:vbXKUR6PYRiZPRIKfmXaG+dmCKG52RtPL4Btl8hQGvg= diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml deleted file mode 100644 index 01aebe6cb..000000000 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ /dev/null @@ -1,338 +0,0 @@ ---- -# These tests differ between prometheus and mimir installations: the 
resulting labels are different -rule_files: -- prometheus-agent.rules.yml - -tests: - # Tests for `PrometheusAgentFailing` alert - - interval: 1m - input_series: - - series: 'up{instance="prometheus-agent",cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", job="prometheus-agent"}' - values: "_x60 0+0x60 1+0x60" - - series: 'capi_cluster_status_condition{ cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", status="True", type="ControlPlaneReady", name="gauss"}' - values: "1+0x180" - alert_rule_test: - - alertname: PrometheusAgentFailing - eval_time: 30m - exp_alerts: - - exp_labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cluster_id: "gauss" - cluster_type: "workload_cluster" - customer: "giantswarm" - installation: "myinstall" - name: "gauss" - pipeline: "testing" - provider: "capa" - region: "eu-west-2" - status: "True" - type: "ControlPlaneReady" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." 
- - alertname: InhibitionPrometheusAgentFailing - eval_time: 30m - exp_alerts: - - exp_labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cluster_id: "gauss" - cluster_type: "workload_cluster" - customer: "giantswarm" - installation: "myinstall" - name: "gauss" - pipeline: "testing" - provider: "capa" - region: "eu-west-2" - status: "True" - type: "ControlPlaneReady" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: PrometheusAgentFailing - eval_time: 90m - exp_alerts: - - exp_labels: - area: platform - cluster_id: gauss - cluster_type: workload_cluster - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - installation: myinstall - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - customer: "giantswarm" - name: "gauss" - pipeline: "testing" - provider: "capa" - region: "eu-west-2" - status: "True" - type: "ControlPlaneReady" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." 
- - alertname: InhibitionPrometheusAgentFailing - eval_time: 90m - exp_alerts: - - exp_labels: - area: platform - cluster_id: gauss - cluster_type: workload_cluster - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - installation: myinstall - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - customer: "giantswarm" - name: "gauss" - pipeline: "testing" - provider: "capa" - region: "eu-west-2" - status: "True" - type: "ControlPlaneReady" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: PrometheusAgentFailing - eval_time: 150m - - alertname: InhibitionPrometheusAgentFailing - eval_time: 150m - # Tests for `PrometheusAgentShardsMissing` alert - - interval: 1m - input_series: - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", 
job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_operator_spec_shards{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' - values: '3+0x60 5+0x60 3+0x60' - - series: 'prometheus_operator_spec_replicas{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' - values: '1+0x180' - alert_rule_test: - - alertname: PrometheusAgentShardsMissing - eval_time: 40m - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 40m - - alertname: PrometheusAgentShardsMissing - eval_time: 120m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." 
- - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 100m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissing - eval_time: 125m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 125m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." 
- - alertname: PrometheusAgentShardsMissing - eval_time: 130m - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 130m - # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric - - interval: 1m - input_series: - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_operator_spec_replicas{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' - values: '3+0x60 5+0x60 3+0x60' - alert_rule_test: - - alertname: PrometheusAgentShardsMissing - eval_time: 40m - - alertname: 
InhibitionPrometheusAgentShardsMissing - eval_time: 40m - - alertname: PrometheusAgentShardsMissing - eval_time: 120m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 100m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissing - eval_time: 125m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." 
- - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 125m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissing - eval_time: 130m - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 130m diff --git a/test/tests/providers/capi/capa-mimir/platform/honeybadger/alerting-rules/zot.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/honeybadger/alerting-rules/zot.rules.test.yml deleted file mode 100644 index 6b130ff88..000000000 --- a/test/tests/providers/capi/capa-mimir/platform/honeybadger/alerting-rules/zot.rules.test.yml +++ /dev/null @@ -1,54 +0,0 @@ ---- -rule_files: - - zot.rules.yml - -tests: - - interval: 1m - input_series: - - series: 'kube_deployment_status_replicas_unavailable{cluster_type="management_cluster",namespace="zot",deployment="zot-zot"}' - values: '_x5 0x10 1x45' - alert_rule_test: - - alertname: ZotDeploymentNotSatisfied - eval_time: 46m - exp_alerts: - - exp_labels: - alertname: "ZotDeploymentNotSatisfied" - area: "platform" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "true" - cluster_type: "management_cluster" - deployment: "zot-zot" - namespace: "zot" - severity: "page" - team: "honeybadger" - topic: "managementcluster" - exp_annotations: - description: "Zot deployment zot/zot-zot is not satisfied." 
- opsrecipe: "zot/" - - interval: 1m - input_series: - - series: 'kubelet_volume_stats_available_bytes{namespace="zot", persistentvolumeclaim="zot-zot-pvc"}' - values: '50x30 20x30 15x30 5x60' - - series: 'kubelet_volume_stats_capacity_bytes{namespace="zot", persistentvolumeclaim="zot-zot-pvc"}' - values: '100x150' - alert_rule_test: - - alertname: ZotPersistentVolumeFillingUp - eval_time: 150m - exp_alerts: - - exp_labels: - alertname: "ZotPersistentVolumeFillingUp" - area: "platform" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "true" - namespace: "zot" - persistentvolumeclaim: "zot-zot-pvc" - severity: "page" - team: "honeybadger" - topic: "managementcluster" - exp_annotations: - description: "The Zot PersistentVolume claimed by zot-zot-pvc in namespace zot is at least 80% full and projected to fill up soon." - opsrecipe: "zot/" diff --git a/test/tests/providers/capi/capa-mimir/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml b/test/tests/providers/capi/capa/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml similarity index 100% rename from test/tests/providers/capi/capa-mimir/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml rename to test/tests/providers/capi/capa/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-cluster.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-cluster.rules.test.yml new file mode 100644 index 000000000..71be1d4c3 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-cluster.rules.test.yml @@ -0,0 +1,69 @@ +rule_files: + - capi-cluster.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_cluster_status_phase{name="clippaxy", exported_namespace="giantswarm", phase="Provisioned"}' + values: "1+0x75" + - series: 
'capi_cluster_status_phase{name="clippaxy", exported_namespace="giantswarm", phase="Pending"}' + values: "1+0x75" + - series: 'capi_cluster_status_condition{name="grumpy", exported_namespace="giantswarm", status="False", type="Ready"}' + values: "0+0x10 0+1x65" + - series: 'capi_cluster_status_condition{name="grumpy", exported_namespace="giantswarm", status="True", type="Ready"}' + values: "0+1x10 0+0x65" + - series: 'capi_cluster_annotation_paused{name="grumpy", exported_namespace="giantswarm", paused_value="true"}' + values: "0+1x75" + alert_rule_test: + - alertname: ClusterUnhealthyPhase + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: phoenix + topic: managementcluster + name: clippaxy + exported_namespace: giantswarm + phase: Pending + exp_annotations: + description: "Cluster giantswarm/clippaxy stuck in Pending phase." + opsrecipe: capi-cluster/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: ClusterStatusNotReady + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + name: grumpy + exported_namespace: giantswarm + status: "False" + type: Ready + exp_annotations: + description: "Cluster giantswarm/grumpy is not ready." + opsrecipe: capi-cluster/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: ClusterPaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + name: grumpy + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "The cluster giantswarm/grumpy is paused." 
+ opsrecipe: capi-cluster/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-kubeadmcontrolplane.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-kubeadmcontrolplane.rules.test.yml new file mode 100644 index 000000000..2bcb3c23d --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-kubeadmcontrolplane.rules.test.yml @@ -0,0 +1,52 @@ +rule_files: + - capi-kubeadmcontrolplane.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_kubeadmcontrolplane_spec_replicas{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_status_replicas_ready{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_spec_replicas{cluster_name="clippaxy", name="clippaxy-72jzy", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_status_replicas_ready{cluster_name="clippaxy", name="clippaxy-72jzy", exported_namespace="giantswarm"}' + values: "0+2x100" + - series: 'capi_kubeadmcontrolplane_annotation_paused{paused_value="true",cluster_name="grumpy", name="grumpy-72r5c", exported_namespace="giantswarm"}' + values: "0+1x75" + alert_rule_test: + - alertname: KubeadmControlPlaneReplicasMismatch + eval_time: 100m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-72jzy + exported_namespace: giantswarm + exp_annotations: + description: "The clusters clippaxy kubeadmcontrolplane giantswarm/clippaxy-72jzy does not match the expected number of replicas for longer than 90 minutes." 
+ opsrecipe: capi-kubeadmcontrolplane/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: KubeadmControlPlanePaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: grumpy + name: grumpy-72r5c + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "The clusters grumpy kubeadmcontrolplane giantswarm/grumpy-72r5c is paused." + opsrecipe: capi-kubeadmcontrolplane/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machine.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machine.rules.test.yml new file mode 100644 index 000000000..e85606129 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machine.rules.test.yml @@ -0,0 +1,49 @@ +rule_files: + - capi-machine.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_machine_status_phase{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm", phase="Running"}' + values: "1+0x10 0+0x35" + - series: 'capi_machine_status_phase{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm", phase="Failed"}' + values: "0+0x10 1+0x35" + - series: 'capi_machine_annotation_paused{paused_value="true",cluster_name="grumpy", name="grumpy-72r5c", exported_namespace="giantswarm"}' + values: "0+1x75" + alert_rule_test: + - alertname: MachineUnhealthyPhase + eval_time: 45m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-72jq5 + exported_namespace: giantswarm + phase: Failed + exp_annotations: 
+ description: "Machine giantswarm/clippaxy-72jq5 stuck in phase Failed for more than 30 minutes." + opsrecipe: capi-machine/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: MachinePaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: grumpy + name: grumpy-72r5c + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "Machine giantswarm/grumpy-72r5c is paused." + opsrecipe: capi-machine/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinedeployment.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinedeployment.rules.test.yml new file mode 100644 index 000000000..9d9c1d913 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinedeployment.rules.test.yml @@ -0,0 +1,47 @@ +rule_files: + - capi-machinedeployment.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_machinedeployment_status_phase{phase="Failed", cluster_name="clippaxy", name="clippaxy-def00", exported_namespace="giantswarm"}' + values: "0+3x75" + - series: 'capi_machinedeployment_annotation_paused{paused_value="true",cluster_name="grumpy", name="grumpy-def99", exported_namespace="giantswarm"}' + values: "0+1x75" + alert_rule_test: + - alertname: MachineDeploymentIsNotHealthy + eval_time: 25m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + phase: Failed + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-def00 + exported_namespace: giantswarm + exp_annotations: + description: "The clusters clippaxy machinedeployment 
giantswarm/clippaxy-def00 is not healthy." + opsrecipe: capi-machinedeployment/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: MachineDeploymentPaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: grumpy + name: grumpy-def99 + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "The clusters grumpy machinedeployment giantswarm/grumpy-def99 is paused." + opsrecipe: capi-machinedeployment/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinepool.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinepool.rules.test.yml new file mode 100644 index 000000000..70f519087 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinepool.rules.test.yml @@ -0,0 +1,47 @@ +rule_files: + - capi-machinepool.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_machinepool_status_phase{phase="Failed", cluster_name="clippaxy", name="clippaxy-def00", exported_namespace="giantswarm"}' + values: "0+3x75" + - series: 'capi_machinepool_annotation_paused{paused_value="true",cluster_name="grumpy", name="grumpy-72r5c", exported_namespace="giantswarm"}' + values: "0+1x75" + alert_rule_test: + - alertname: MachinePoolIsNotHealthy + eval_time: 25m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: page + phase: Failed + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-def00 + exported_namespace: giantswarm + exp_annotations: + description: "The clusters clippaxy machinepool giantswarm/clippaxy-def00 is not healthy." 
+ opsrecipe: capi-machinepool/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: MachinePoolPaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: grumpy + name: grumpy-72r5c + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "The clusters grumpy machinepool giantswarm/grumpy-72r5c is paused." + opsrecipe: capi-machinepool/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machineset.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machineset.rules.test.yml new file mode 100644 index 000000000..d41639d87 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machineset.rules.test.yml @@ -0,0 +1,27 @@ +rule_files: + - capi-machineset.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_machineset_annotation_paused{paused_value="true",cluster_name="grumpy", name="grumpy-def99", exported_namespace="giantswarm"}' + values: "0+1x75" + alert_rule_test: + - alertname: MachineSetPaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: grumpy + name: grumpy-def99 + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "Machineset giantswarm/grumpy-def99 is paused." 
+ opsrecipe: capi-machineset/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi.rules.test.yml new file mode 100644 index 000000000..c07f91b54 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi.rules.test.yml @@ -0,0 +1,91 @@ +rule_files: + - capi.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_machine_status_phase{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm", phase="Running"}' + values: "1+0x10 0+0x35" + - series: 'capi_machine_status_phase{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm", phase="Failed"}' + values: "0+0x10 1+0x35" + alert_rule_test: + - alertname: MachineUnhealthyPhase + eval_time: 45m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-72jq5 + exported_namespace: giantswarm + phase: Failed + exp_annotations: + description: "Machine giantswarm/clippaxy-72jq5 stuck in phase Failed for more than 30 minutes." + - interval: 1m + input_series: + - series: 'capi_machinepool_status_phase{phase="Failed", cluster_name="clippaxy", name="clippaxy-def00", exported_namespace="giantswarm"}' + values: "0+3x75" + alert_rule_test: + - alertname: MachinePoolIsNotHealthy + eval_time: 25m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-def00 + exported_namespace: giantswarm + exp_annotations: + description: "The clusters clippaxy machinepool giantswarm/clippaxy-def00 is not healthy." 
+ - interval: 1m + input_series: + - series: 'capi_kubeadmcontrolplane_spec_replicas{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_status_replicas_ready{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_spec_replicas{cluster_name="clippaxy", name="clippaxy-72jzy", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_status_replicas_ready{cluster_name="clippaxy", name="clippaxy-72jzy", exported_namespace="giantswarm"}' + values: "0+2x100" + alert_rule_test: + - alertname: KubeadmControlPlaneReplicasMismatch + eval_time: 100m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-72jzy + exported_namespace: giantswarm + exp_annotations: + description: "The clusters clippaxy kubeadmcontrolplane giantswarm/clippaxy-72jzy does not match the expected number of replicas for longer than 90 minutes." + - interval: 1m + input_series: + - series: 'capi_cluster_status_phase{name="clippaxy", exported_namespace="giantswarm", phase="Provisioned"}' + values: "1+0x75" + - series: 'capi_cluster_status_phase{name="clippaxy", exported_namespace="giantswarm", phase="Pending"}' + values: "1+0x75" + alert_rule_test: + - alertname: ClusterUnhealthyPhase + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + name: clippaxy + exported_namespace: giantswarm + phase: Pending + exp_annotations: + description: "Cluster giantswarm/clippaxy is in a non healthy phase." 
diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/certificate.all.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/certificate.all.rules.test.yml new file mode 100644 index 000000000..078f75d79 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/certificate.all.rules.test.yml @@ -0,0 +1,94 @@ +--- +rule_files: + - certificate.all.rules.yml + +tests: + # CertificateSecretWillExpireInLessThanTwoWeeks within 2 weeks of expiration + - interval: 1d + input_series: + - series: 'cert_exporter_secret_not_after{app="cert-exporter-deployment", cluster_id="gollem", cluster_type="management_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="gollem-prometheus/workload-gollem/0", name="athena-certs-secret", namespace="giantswarm", exported_namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-5c47b4c55c-49wt9", provider="aws", secretkey="tls.crt", service_priority="highest"}' + values: "2678400x60" + alert_rule_test: + - alertname: CertificateSecretWillExpireInLessThanTwoWeeks + eval_time: 20d + exp_alerts: + - exp_labels: + alertname: CertificateSecretWillExpireInLessThanTwoWeeks + app: cert-exporter-deployment + area: kaas + cancel_if_outside_working_hours: "true" + cluster_id: gollem + cluster_type: management_cluster + container: cert-exporter + customer: giantswarm + exported_namespace: giantswarm + instance: 10.0.0.0:1234 + job: gollem-prometheus/workload-gollem/0 + namespace: giantswarm + node: 10.0.0.0 + organization: giantswarm + pod: cert-exporter-deployment-5c47b4c55c-49wt9 + provider: aws + name: athena-certs-secret + installation: gollem + service_priority: highest + severity: page + secretkey: tls.crt + team: phoenix + topic: cert-manager + exp_annotations: + description: "Certificate stored in Secret giantswarm/athena-certs-secret on gollem will expire in less than two weeks." 
+ opsrecipe: "managed-app-cert-manager/certificate-secret-will-expire-in-less-than-two-weeks/" + # CertificateSecretWillExpireInLessThanTwoWeeks not within 2 weeks of expiration + - interval: 1d + input_series: + - series: 'cert_exporter_secret_not_after{app="cert-exporter-deployment", cluster_id="gollem", cluster_type="management_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="gollem-prometheus/workload-gollem/0", name="athena-certs-secret", namespace="giantswarm", exported_namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-5c47b4c55c-49wt9", provider="aws", secretkey="tls.crt", service_priority="highest"}' + values: "2678400x60" + alert_rule_test: + - alertname: CertificateSecretWillExpireInLessThanTwoWeeks + eval_time: 10d + # GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks within 2 weeks of expiration + - interval: 1d + input_series: + - series: 'cert_exporter_certificate_cr_not_after{app="cert-exporter-deployment", cluster_id="12345", cluster_type="workload_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", issuer_ref="kiam-ca-issuer", job="12345-prometheus/workload-12345/0", managed_issuer="true", name="kiam-agent", namespace="kube-system", exported_namespace="kube-system", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-57bbbfd856-8r8dr", provider="aws", service_priority="highest"}' + values: "2678400x60" + alert_rule_test: + - alertname: GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks + eval_time: 20d + exp_alerts: + - exp_labels: + alertname: GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks + app: cert-exporter-deployment + area: kaas + cancel_if_outside_working_hours: "true" + cluster_id: 12345 + cluster_type: workload_cluster + container: cert-exporter + customer: giantswarm + exported_namespace: kube-system + instance: 
10.0.0.0:1234 + job: 12345-prometheus/workload-12345/0 + namespace: kube-system + node: 10.0.0.0 + organization: giantswarm + pod: cert-exporter-deployment-57bbbfd856-8r8dr + provider: aws + name: kiam-agent + installation: gollem + service_priority: highest + severity: page + team: phoenix + topic: cert-manager + issuer_ref: kiam-ca-issuer + managed_issuer: "true" + exp_annotations: + description: "Certificate CR kube-system/kiam-agent on 12345 will expire in less than two weeks." + opsrecipe: "managed-app-cert-manager/certificate-secret-will-expire-in-less-than-two-weeks/" + # GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks not within 2 weeks of expiration + - interval: 1d + input_series: + - series: 'cert_exporter_certificate_cr_not_after{app="cert-exporter-deployment", cluster_id="12345", cluster_type="workload_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", issuer_ref="kiam-ca-issuer", job="12345-prometheus/workload-12345/0", managed_issuer="true", name="kiam-agent", namespace="kube-system", exported_namespace="kube-system", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-57bbbfd856-8r8dr", provider="aws", service_priority="highest"}' + values: "2678400x60" + alert_rule_test: + - alertname: GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks + eval_time: 10d diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/node-exporter.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/node-exporter.rules.test.yml new file mode 100644 index 000000000..786acc105 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/node-exporter.rules.test.yml @@ -0,0 +1,38 @@ +--- +rule_files: + - node-exporter.rules.yml + +tests: + # NodeExporterCollectorFailed tests + - interval: 1m + input_series: + # No data for 20 minutes, then all good, then cpu collector fails, then bonding collector fails + - series: 
'node_scrape_collector_success{app="node-exporter", collector="cpu", instance="10.0.5.111:10300"}' + values: "_x20 1+0x20 0+0x20 1+0x20" + - series: 'node_scrape_collector_success{app="node-exporter", collector="bonding", instance="10.0.5.111:10300"}' + values: "_x20 1+0x20 1+0x20 0+0x20" + alert_rule_test: + - alertname: NodeExporterCollectorFailed + eval_time: 10m + - alertname: NodeExporterCollectorFailed + eval_time: 30m + - alertname: NodeExporterCollectorFailed + eval_time: 50m + exp_alerts: + - exp_labels: + alertname: NodeExporterCollectorFailed + app: "node-exporter" + area: "kaas" + cancel_if_outside_working_hours: "true" + collector: "cpu" + instance: "10.0.5.111:10300" + severity: "page" + team: "phoenix" + topic: "observability" + exp_annotations: + description: "NodeExporter Collector cpu on 10.0.5.111:10300 is failed." + opsrecipe: "node-exporter-device-error/" + - alertname: NodeExporterCollectorFailed + eval_time: 70m + + diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/grafana-cloud.test.yml similarity index 99% rename from test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml rename to test/tests/providers/capi/capa/platform/atlas/alerting-rules/grafana-cloud.test.yml index ee5645cf0..79c5aa0f1 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml +++ b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/grafana-cloud.test.yml @@ -1,6 +1,6 @@ --- rule_files: -- mimir-to-grafana-cloud-exporter.rules.yml +- grafana-cloud.rules.yml tests: # Tests for `MimirToGrafanaCloudExporterDown` alert diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml 
similarity index 94% rename from test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml rename to test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml index 37d40af1d..6bdfeaeab 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -86,35 +86,6 @@ tests: dashboard: ffcd83628d7d4b5a03d1cafd159e6c9c/mimir-overview description: "Mimir component : mimir-ingester is down." opsrecipe: "mimir/" - - interval: 1m - input_series: - # test with 1 pod: none, up, down - - series: 'up{job="alloy-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="mimir"}' - values: "_x20 1+0x70 0+0x70" - alert_rule_test: - - alertname: AlloyForPrometheusRulesDown - eval_time: 10m - - alertname: AlloyForPrometheusRulesDown - eval_time: 80m - - alertname: AlloyForPrometheusRulesDown - eval_time: 160m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cluster_id: golem - installation: golem - provider: capa - pipeline: testing - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Alloy sending PrometheusRules to Mimir ruler is down." 
- opsrecipe: "prometheus-rules/" - interval: 1m input_series: # test: none, rate > 0, rate = 0 diff --git a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index f539b2347..bd05e856f 100644 --- a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -4,98 +4,6 @@ rule_files: - prometheus-agent.rules.yml tests: - # Tests for `PrometheusAgentFailing` alert - - interval: 1m - input_series: - - series: 'up{instance="prometheus-agent",cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", job="prometheus-agent"}' - values: "_x60 0+0x60 1+0x60" - - series: 'capi_cluster_status_condition{cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", status="True", type="ControlPlaneReady", name="gauss"}' - values: "1+0x180" - alert_rule_test: - - alertname: PrometheusAgentFailing - eval_time: 30m - exp_alerts: - - exp_labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - instance: prometheus-agent - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." 
- - alertname: InhibitionPrometheusAgentFailing - eval_time: 30m - exp_alerts: - - exp_labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - instance: prometheus-agent - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: PrometheusAgentFailing - eval_time: 90m - exp_alerts: - - exp_labels: - area: platform - cluster_id: gauss - cluster_type: workload_cluster - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - installation: myinstall - instance: prometheus-agent - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: InhibitionPrometheusAgentFailing - eval_time: 90m - exp_alerts: - - exp_labels: - area: platform - cluster_id: gauss - cluster_type: workload_cluster - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - installation: myinstall - instance: prometheus-agent - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." 
- opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: PrometheusAgentFailing - eval_time: 150m - - alertname: InhibitionPrometheusAgentFailing - eval_time: 150m # Tests for `PrometheusAgentShardsMissing` alert - interval: 1m input_series: diff --git a/test/tests/providers/capi/capa/platform/shield/alerting-rules/cert-manager.rules.test.yml b/test/tests/providers/capi/capa/platform/shield/alerting-rules/cert-manager.rules.test.yml new file mode 100644 index 000000000..00167d085 --- /dev/null +++ b/test/tests/providers/capi/capa/platform/shield/alerting-rules/cert-manager.rules.test.yml @@ -0,0 +1,46 @@ +--- +rule_files: + - cert-manager.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'up{cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="golem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="capa", service_priority="highest"}' + values: "0+0x60" + alert_rule_test: + - alertname: CertManagerDown + eval_time: 15m + exp_alerts: + - exp_labels: + alertname: CertManagerDown + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "true" + cluster_id: 12345 + cluster_type: workload_cluster + container: cert-manager + customer: giantswarm + instance: 10.0.0.0:1234 + ip: 10.0.0.0 + job: 12345-prometheus/workload-12345/0 + namespace: kube-system + organization: giantswarm + pod: cert-manager-controller-7fcc585578-gnprd + provider: capa + installation: golem + service_priority: highest + severity: page + team: shield + topic: cert-manager + exp_annotations: + description: "cert-manager in namespace kube-system is down." 
+ opsrecipe: "cert-manager-down/" + - interval: 1m + input_series: + - series: 'up{cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="golem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="capa", service_priority="highest"}' + values: "1+0x60" + alert_rule_test: + - alertname: CertManagerDown + eval_time: 15m diff --git a/test/tests/providers/capi/capa/platform/shield/alerting-rules/teleport.rules.test.yml b/test/tests/providers/capi/capa/platform/shield/alerting-rules/teleport.rules.test.yml new file mode 100644 index 000000000..2ab1f7c20 --- /dev/null +++ b/test/tests/providers/capi/capa/platform/shield/alerting-rules/teleport.rules.test.yml @@ -0,0 +1,59 @@ +--- +rule_files: + - 'teleport.rules.yml' + +tests: + - interval: 1m + input_series: + - series: 'kube_secret_created{cluster_id="my-cluster", installation="golem", secret="my-cluster-teleport-join-token"}' + values: "1+0x150" + - series: 'capi_cluster_status_phase{cluster_id="my-cluster", installation="golem", phase="Provisioned"}' + values: "2+0x150" + alert_rule_test: + - alertname: TeleportJoinTokenSecretMismatch + eval_time: 30m + exp_alerts: [] + - alertname: TeleportJoinTokenSecretMismatch + eval_time: 140m + exp_alerts: + - exp_labels: + alertname: TeleportJoinTokenSecretMismatch + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + cluster_id: my-cluster + installation: golem + severity: notify + team: shield + topic: teleport + exp_annotations: + description: "Mismatch in number of teleport-join-token secrets and clusters" + - interval: 1m + input_series: + - series: 'kube_configmap_info{cluster_id="my-cluster", installation="grizzly", 
configmap="my-cluster-teleport-kube-agent-config"}' + values: "1+0x150" + - series: 'capi_cluster_status_phase{cluster_id="my-cluster", installation="grizzly", phase="Provisioned"}' + values: "2+0x150" + alert_rule_test: + - alertname: TeleportKubeAgentConfigMapMismatch + eval_time: 30m + exp_alerts: [] + - alertname: TeleportKubeAgentConfigMapMismatch + eval_time: 140m + exp_alerts: + - exp_labels: + alertname: TeleportKubeAgentConfigMapMismatch + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + cluster_id: my-cluster + installation: grizzly + severity: notify + team: shield + topic: teleport + exp_annotations: + description: "Mismatch in number of teleport-kube-agent-config secrets and clusters" diff --git a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/grafana-cloud.test.yml b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/grafana-cloud.test.yml new file mode 100644 index 000000000..79c5aa0f1 --- /dev/null +++ b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/grafana-cloud.test.yml @@ -0,0 +1,156 @@ +--- +rule_files: +- grafana-cloud.rules.yml + +tests: + # Tests for `MimirToGrafanaCloudExporterDown` alert + - interval: 1m + input_series: + - series: 'up{job="mimir/mimir-to-grafana-cloud", cluster_id="myinstall", cluster_type="management_cluster", installation="myinstall", namespace="mimir", customer="giantswarm", pipeline="stable", provider="capa", region="eu-west-2"}' + values: "_x60 1+0x60 0+0x60 1+0x60" + alert_rule_test: + - alertname: MimirToGrafanaCloudExporterDown + eval_time: 50m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: myinstall + cluster_type: management_cluster + installation: 
myinstall + job: mimir/mimir-to-grafana-cloud + pipeline: stable + provider: capa + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "iWowmlSmk/prometheus?var-cluster=mimir-to-grafana-cloud" + description: "Prometheus Mimir to Grafana-Cloud is down." + opsrecipe: "mimir-grafana-cloud-exporter-failing/" + - alertname: MimirToGrafanaCloudExporterDown + eval_time: 70m + - alertname: MimirToGrafanaCloudExporterDown + eval_time: 160m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: myinstall + cluster_type: management_cluster + customer: giantswarm + installation: myinstall + job: mimir/mimir-to-grafana-cloud + namespace: mimir + pipeline: stable + provider: capa + region: eu-west-2 + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "iWowmlSmk/prometheus?var-cluster=mimir-to-grafana-cloud" + description: "Prometheus Mimir to Grafana-Cloud is down." 
+ opsrecipe: "mimir-grafana-cloud-exporter-failing/" + - alertname: MimirToGrafanaCloudExporterDown + eval_time: 200m + # Tests for `MimirToGrafanaCloudExporterFailures` alert + - interval: 1m + input_series: + # remote read is working for 2 hours and then fails for 1 hour + - series: 'prometheus_remote_storage_read_queries_total{code="200", job="mimir/mimir-to-grafana-cloud", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "_x60 0+10x60 0+0x60 0+10x180" + # remote write has no failure for 4 hours and then fails for 2 hours + - series: 'prometheus_remote_storage_samples_failed_total{job="mimir/mimir-to-grafana-cloud", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "_x60 0+0x180 0+10x120" + alert_rule_test: + - alertname: MimirToGrafanaCloudExporterFailures + eval_time: 70m + - alertname: MimirToGrafanaCloudExporterFailures + eval_time: 160m + exp_alerts: + - exp_labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: "myinstall" + installation: "myinstall" + pipeline: "testing" + provider: "capa" + exp_annotations: + dashboard: "promRW001/prometheus-remote-write" + description: "Prometheus Mimir to Grafana-Cloud is failing to read or write data." 
+ opsrecipe: "mimir-grafana-cloud-exporter-failing/" + - alertname: MimirToGrafanaCloudExporterFailures + eval_time: 200m + - alertname: MimirToGrafanaCloudExporterFailures + eval_time: 280m + exp_alerts: + - exp_labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: "myinstall" + installation: "myinstall" + pipeline: "testing" + provider: "capa" + exp_annotations: + dashboard: "promRW001/prometheus-remote-write" + description: "Prometheus Mimir to Grafana-Cloud is failing to read or write data." + opsrecipe: "mimir-grafana-cloud-exporter-failing/" + # Tests for `MimirToGrafanaCloudExporterTooManyRestarts` alert + - interval: 1m + input_series: + # the pod has no data for 1 hour, is ready for 1 hour, then is recreated 4 times in a row (a new uid every 2 minutes) before staying ready + - series: 'kube_pod_status_ready{condition="true", uid="0bb4e0cc-12df-4085-8d39-8e08b9c64ea5", pod="prometheus-mimir-to-grafana-cloud-0", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "_x60 1+0x60 _x80" + - series: 'kube_pod_status_ready{condition="true", uid="0bb4e0cc-12df-4085-8d39-8e08b9c64ea6", pod="prometheus-mimir-to-grafana-cloud-0", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "_x122 1+0x2 _x78" + - series: 'kube_pod_status_ready{condition="true", uid="0bb4e0cc-12df-4085-8d39-8e08b9c64ea7", pod="prometheus-mimir-to-grafana-cloud-0", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "_x124 1+0x2 _x76" + - series: 'kube_pod_status_ready{condition="true", uid="0bb4e0cc-12df-4085-8d39-8e08b9c64ea8", 
pod="prometheus-mimir-to-grafana-cloud-0", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "_x126 1+0x2 _x74" + - series: 'kube_pod_status_ready{condition="true", uid="0bb4e0cc-12df-4085-8d39-8e08b9c64ea9", pod="prometheus-mimir-to-grafana-cloud-0", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "_x128 1+0x72" + alert_rule_test: + - alertname: MimirToGrafanaCloudExporterTooManyRestarts + eval_time: 70m + - alertname: MimirToGrafanaCloudExporterTooManyRestarts + eval_time: 140m + exp_alerts: + - exp_labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + pod: "prometheus-mimir-to-grafana-cloud-0" + cluster_id: "myinstall" + installation: "myinstall" + pipeline: "testing" + provider: "capa" + exp_annotations: + dashboard: "promRW001/prometheus-remote-write" + description: "Prometheus Mimir to Grafana-Cloud is restarting too much." 
+ opsrecipe: "mimir-grafana-cloud-exporter-failing/" + - alertname: MimirToGrafanaCloudExporterTooManyRestarts + eval_time: 180m diff --git a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml new file mode 100644 index 000000000..6bdfeaeab --- /dev/null +++ b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -0,0 +1,392 @@ +--- +rule_files: + - mimir.rules.yml + +tests: + - interval: 1m + input_series: + # For the first 60min: test with 1 pod: up, none, up, down, up + - series: 'up{job="mimir/ingester", container="ingester"}' + values: "1+0x60 _x30 1+0x30 0+0x30 1+0x30" + alert_rule_test: + - alertname: Heartbeat + eval_time: 20m + exp_alerts: + - exp_labels: + area: platform + job: mimir/ingester + container: ingester + installation: myinstall + team: atlas + topic: observability + type: mimir-heartbeat + exp_annotations: + description: "This alert is used to ensure the entire alerting pipeline is functional." + opsrecipe: "mimir/" + - alertname: Heartbeat + eval_time: 70m + - alertname: Heartbeat + eval_time: 95m + exp_alerts: + - exp_labels: + area: platform + job: mimir/ingester + container: ingester + installation: myinstall + team: atlas + topic: observability + type: mimir-heartbeat + exp_annotations: + description: "This alert is used to ensure the entire alerting pipeline is functional." + opsrecipe: "mimir/" + - alertname: Heartbeat + eval_time: 140m + - alertname: Heartbeat + eval_time: 165m + exp_alerts: + - exp_labels: + area: platform + job: mimir/ingester + container: ingester + installation: myinstall + team: atlas + topic: observability + type: mimir-heartbeat + exp_annotations: + description: "This alert is used to ensure the entire alerting pipeline is functional." 
+ opsrecipe: "mimir/" + - interval: 1m + input_series: + # For the first 60min: test with 1 pod: none, up, down + - series: 'up{job="mimir/ingester", container="ingester", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", service="mimir-ingester"}' + values: "_x20 1+0x20 0+0x20" + alert_rule_test: + - alertname: MimirComponentDown + eval_time: 10m + - alertname: MimirComponentDown + eval_time: 30m + - alertname: MimirComponentDown + eval_time: 50m + exp_alerts: + - exp_labels: + service: mimir-ingester + area: platform + severity: page + team: atlas + topic: observability + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + exp_annotations: + dashboard: ffcd83628d7d4b5a03d1cafd159e6c9c/mimir-overview + description: "Mimir component : mimir-ingester is down." + opsrecipe: "mimir/" + - interval: 1m + input_series: + # test: none, rate > 0, rate = 0 + - series: 'mimir_rules_events_failed_total{cluster_type="management_cluster", cluster_id="golem", installation="golem", namespace="mimir"}' + values: "_x20 1+1x80 0+0x70" + alert_rule_test: + - alertname: MimirRulerEventsFailed + eval_time: 40m + - alertname: MimirRulerEventsFailed + eval_time: 95m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: golem + cluster_type: management_cluster + installation: golem + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: 631e15d5d85afb2ca8e35d62984eeaa0/mimir-ruler + description: "Mimir ruler is failing to process PrometheusRules." 
+ opsrecipe: "mimir/" + - alertname: MimirRulerEventsFailed + eval_time: 160m + - interval: 1m + input_series: + - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container="mimir-ingester"}' + values: "0+0x20 0+5x20 100+0x140" # no restarts for the first 20 minutes, then 5 restarts per minute for 20 minutes, then no further restarts for 140 minutes + - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container="prometheus"}' + values: "0+5x180" # prometheus container restarts 5 times per minute for 180 minutes + alert_rule_test: + - alertname: MimirRestartingTooOften + eval_time: 15m # should be OK after 15 minutes + - alertname: MimirRestartingTooOften + eval_time: 85m # After 85 minutes, should fire an alert for the t+85 error + exp_alerts: + - exp_labels: + all_pipelines: "true" + area: platform + cancel_if_outside_working_hours: "true" + cluster_type: management_cluster + container: mimir-ingester + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: ffcd83628d7d4b5a03d1cafd159e6c9c/mimir-overview + description: Mimir containers are restarting too often. + opsrecipe: "mimir/" + - alertname: MimirRestartingTooOften + eval_time: 140m # After 140 minutes, all should be back to normal + # Test for MimirIngesterNeedsToBeScaledUp alert + - interval: 1m + input_series: + # mimir-ingester real memory usage gradually increases until it goes beyond 90% of the memory requests. 
+ - series: 'container_memory_working_set_bytes{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "8+0x20 11+0x70 8+0x140 11+0x70 8+0x60" + - series: 'container_memory_working_set_bytes{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "8+0x20 11+0x70 8+0x140 11+0x70 8+0x60" + # mimir-ingester memory requests stay the same for the entire duration of the test. + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "12+0x400" + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "12+0x400" + # mimir-ingester real cpu usage gradually increases until it goes beyond 90% of the cpu requests. 
+ - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "0+60x100 6000+110x70 10400+60x60 14000+110x70 18400+60x60" + - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "0+60x400" + # mimir-ingester cpu requests stay the same for the entire duration of the test. + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "1.5+0x400" + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "1.5+0x400" + alert_rule_test: + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 15m + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 85m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: golem + installation: "golem" + pipeline: "testing" + provider: "capa" + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + description: Mimir ingester is consuming too much resources and needs to be scaled up. 
+ opsrecipe: "mimir-ingester/" + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 130m + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 170m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: golem + installation: "golem" + pipeline: "testing" + provider: "capa" + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + description: Mimir ingester is consuming too much resources and needs to be scaled up. + opsrecipe: "mimir-ingester/" + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 210m + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 295m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: golem + installation: "golem" + pipeline: "testing" + provider: "capa" + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + description: Mimir ingester is consuming too much resources and needs to be scaled up. + opsrecipe: "mimir-ingester/" + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 350m + # Test for MimirIngesterNeedsToBeScaledDown alert + - interval: 1m + input_series: + # mimir-ingester real memory usage gradually decreases until it goes below 30% of the memory requests. 
+ - series: 'container_memory_working_set_bytes{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "8+0x20 2+0x40 8+0x140 2+0x40 8+0x60" + - series: 'container_memory_working_set_bytes{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "8+0x20 2+0x40 8+0x140 2+0x40 8+0x60" + # mimir-ingester memory requests stay the same for the entire duration of the test. + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "12+0x300" + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "12+0x300" + # mimir-ingester real cpu usage gradually decreases until it goes below 30% of the cpu requests. 
+ - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "0+60x100 6000+10x40 6400+60x60 10000+10x40 10400+60x60" + - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "0+30x300" + # mimir-ingester cpu requests stay the same for the entire duration of the test + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "1.5+0x300" + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "1.5+0x300" + alert_rule_test: + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 15m + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 55m + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 100m + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 135m + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 180m + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 240m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: golem + installation: "golem" + pipeline: "testing" + provider: "capa" + namespace: mimir + severity: page + team: 
atlas + topic: observability + exp_annotations: + description: Mimir ingester is consuming very few resources and needs to be scaled down. + opsrecipe: "mimir-ingester/" + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 280m + # Test for MimirHPAReachedMaxReplicas alert + - interval: 1m + input_series: + # HPA max replicas = 3 for the whole test + # HPA target metric = 90% for the whole test + # Cases: + # desired_replicas < max_replicas AND current_utilization < target_utilization does not fire + # desired_replicas < max_replicas AND current_utilization = target_utilization does not fire + # desired_replicas < max_replicas AND current_utilization > target_utilization does not fire + # desired_replicas = max_replicas AND current_utilization < target_utilization does not fire + # desired_replicas = max_replicas AND current_utilization = target_utilization does not fire + # desired_replicas = max_replicas AND current_utilization > target_utilization does fire + # desired_replicas > max_replicas AND current_utilization < target_utilization does not fire + # desired_replicas > max_replicas AND current_utilization = target_utilization does not fire + # desired_replicas > max_replicas AND current_utilization > target_utilization does fire + - series: 'kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="mimir-distributor", namespace="mimir"}' + values: '3+0x360' + - series: 'kube_horizontalpodautoscaler_status_desired_replicas{horizontalpodautoscaler="mimir-distributor", namespace="mimir"}' + values: '2+0x120 3+0x120 4+0x120' + - series: 'kube_horizontalpodautoscaler_spec_target_metric{horizontalpodautoscaler="mimir-distributor", namespace="mimir", metric_name="cpu", metric_target_type="utilization"}' + values: '90+0x360' + # HPA current metric = 80% for 40mn, then 90% for 40mn, then 100% for 40mn; the cycle repeats for each desired_replicas value + - series: 'kube_horizontalpodautoscaler_status_target_metric{horizontalpodautoscaler="mimir-distributor", namespace="mimir", metric_name="cpu", 
metric_target_type="utilization"}' + values: '80+0x40 90+0x40 100+0x40 80+0x40 90+0x40 100+0x40 80+0x40 90+0x40 100+0x40' + alert_rule_test: + - alertname: MimirHPAReachedMaxReplicas + eval_time: 234m + - alertname: MimirHPAReachedMaxReplicas + eval_time: 235m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + horizontalpodautoscaler: mimir-distributor + namespace: mimir + exp_annotations: + description: "Mimir mimir-distributor HPA has reached maximum replicas and consume too much resources, it needs to be scaled up." + opsrecipe: "mimir-hpa/" + - alertname: MimirHPAReachedMaxReplicas + eval_time: 246m + - alertname: MimirHPAReachedMaxReplicas + eval_time: 360m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + horizontalpodautoscaler: mimir-distributor + namespace: mimir + exp_annotations: + description: "Mimir mimir-distributor HPA has reached maximum replicas and consume too much resources, it needs to be scaled up." 
+ opsrecipe: "mimir-hpa/" + # Test for MimirCompactorFailedCompaction alert + - interval: 1m + input_series: + - series: 'cortex_compactor_runs_failed_total{reason="error", installation="golem", cluster_id="golem", namespace="mimir", pipeline="testing", provider="capa"}' + values: "8+0x20 1+0x40 0+0x20 4+0x130 0+0x190" + alert_rule_test: + - alertname: MimirCompactorFailedCompaction + eval_time: 15m + - alertname: MimirCompactorFailedCompaction + eval_time: 55m + - alertname: MimirCompactorFailedCompaction + eval_time: 120m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: golem + installation: "golem" + pipeline: "testing" + provider: "capa" + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: 09a5c49e9cdb2f2b24c6d184574a07fd/mimir-compactor-resources + description: Mimir compactor has been failing its compactions for 2 hours. 
+ opsrecipe: "mimir#mimircompactorfailedcompaction" + - alertname: MimirCompactorFailedCompaction + eval_time: 205m + - alertname: MimirCompactorFailedCompaction + eval_time: 350m diff --git a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index e8ec81346..bd05e856f 100644 --- a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -4,98 +4,6 @@ rule_files: - prometheus-agent.rules.yml tests: - # Tests for `PrometheusAgentFailing` alert - - interval: 1m - input_series: - - series: 'up{instance="prometheus-agent",cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", job="prometheus-agent"}' - values: "_x60 0+0x60 1+0x60" - - series: 'capi_cluster_status_condition{ cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", status="True", type="ControlPlaneReady", name="gauss"}' - values: "1+0x180" - alert_rule_test: - - alertname: PrometheusAgentFailing - eval_time: 30m - exp_alerts: - - exp_labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - instance: prometheus-agent - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." 
- - alertname: InhibitionPrometheusAgentFailing - eval_time: 30m - exp_alerts: - - exp_labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - instance: prometheus-agent - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: PrometheusAgentFailing - eval_time: 90m - exp_alerts: - - exp_labels: - area: platform - cluster_id: gauss - cluster_type: workload_cluster - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - installation: myinstall - instance: prometheus-agent - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: InhibitionPrometheusAgentFailing - eval_time: 90m - exp_alerts: - - exp_labels: - area: platform - cluster_id: gauss - cluster_type: workload_cluster - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - installation: myinstall - instance: prometheus-agent - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." 
- opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: PrometheusAgentFailing - eval_time: 150m - - alertname: InhibitionPrometheusAgentFailing - eval_time: 150m # Tests for `PrometheusAgentShardsMissing` alert - interval: 1m input_series: diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml new file mode 100644 index 000000000..98549b422 --- /dev/null +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -0,0 +1,422 @@ +--- +rule_files: + - alloy.rules.yml + +tests: + # Test AlloySlowComponentEvaluations + - interval: 1m + input_series: + - series: 'alloy_component_evaluation_slow_seconds{cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", namespace="default", job="alloy-controller", component_id="comp1"}' + values: "0+0x10 0+1x50 0x50" + alert_rule_test: + - alertname: AlloySlowComponentEvaluations + eval_time: 10m + - alertname: AlloySlowComponentEvaluations + eval_time: 50m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + namespace: default + job: alloy-controller + component_id: comp1 + severity: notify + team: atlas + topic: observability + exp_annotations: + dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller + description: "Component evaluations are taking too long under job alloy-controller, component_id comp1." + opsrecipe: "alloy/" + summary: "Component evaluations are taking too long." 
+ - alertname: AlloySlowComponentEvaluations + eval_time: 80m + + # Test AlloyUnhealthyComponents + - interval: 1m + input_series: + - series: 'alloy_component_controller_running_components{health_type="unhealthy", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", namespace="default", job="alloy-controller"}' + values: "0+0x10 1+0x50 0x50" + alert_rule_test: + - alertname: AlloyUnhealthyComponents + eval_time: 10m + - alertname: AlloyUnhealthyComponents + eval_time: 30m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + namespace: default + job: alloy-controller + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller + description: "Unhealthy components detected under job alloy-controller" + opsrecipe: "alloy/" + summary: "Unhealthy components detected." 
+ - alertname: AlloyUnhealthyComponents + eval_time: 80m + + # Test AlloyForPrometheusRulesDown + - interval: 1m + input_series: + # test with 1 pod: none, up, down + - series: 'up{job="alloy-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="monitoring"}' + values: "_x20 1+0x70 0+0x70" + alert_rule_test: + - alertname: AlloyForPrometheusRulesDown + eval_time: 10m + - alertname: AlloyForPrometheusRulesDown + eval_time: 80m + - alertname: AlloyForPrometheusRulesDown + eval_time: 160m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: golem + installation: golem + provider: capa + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Alloy sending PrometheusRules to Loki and Mimir ruler is down." + opsrecipe: "prometheus-rules/" + + # Test LoggingAgentDown + - interval: 1m + input_series: + # For the first 80min: test with 1 pod: none, up, down + - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-logs-1xxxx", provider="aws", pipeline="testing"}' + values: "_x20 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-1xxxx", node="ip-10-0-5-1.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + # From 80min: test with 2 pods: 1 up and 1 down, 2 up, 2 down. 
+ - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-logs-2xxxx", provider="aws", pipeline="testing"}' + values: "_x80 1+0x40 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-2xxxx", node="ip-10-0-5-2.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + - series: 'up{container="alloy", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", job="alloy-logs", pod="alloy-logs-3xxxx", provider="aws", pipeline="testing"}' + values: "_x80 0+0x40 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-3xxxx", node="ip-10-0-5-3.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + alert_rule_test: + - alertname: LoggingAgentDown + eval_time: 10m + - alertname: LoggingAgentDown + eval_time: 30m + - alertname: LoggingAgentDown + eval_time: 71m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-1.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-1xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." 
+ opsrecipe: "alloy/" + # Tests with 2 pods + - alertname: LoggingAgentDown + eval_time: 111m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-3.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-3xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" + - alertname: LoggingAgentDown + eval_time: 121m + - alertname: LoggingAgentDown + eval_time: 180m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-2.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-2xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." 
+ opsrecipe: "alloy/" + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-3.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-3xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" + + # Test MonitoringAgentDown + - interval: 1m + input_series: + - series: 'up{job="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' + values: "_x40 1+0x50 0+0x70" + - series: 'capi_cluster_status_condition{type="ControlPlaneReady", status="True", name="gauss", installation="gauss", provider="aws", pipeline="testing"}' + values: "1x150" + alert_rule_test: + - alertname: MonitoringAgentDown + eval_time: 10m + - alertname: InhibitionMonitoringAgentDown + eval_time: 10m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." 
+ - alertname: MonitoringAgentDown + eval_time: 30m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." + - alertname: InhibitionMonitoringAgentDown + eval_time: 30m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." + - alertname: MonitoringAgentDown + eval_time: 80m + - alertname: InhibitionMonitoringAgentDown + eval_time: 80m + - alertname: MonitoringAgentDown + eval_time: 140m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." 
+ opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." + - alertname: InhibitionMonitoringAgentDown + eval_time: 140m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." + + # Test MonitoringAgentShardsNotSatisfied + - interval: 1m + input_series: + - series: 'kube_statefulset_status_replicas{statefulset="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' + values: "3+0x10 3+0x90 3+0x50" + - series: 'kube_statefulset_status_replicas_ready{statefulset="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' + values: "3+0x10 2+0x90 3+0x50" + alert_rule_test: + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 10m + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 30m + - alertname: InhibitionMonitoringAgentShardsNotSatisfied + eval_time: 30m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + statefulset: alloy-metrics + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + exp_annotations: + description: "At least one of the monitoring agent shard is missing." + summary: "Monitoring agent is missing shards." 
+ opsrecipe: "alloy/#monitoring-agent-down" + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 60m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + statefulset: alloy-metrics + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + exp_annotations: + description: "At least one of the monitoring agent shard is missing." + summary: "Monitoring agent is missing shards." + opsrecipe: "alloy/#monitoring-agent-down" + - alertname: InhibitionMonitoringAgentShardsNotSatisfied + eval_time: 60m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + statefulset: alloy-metrics + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + exp_annotations: + description: "At least one of the monitoring agent shard is missing." + summary: "Monitoring agent is missing shards." 
+ opsrecipe: "alloy/#monitoring-agent-down" + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 130m + - alertname: InhibitionMonitoringAgentShardsNotSatisfied + eval_time: 130m diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml new file mode 100644 index 000000000..fccbfa5a1 --- /dev/null +++ b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml @@ -0,0 +1,111 @@ +--- +rule_files: + - logging-pipeline.rules.yml + +tests: + # Test LogForwardingErrors + - interval: 1m + input_series: + # Tests with multiple cases: no metrics, no requests, only status_code 204 ones, 204 ones and 500 that are less than 10% of the total, 500 requests that represent more than 10% of the total, only 500 ones + - series: 'loki_write_request_duration_seconds_count{status_code="500", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", pod="alloy-2j7z7"}' + values: "_x60 0+0x60 0+0x60 0+50x60 3000+100x60 9000+600x60" + - series: 'loki_write_request_duration_seconds_count{status_code="204", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", pod="alloy-2j7z7"}' + values: "_x60 0+0x60 0+600x60 36000+600x60 72000+600x60 108000+0x60" + alert_rule_test: + - alertname: LogForwardingErrors + eval_time: 30m + - alertname: LogForwardingErrors + eval_time: 90m + - alertname: LogForwardingErrors + eval_time: 150m + - alertname: LogForwardingErrors + eval_time: 210m + - alertname: LogForwardingErrors + eval_time: 270m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" +
cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "More that 10% of the requests to Loki are failing." + opsrecipe: "logging-pipeline/" + - alertname: LogForwardingErrors + eval_time: 330m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "More that 10% of the requests to Loki are failing." + opsrecipe: "logging-pipeline/" + # Test LogReceivingErrors + - interval: 1m + input_series: + # Tests with multiple cases: no metrics, no requests, only status_code 204 ones, 204 ones and 500 that are less than 10% of the total, 500 requests that represent more than 10% of the total, only 500 ones + - series: 'loki_source_api_request_duration_seconds_count{status_code="500", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", route="api_v1_push", pod="alloy-2j7z7"}' + values: "_x60 0+0x60 0+0x60 0+50x60 3000+100x60 9000+600x60" + - series: 'loki_source_api_request_duration_seconds_count{status_code="204", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", route="api_v1_push", pod="alloy-2j7z7"}' + values: "_x60 0+0x60 0+600x60 36000+600x60 72000+600x60 108000+0x60" + alert_rule_test: + - alertname: LogReceivingErrors + eval_time: 30m + - alertname: LogReceivingErrors + eval_time: 90m + - alertname: LogReceivingErrors + eval_time: 150m + - alertname: LogReceivingErrors + eval_time: 210m + - alertname: LogReceivingErrors + eval_time: 270m + exp_alerts: + -
exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "More that 10% of the loki requests to the observability gateway are failing." + opsrecipe: "logging-pipeline/" + - alertname: LogReceivingErrors + eval_time: 330m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "More that 10% of the loki requests to the observability gateway are failing." + opsrecipe: "logging-pipeline/" diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/prometheus.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/monitoring-pipeline.rules.test.yml similarity index 58% rename from test/tests/providers/global/platform/atlas/alerting-rules/prometheus.rules.test.yml rename to test/tests/providers/global/platform/atlas/alerting-rules/monitoring-pipeline.rules.test.yml index 77cdd2167..ad97acbb7 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/prometheus.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/monitoring-pipeline.rules.test.yml @@ -1,13 +1,13 @@ --- rule_files: - - prometheus.rules.yml + - monitoring-pipeline.rules.yml # Setting evaluation interval to 1h # to make it faster on long test duration. 
evaluation_interval: 1h tests: - # Test PrometheusJobScrapingFailure and PrometheusCriticalJobScrapingFailure + # Test JobScrapingFailure and CriticalJobScrapingFailure - interval: 1h input_series: - series: 'up{job="apiserver", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing"}' @@ -30,14 +30,14 @@ tests: - series: 'up{job="app-exporter", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing"}' values: "1+0x120 0+0x120" alert_rule_test: - - alertname: PrometheusCriticalJobScrapingFailure + - alertname: CriticalJobScrapingFailure eval_time: 30m - - alertname: PrometheusJobScrapingFailure + - alertname: JobScrapingFailure eval_time: 1d - - alertname: PrometheusCriticalJobScrapingFailure + - alertname: CriticalJobScrapingFailure eval_time: 4d # This alert fires for both critical and non-critical targets - - alertname: PrometheusJobScrapingFailure + - alertname: JobScrapingFailure eval_time: 7d exp_alerts: - exp_labels: @@ -52,9 +52,10 @@ tests: pipeline: "testing" job: "kube-controller-manager" exp_annotations: - opsrecipe: "prometheus-job-scraping-failure/" - summary: "Prometheus fails to scrape all targets in a job." - description: "Prometheus gauss/gauss has failed to scrape all targets in kube-controller-manager job." + dashboard: servicemonitors-details/servicemonitors-details + opsrecipe: "monitoring-job-scraping-failure/" + summary: "Monitoring agent failed to scrape all targets in a job." + description: "Monitoring agents for cluster gauss/gauss has failed to scrape all targets in kube-controller-manager job." - exp_labels: area: platform severity: notify @@ -67,12 +68,13 @@ tests: pipeline: "testing" job: "app-exporter" exp_annotations: - opsrecipe: "prometheus-job-scraping-failure/" - summary: "Prometheus fails to scrape all targets in a job." - description: "Prometheus gauss/gauss has failed to scrape all targets in app-exporter job." 
- + dashboard: servicemonitors-details/servicemonitors-details + opsrecipe: "monitoring-job-scraping-failure/" + summary: "Monitoring agent failed to scrape all targets in a job." + description: "Monitoring agents for cluster gauss/gauss has failed to scrape all targets in app-exporter job." + # This fires only for critical target down. - - alertname: PrometheusCriticalJobScrapingFailure + - alertname: CriticalJobScrapingFailure eval_time: 9d exp_alerts: - exp_labels: @@ -90,6 +92,30 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" exp_annotations: - opsrecipe: "prometheus-job-scraping-failure/" - summary: "Prometheus fails to scrape all targets in a job." - description: "Prometheus gauss/gauss has failed to scrape all targets in kube-controller-manager job." + dashboard: servicemonitors-details/servicemonitors-details + opsrecipe: "monitoring-job-scraping-failure/" + summary: "Monitoring agent failed to scrape all targets in a job." + description: "Monitoring agents for cluster gauss/gauss has failed to scrape all targets in kube-controller-manager job." + + + # Test MetricForwardingErrors + - interval: 1m + input_series: + # remote write has no failure for 1 hour and then fails for 2 hours + - series: 'prometheus_remote_storage_samples_failed_total{url="http://remote-storage_samples_failed_total"}' + values: "0+0x60 0+100x120" + alert_rule_test: + - alertname: MetricForwardingErrors + eval_time: 180m + exp_alerts: + - exp_labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + url: "http://remote-storage_samples_failed_total" + exp_annotations: + description: "Monitoring agent can't communicate with Remote Storage API at http://remote-storage_samples_failed_total." 
+ opsrecipe: "monitoring-pipeline/" + dashboard: "promRW001/prometheus-remote-write" diff --git a/test/tests/providers/global/platform/honeybadger/alerting-rules/app.rules.test.yml b/test/tests/providers/global/platform/honeybadger/alerting-rules/app.rules.test.yml index 0c97be969..86aeb0052 100644 --- a/test/tests/providers/global/platform/honeybadger/alerting-rules/app.rules.test.yml +++ b/test/tests/providers/global/platform/honeybadger/alerting-rules/app.rules.test.yml @@ -17,6 +17,7 @@ tests: app: cilium app_version: 1.11.2 area: platform + cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true"