diff --git a/.github/workflows/alert_tests.yaml b/.github/workflows/alert_tests.yaml index ce5eb77b6..ecd515b38 100644 --- a/.github/workflows/alert_tests.yaml +++ b/.github/workflows/alert_tests.yaml @@ -7,7 +7,7 @@ jobs: promtool-unit-tests: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: "0" - name: run promtool unit tests @@ -15,7 +15,7 @@ jobs: inhibition-tests: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: "0" - name: run inhibition tests diff --git a/.github/workflows/zz_generated.check_values_schema.yaml b/.github/workflows/zz_generated.check_values_schema.yaml index 436f44c35..c450aeeaa 100644 --- a/.github/workflows/zz_generated.check_values_schema.yaml +++ b/.github/workflows/zz_generated.check_values_schema.yaml @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.5.0 +# devctl@6.9.0 # name: 'Values and schema' on: diff --git a/.github/workflows/zz_generated.create_release.yaml b/.github/workflows/zz_generated.create_release.yaml index 5c3f8e903..57c5dd5ca 100644 --- a/.github/workflows/zz_generated.create_release.yaml +++ b/.github/workflows/zz_generated.create_release.yaml @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.5.0 +# devctl@6.9.0 # name: Create Release on: @@ -15,7 +15,7 @@ on: jobs: debug_info: name: Debug info - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Print github context JSON run: | @@ -24,7 +24,7 @@ jobs: EOF gather_facts: name: Gather facts - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 outputs: project_go_path: ${{ steps.get_project_go_path.outputs.path }} ref_version: ${{ steps.ref_version.outputs.refversion }} @@ -84,7 +84,7 @@ jobs: echo "refversion=${refversion}" >> $GITHUB_OUTPUT update_project_go: name: Update project.go - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 if: ${{ needs.gather_facts.outputs.version != '' && needs.gather_facts.outputs.project_go_path != '' && needs.gather_facts.outputs.ref_version != 'true' }} needs: - gather_facts @@ -146,7 +146,7 @@ jobs: hub pull-request -f -m "${{ env.title }}" -b ${{ env.base }} -h ${{ env.branch }} -r ${{ github.actor }} create_release: name: Create release - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: - gather_facts if: ${{ needs.gather_facts.outputs.version }} @@ -194,7 +194,7 @@ jobs: create-release-branch: name: Create release branch - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: - gather_facts if: ${{ needs.gather_facts.outputs.version }} diff --git a/.github/workflows/zz_generated.create_release_pr.yaml b/.github/workflows/zz_generated.create_release_pr.yaml index 207cea03f..6f07166ea 100644 --- a/.github/workflows/zz_generated.create_release_pr.yaml +++ b/.github/workflows/zz_generated.create_release_pr.yaml @@ -1,6 +1,6 @@ # DO NOT EDIT. 
Generated with: # -# devctl@6.5.0 +# devctl@6.9.0 # name: Create Release PR on: @@ -30,7 +30,7 @@ on: jobs: debug_info: name: Debug info - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Print github context JSON run: | @@ -39,7 +39,7 @@ jobs: EOF gather_facts: name: Gather facts - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 outputs: repo_name: ${{ steps.gather_facts.outputs.repo_name }} branch: ${{ steps.gather_facts.outputs.branch }} @@ -136,7 +136,7 @@ jobs: fi create_release_pr: name: Create release PR - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: - gather_facts if: ${{ needs.gather_facts.outputs.skip != 'true' }} diff --git a/.github/workflows/zz_generated.gitleaks.yaml b/.github/workflows/zz_generated.gitleaks.yaml index 85cb3288a..2c70a482b 100644 --- a/.github/workflows/zz_generated.gitleaks.yaml +++ b/.github/workflows/zz_generated.gitleaks.yaml @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.5.0 +# devctl@6.9.0 # name: gitleaks diff --git a/CHANGELOG.md b/CHANGELOG.md index ef3bf5435..979164628 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,68 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Handover cert-manager alerts to BigMac +## [2.134.1] - 2023-09-26 + +### Fixed + +- Improve `InhibitionClusterIsNotRunningPrometheusAgent` to keep firing if the kube-state-metrics metric is missing for 5 minutes (to avoid flapping of inhibitions). + +## [2.134.0] - 2023-09-21 + +### Changed + +- Split the `KubeStateMetricsDown` alert into two alerts: `KubeStateMetricsDown` and `KubeStateMetricsNotRetrievingMetrics`. + +## [2.133.0] - 2023-09-19 + +### Changed + +- Add missing prometheus-agent inhibition to the `KubeStateMetricsDown` alert. +- Increase the time before `ManagementClusterDeploymentMissingAWS` pages, because it is dependent on the `PrometheusAgentFailing` alert. + +### Fixed + +- Remove `cancel_if_outside_working_hours` from PrometheusAgent alerts. + +## [2.132.0] - 2023-09-15 + +### Changed + +- `PrometheusAgentFailing` and `PrometheusAgentShardsMissing`: keep the alerts firing for 5 minutes after they are resolved (see the sketch below). + +## [2.131.0] - 2023-09-12 + +### Changed + +- Remove `DNSRequestDurationTooSlow` in favor of SLO alerting. + +## [2.130.0] - 2023-09-12 + +### Changed + +- Refactor the Kyverno policy reports recording rule to include apps missing from the Team Overview dashboard. +- Change `ClusterUnhealthyPhase` severity to page, so that we get paged when a cluster is not working properly. + +## [2.129.0] - 2023-09-11 + +### Changed + +- Unit tests for `PrometheusAgentShardsMissing`. +- Fixes for `PrometheusAgentShardsMissing`. + +## [2.128.0] - 2023-09-05 + +### Added + +- Unit tests for `KubeStateMetricsDown`. + +### Changed + +- Loki alerts now only page during working hours. +- `PrometheusAgentFailing` no longer relies on KSM metrics. +- Rework the prometheus-agent inhibition to run on the MC. +- `ManagementClusterApp` alerts now also check the `default` catalog. + ## [2.127.0] - 2023-08-21 ### Changed @@ -133,7 +195,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [2.115.0] - 2023-07-20 - ### Added - New alert `KubeStateMetricsSlow` that inhibits KSM related alerts.
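Note (not part of the patch): the "keep the alerts firing for 5 minutes after they are resolved" behaviour from the 2.132.0 entry above is implemented further down in this diff (prometheus-agent.rules.yml) with a max_over_time(...[5m:]) subquery. The following minimal sketch only illustrates that pattern; the group name, alert name and label are invented for the example and are not taken from the repository.

groups:
  - name: example.keep-firing            # illustrative group name
    rules:
      - alert: ExampleAgentDown          # hypothetical alert, for illustration only
        # `up{...} == 0` stops returning samples as soon as the target recovers,
        # but the [5m:] subquery still sees the failing samples from the previous
        # 5 minutes, so the expression stays non-empty (and the alert keeps
        # firing) for 5 minutes after recovery.
        expr: |-
          max_over_time(
            (up{instance="prometheus-agent"} == 0)[5m:]
          )
        for: 10m
        labels:
          severity: page                 # illustrative label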
@@ -2135,7 +2196,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add existing rules from https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2 -[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.127.0...HEAD +[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.134.1...HEAD +[2.134.1]: https://github.com/giantswarm/prometheus-rules/compare/v2.134.0...v2.134.1 +[2.134.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.133.0...v2.134.0 +[2.133.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.132.0...v2.133.0 +[2.132.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.131.0...v2.132.0 +[2.131.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.130.0...v2.131.0 +[2.130.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.129.0...v2.130.0 +[2.129.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.128.0...v2.129.0 +[2.128.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.127.0...v2.128.0 [2.127.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.126.1...v2.127.0 [2.126.1]: https://github.com/giantswarm/prometheus-rules/compare/v2.126.0...v2.126.1 [2.126.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.125.0...v2.126.0 diff --git a/Makefile b/Makefile index 2c6db5b13..6b6025aaa 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.5.0 +# devctl@6.9.0 # include Makefile.*.mk diff --git a/Makefile.gen.app.mk b/Makefile.gen.app.mk index fbd08071f..0929f9089 100644 --- a/Makefile.gen.app.mk +++ b/Makefile.gen.app.mk @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.5.0 +# devctl@6.9.0 # ##@ App diff --git a/helm/prometheus-rules/templates/alerting-rules/app.rules.yml b/helm/prometheus-rules/templates/alerting-rules/app.rules.yml index 935126daa..842050904 100644 --- a/helm/prometheus-rules/templates/alerting-rules/app.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/app.rules.yml @@ -15,7 +15,7 @@ spec: annotations: description: '{{`Management Cluster App {{ $labels.name }}, version {{ $labels.version }} is {{if $labels.status }} in {{ $labels.status }} state. {{else}} not installed. 
{{end}}`}}' opsrecipe: app-failed/ - expr: app_operator_app_info{status!~"(?i:(deployed|cordoned))", catalog=~"control-plane-.*",team!~"^$|noteam"} + expr: app_operator_app_info{status!~"(?i:(deployed|cordoned))", catalog=~"(control-plane-.*|default)",team!~"^$|noteam", namespace=~".*giantswarm"} for: 30m labels: area: managedservices @@ -30,7 +30,7 @@ spec: annotations: description: 'Current version of {{`App {{ $labels.name }} is {{ $labels.deployed_version }} but it should be {{ $labels.version }}.`}}' opsrecipe: app-pending-update/ - expr: app_operator_app_info{catalog=~"control-plane-.*", deployed_version!="", status="deployed", version_mismatch="true" ,team!~"^$|noteam"} + expr: app_operator_app_info{catalog=~"(control-plane-.*|default)", deployed_version!="", status="deployed", version_mismatch="true" ,team!~"^$|noteam", namespace=~".*giantswarm"} for: 40m labels: area: managedservices diff --git a/helm/prometheus-rules/templates/alerting-rules/aws.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/aws.management-cluster.rules.yml index f25741587..956f72321 100644 --- a/helm/prometheus-rules/templates/alerting-rules/aws.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/aws.management-cluster.rules.yml @@ -149,7 +149,7 @@ spec: description: '{{`Deployment {{ $labels.deployment }} is missing.`}}' opsrecipe: management-cluster-deployment-is-missing/ expr: absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="aws-admission-controller"}) - for: 5m + for: 15m labels: area: kaas cancel_if_prometheus_agent_down: "true" diff --git a/helm/prometheus-rules/templates/alerting-rules/capi-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/capi-cluster.rules.yml index 227767b6d..982844ab1 100644 --- a/helm/prometheus-rules/templates/alerting-rules/capi-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/capi-cluster.rules.yml @@ -15,7 +15,7 @@ spec: labels: area: kaas cancel_if_outside_working_hours: {{include "workingHoursOnly" .}} - severity: notify + severity: page team: {{include "providerTeam" .}} topic: managementcluster annotations: diff --git a/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml b/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml index 127deaf10..af5454208 100644 --- a/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml @@ -35,17 +35,3 @@ spec: topic: dns annotations: description: '{{`CoreDNS Deployment {{ $labels.namespace}}/{{ $labels.deployment }} has been scaled to its maximum replica count for too long.`}}' - - alert: DNSRequestDurationTooSlow - expr: histogram_quantile(0.99, sum(irate(coredns_dns_request_duration_seconds_bucket{app="coredns"}[5m])) by (le)) > 1 - for: 15m - labels: - area: empowerment - severity: page - team: cabbage - topic: dns - annotations: - description: '{{`CoreDNS requests are taking more than 1 second to be responded.`}}' - opsrecipe: dns-request-duration-too-slow/ - dashboard: Yu9tkufmk/dns - - diff --git a/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml b/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml index 437c09f36..2fe54e1af 100644 --- a/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml @@ -12,14 +12,31
@@ spec: - name: inhibit.prometheus-agent rules: # this inhibition fires when a cluster is not running prometheus-agent. - # If we have prometheus-agent statefulset, it means prometheus-agent is installed on this cluster - # so, raise an inhibition unless prometheus-agent runs on the cluster + # we retrieve the list of existing cluster IDs from `kube_namespace_created` + # excluding the MC's own namespace, because it is always using prometheus-agent and its namespace is not named after the cluster name + # then compare it with the list of deployed prometheus-agents from `app_operator_app_info` # - # Will produce data (and inhibitions) on MC/WC. + # Will only produce data (and inhibitions) on the MC because that is where app_operator is running, + # but that is enough to have the inhibitions on the installation-global alertmanager - alert: InhibitionClusterIsNotRunningPrometheusAgent annotations: description: '{{`Cluster ({{ $labels.cluster_id }}) is not running Prometheus Agent.`}}' - expr: (count by (cluster_id) (prometheus_build_info{app="prometheus"}) unless count by (cluster_id) (kube_statefulset_created{namespace="kube-system",statefulset=~"prometheus-prometheus-agent.*"} > 0)) + expr: |- + count( + label_replace( + sum_over_time( + kube_namespace_created{namespace!="{{ .Values.managementCluster.name }}-prometheus", namespace=~".+-prometheus"}[5m] + ), "cluster_id", "$1", "namespace", "(.+)-prometheus" + ) + ) by (cluster_id) + unless + count( + label_replace( + sum_over_time( + app_operator_app_info{app="prometheus-agent"}[5m] + ), "cluster_id", "$1", "namespace", "(.*)" + ) + ) by (cluster_id) labels: cluster_is_not_running_prometheus_agent: "true" area: empowerment diff --git a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml index 39a2fd571..e635ae988 100644 --- a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml @@ -10,6 +10,34 @@ spec: groups: - name: kube-state-metrics rules: + - alert: KubeStateMetricsDown + annotations: + description: '{{`KubeStateMetrics ({{ $labels.instance }}) is down.`}}' + opsrecipe: kube-state-metrics-down/ + expr: |- + ( + # modern clusters + label_replace(up{app="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",instance=~".*:8080"} == 1) + ) + and + ( + # vintage clusters without servicemonitor + label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1) + ) + for: 15m + labels: + area: kaas + cancel_if_apiserver_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_has_no_workers: "true" + inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "false" + severity: page + team: atlas + topic: observability - alert: KubeStateMetricsSlow annotations: description: '{{`KubeStateMetrics ({{ $labels.instance }}) is too slow.`}}' @@ -28,6 +56,27 @@ spec: severity: page team: atlas topic: observability + - alert: KubeStateMetricsNotRetrievingMetrics + annotations: + description: '{{`KubeStateMetrics ({{ $labels.instance }}) is not retrieving metrics.`}}' + opsrecipe: kube-state-metrics-down/ +
expr: |- + # When it looks up but we don't have metrics + count({app="kube-state-metrics"}) < 10 + for: 20m + labels: + area: kaas + cancel_if_apiserver_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_has_no_workers: "true" + inhibit_kube_state_metrics_down: "true" + cancel_if_kubelet_down: "true" + cancel_if_kube_state_metrics_down: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability - alert: KubeConfigMapCreatedMetricMissing annotations: description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' diff --git a/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml index e3a198d18..54399dc3f 100644 --- a/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml @@ -27,7 +27,7 @@ spec: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_scrape_timeout: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: "true" severity: page team: atlas topic: observability @@ -44,7 +44,7 @@ spec: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_scrape_timeout: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: "true" severity: page team: atlas topic: observability diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml index 3946e893b..042e2175a 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml @@ -18,7 +18,16 @@ spec: summary: Prometheus agent fails to send samples to remote write endpoint. opsrecipe: prometheus-agent-remote-write-failed/ dashboard: promRW001/prometheus-remote-write - expr: count(absent_over_time(up{instance="prometheus-agent"}[10m])) and count((present_over_time(kube_statefulset_created{namespace="kube-system",statefulset=~"prometheus-prometheus-agent.*"}[10m]))) + # expr: count(absent_over_time(up{instance="prometheus-agent"}[10m])) + expr: |- + max_over_time( + sum by (cluster_type, cluster_id, installation, instance, service) + ( + up{instance="prometheus-agent"} == 0 + or + absent(up{instance="prometheus-agent"}) == 1 + )[5m:] + ) for: 10m labels: area: empowerment @@ -29,7 +38,6 @@ spec: cancel_if_cluster_is_not_running_prometheus_agent: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" ## Page Atlas if prometheus agent is missing shards to send samples to MC prometheus. - alert: PrometheusAgentShardsMissing annotations: @@ -37,20 +45,25 @@ spec: summary: Prometheus agent is missing shards. 
opsrecipe: prometheus-agent-missing-shards/ expr: |- - count( - ## number of remotes that are not mimir or grafana-cloud - prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir"} - ) != ( - ## number of shards defined in the Prometheus CR - prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - or ( - # if there is only 1 shard, there is no shard metric so we use the replicas metric - absent(prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}) - and on(controller, name) - prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + max_over_time(sum( + count( + ## number of remotes that are not mimir or grafana-cloud + prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir"} ) - ) - for: 30m + != + sum( + ## number of shards defined in the Prometheus CR + prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} + or + ( + # if there is only 1 shard, there is no shard metric so we use the replicas metric + absent(prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}) + and on(controller, name) + prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) + ) + )[5m:]) + for: 10m labels: area: empowerment severity: page diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-meta-operator.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-meta-operator.rules.yml index 2d4f73433..446a397c3 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-meta-operator.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-meta-operator.rules.yml @@ -68,7 +68,7 @@ spec: area: "empowerment" cancel_if_mc_kube_state_metrics_down: "false" cancel_if_cluster_status_creating: "true" - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: true installation: {{ .Values.managementCluster.name }} severity: "page" team: "atlas" diff --git a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml index afd099527..edc29786e 100644 --- a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml @@ -25,6 +25,7 @@ spec: cancel_if_kubelet_down: "true" cancel_if_cluster_has_no_workers: "true" cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} + cancel_if_prometheus_agent_down: "true" severity: notify team: honeybadger topic: releng @@ -41,38 +42,7 @@ spec: cancel_if_kubelet_down: "true" cancel_if_cluster_has_no_workers: "true" cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - - alert: KubeStateMetricsDown - annotations: - description: '{{`KubeStateMetrics ({{ $labels.instance }}) is down.`}}' - opsrecipe: kube-state-metrics-down/ - expr: |- - ( - # modern clusters - label_replace(up{app="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",instance=~".*:8080"} == 1) - ) - and - ( - # vintage clusters without servicemonitor - label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1) - ) - or - ( - # When it looks up but we don't have metrics - count({app="kube-state-metrics"}) < 10 - ) - for: 15m - labels: - area: kaas - cancel_if_apiserver_down: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_has_no_workers: "true" - inhibit_kube_state_metrics_down: "true" - cancel_if_kubelet_down: "true" - cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} + cancel_if_prometheus_agent_down: "true" severity: page team: atlas topic: observability diff --git a/helm/prometheus-rules/templates/alerting-rules/vault.rules.yml b/helm/prometheus-rules/templates/alerting-rules/vault.rules.yml index 13cd2a260..1707c4360 100644 --- a/helm/prometheus-rules/templates/alerting-rules/vault.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/vault.rules.yml @@ -57,6 +57,7 @@ spec: labels: area: kaas cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} + cancel_if_prometheus_agent_down: "true" severity: page team: {{ include "providerTeam" . 
}} topic: vault diff --git a/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml b/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml index 365e0e62f..e1bdfc241 100644 --- a/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml @@ -318,18 +318,14 @@ spec: policy!="check-deprecated-apis-1-25", cluster_type="management_cluster", kind=~"Deployment" - }, "deployment", ",", "name") - ) by (deployment, category, policy, status) - * on(deployment) group_left(team, app) - sum( - sum( - label_join(kube_deployment_labels{}, "app", ",", "label_app_kubernetes_io_name") - ) by (deployment, app) - * on(app) group_left(team) - sum( - app_operator_app_info{namespace=~".*giantswarm", team!="noteam"} - ) by (app, team) - ) by (team, deployment, app), + }, "deployment", ",", "name") + ) by (deployment, category, policy, status) + * on(deployment) group_left(team, app) sum( + label_join(label_join(kube_deployment_labels{ + cluster_type="management_cluster", + label_application_giantswarm_io_team!="" + }, "app", ",", "label_app_kubernetes_io_name"), "team", ",", "label_application_giantswarm_io_team") + ) by (team, app, deployment), "name", ",", "deployment") record: aggregation:kyverno_policy_deployment_status_team # Kyverno policy workload status by team - DaemonSets @@ -340,18 +336,14 @@ spec: policy!="check-deprecated-apis-1-25", cluster_type="management_cluster", kind=~"DaemonSet" - }, "daemonset", ",", "name") - ) by (daemonset, category, policy, status) - * on(daemonset) group_left(team, app) - sum( - sum( - label_join(kube_daemonset_labels{}, "app", ",", "label_app_kubernetes_io_name") - ) by (daemonset, app) - * on(app) group_left(team) - sum( - app_operator_app_info{namespace=~".*giantswarm", team!="noteam"} - ) by (app, team) - ) by (team, daemonset, app), + }, "daemonset", ",", "name") + ) by (daemonset, category, policy, status) + * on(daemonset) group_left(team, app) sum( + label_join(label_join(kube_daemonset_labels{ + cluster_type="management_cluster", + label_application_giantswarm_io_team!="" + }, "app", ",", "label_app_kubernetes_io_name"), "team", ",", "label_application_giantswarm_io_team") + ) by (team, app, daemonset), "name", ",", "daemonset") record: aggregation:kyverno_policy_daemonset_status_team # Kyverno policy workload status by team - StatefulSets @@ -362,18 +354,14 @@ spec: policy!="check-deprecated-apis-1-25", cluster_type="management_cluster", kind=~"StatefulSet" - }, "statefulset", ",", "name") - ) by (statefulset, category, policy, status) - * on(statefulset) group_left(team, app) - sum( - sum( - label_join(kube_statefulset_labels{}, "app", ",", "label_app_kubernetes_io_name") - ) by (statefulset, app) - * on(app) group_left(team) - sum( - app_operator_app_info{namespace=~".*giantswarm", team!="noteam"} - ) by (app, team) - ) by (team, statefulset, app), + }, "statefulset", ",", "name") + ) by (statefulset, category, policy, status) + * on(statefulset) group_left(team, app) sum( + label_join(label_join(kube_statefulset_labels{ + cluster_type="management_cluster", + label_application_giantswarm_io_team!="" + }, "app", ",", "label_app_kubernetes_io_name"), "team", ",", "label_application_giantswarm_io_team") + ) by (team, app, statefulset), "name", ",", "statefulset") record: aggregation:kyverno_policy_statefulset_status_team # Kyverno policy workload status by team - Job @@ -384,18 +372,14 @@ spec: policy!="check-deprecated-apis-1-25", 
cluster_type="management_cluster", kind=~"Job" - }, "job", ",", "name") - ) by (job, category, policy, status) - * on(job) group_left(team, app) - sum( - sum( - label_join(kube_job_labels{}, "app", ",", "label_app_kubernetes_io_name") - ) by (job, app) - * on(app) group_left(team) - sum( - app_operator_app_info{namespace=~".*giantswarm", team!="noteam"} - ) by (app, team) - ) by (team, job, app), + }, "job", ",", "name") + ) by (job, category, policy, status) + * on(job) group_left(team, app) sum( + label_join(label_join(kube_job_labels{ + cluster_type="management_cluster", + label_application_giantswarm_io_team!="" + }, "app", ",", "label_app_kubernetes_io_name"), "team", ",", "label_application_giantswarm_io_team") + ) by (team, app, job), "name", ",", "job") record: aggregation:kyverno_policy_job_status_team # Kyverno policy workload status by team - CronJob @@ -406,18 +390,14 @@ spec: policy!="check-deprecated-apis-1-25", cluster_type="management_cluster", kind=~"CronJob" - }, "cronjob", ",", "name") - ) by (cronjob, category, policy, status) - * on(cronjob) group_left(team, app) - sum( - sum( - label_join(kube_cronjob_labels{}, "app", ",", "label_app_kubernetes_io_name") - ) by (cronjob, app) - * on(app) group_left(team) - sum( - app_operator_app_info{namespace=~".*giantswarm", team!="noteam"} - ) by (app, team) - ) by (team, cronjob, app), + }, "cronjob", ",", "name") + ) by (cronjob, category, policy, status) + * on(cronjob) group_left(team, app) sum( + label_join(label_join(kube_cronjob_labels{ + cluster_type="management_cluster", + label_application_giantswarm_io_team!="" + }, "app", ",", "label_app_kubernetes_io_name"), "team", ",", "label_application_giantswarm_io_team") + ) by (team, app, cronjob), "name", ",", "cronjob") record: aggregation:kyverno_policy_cronjob_status_team - name: starboard.grafana-cloud.recording diff --git a/renovate.json b/renovate.json index f4415e61e..111e73b0a 100644 --- a/renovate.json +++ b/renovate.json @@ -6,12 +6,14 @@ "dependencyDashboard": true, "ignorePaths": [ ".github/workflows/zz_generated.*", - ".github/workflows/codeql-analysis.yml" + ".github/workflows/codeql-analysis.yml", + ".github/workflows/pre_commit_*.yaml" ], "ignoreDeps": [ + "actions/setup-go", "architect", - "zricethezav/gitleaks-action", - "actions/setup-go" + "github.com/imdario/mergo", + "zricethezav/gitleaks-action" ], "regexManagers": [ { diff --git a/test/conf/promtool_ignore b/test/conf/promtool_ignore index 5393cce32..3112fea9d 100644 --- a/test/conf/promtool_ignore +++ b/test/conf/promtool_ignore @@ -71,7 +71,6 @@ templates/alerting-rules/systemd.workload-cluster.rules.yml templates/alerting-rules/tiller.all.rules.yml templates/alerting-rules/tiller.workload-cluster.rules.yml templates/alerting-rules/timesync.rules.yml -templates/alerting-rules/up.all.rules.yml templates/alerting-rules/up.management-cluster.rules.yml templates/alerting-rules/vault.rules.yml templates/recording-rules/grafana-cloud.rules.yml diff --git a/test/hack/bin/template-chart.sh b/test/hack/bin/template-chart.sh index a9ab7dda6..ce82ecfd9 100755 --- a/test/hack/bin/template-chart.sh +++ b/test/hack/bin/template-chart.sh @@ -17,6 +17,7 @@ main() { "$GIT_WORKDIR"/helm/prometheus-rules \ --set="managementCluster.provider.flavor=${BASH_REMATCH[1]}" \ --set="managementCluster.provider.kind=${BASH_REMATCH[2]}" \ + --set="managementCluster.name=myinstall" \ --output-dir "$GIT_WORKDIR"/test/hack/output/"$provider" done } diff --git a/test/hack/checkLabels/go.mod b/test/hack/checkLabels/go.mod index 
2f7c71952..5bf987e66 100644 --- a/test/hack/checkLabels/go.mod +++ b/test/hack/checkLabels/go.mod @@ -5,14 +5,14 @@ go 1.19 require ( // Try to keep version in sync with our prometheus rule CRD version. // see https://github.com/giantswarm/prometheus-operator-crd/blob/master/helm/prometheus-operator-crd/Chart.yaml#L11 - github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.67.1 + github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.68.0 sigs.k8s.io/yaml v1.3.0 ) -require github.com/prometheus/alertmanager v0.25.0 +require github.com/prometheus/alertmanager v0.26.0 require ( - github.com/aws/aws-sdk-go v1.44.156 // indirect + github.com/aws/aws-sdk-go v1.44.317 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/go-kit/kit v0.10.0 // indirect @@ -26,30 +26,30 @@ require ( github.com/jmespath/go-jmespath v0.4.0 // indirect github.com/jpillora/backoff v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/kr/pretty v0.3.0 // indirect + github.com/kr/pretty v0.3.1 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect github.com/pkg/errors v0.9.1 // indirect - github.com/prometheus/client_golang v1.14.0 // indirect - github.com/prometheus/client_model v0.3.0 // indirect - github.com/prometheus/common v0.38.0 // indirect + github.com/prometheus/client_golang v1.15.1 // indirect + github.com/prometheus/client_model v0.4.0 // indirect + github.com/prometheus/common v0.44.0 // indirect github.com/prometheus/common/sigv4 v0.1.0 // indirect - github.com/prometheus/procfs v0.8.0 // indirect + github.com/prometheus/procfs v0.9.0 // indirect github.com/rogpeppe/go-internal v1.10.0 // indirect - golang.org/x/net v0.10.0 // indirect - golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1 // indirect - golang.org/x/sys v0.8.0 // indirect - golang.org/x/text v0.10.0 // indirect + golang.org/x/net v0.15.0 // indirect + golang.org/x/oauth2 v0.8.0 // indirect + golang.org/x/sys v0.12.0 // indirect + golang.org/x/text v0.13.0 // indirect google.golang.org/appengine v1.6.7 // indirect - google.golang.org/protobuf v1.28.1 // indirect + google.golang.org/protobuf v1.30.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect - k8s.io/api v0.27.2 // indirect - k8s.io/apimachinery v0.27.2 // indirect + k8s.io/api v0.28.1 // indirect + k8s.io/apimachinery v0.28.1 // indirect k8s.io/klog/v2 v2.100.1 // indirect - k8s.io/utils v0.0.0-20230505201702-9f6742963106 // indirect + k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.3.0 // indirect ) diff --git a/test/hack/checkLabels/go.sum b/test/hack/checkLabels/go.sum index 8d3c616b8..4dd053dd3 100644 --- a/test/hack/checkLabels/go.sum +++ b/test/hack/checkLabels/go.sum @@ -67,6 +67,8 @@ github.com/aws/aws-sdk-go v1.34.28/go.mod h1:H7NKnBqNVzoTJpGfLrQkkD+ytBA93eiDYi/ github.com/aws/aws-sdk-go v1.38.35/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro= github.com/aws/aws-sdk-go v1.44.156 h1:3RhbBTZ87HoI5OP2JjcKdd5qOnyo9YOAW8+Bb/h0vSE= github.com/aws/aws-sdk-go v1.44.156/go.mod 
h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= +github.com/aws/aws-sdk-go v1.44.317 h1:+8XWrLmGMwPPXSRSLPzhgcGnzJ2mYkgkrcB9C/GnSOU= +github.com/aws/aws-sdk-go v1.44.317/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= github.com/aws/aws-sdk-go-v2 v0.18.0/go.mod h1:JWVYvqSMppoMJC0x5wdwiImzgXTI9FuZwxzkQq9wy+g= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= @@ -391,6 +393,7 @@ github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFB github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= @@ -501,10 +504,14 @@ github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.66.0 h github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.66.0/go.mod h1:KZHvrby65G+rA4V/vMTUXDV22TI+GgLIrCigYClpjzk= github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.67.1 h1:u1Mw9irznvsBPxQxjUmCel1ufP3UgzA1CILj7/2tpNw= github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.67.1/go.mod h1:KZHvrby65G+rA4V/vMTUXDV22TI+GgLIrCigYClpjzk= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.68.0 h1:yl9ceUSUBo9woQIO+8eoWpcxZkdZgm89g+rVvu37TUw= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.68.0/go.mod h1:9Uuu3pEU2jB8PwuqkHvegQ0HV/BlZRJUyfTYAqfdVF8= github.com/prometheus/alertmanager v0.22.2 h1:JrDZalSEMb2/2bqGAhls6ZnvOxbC5jMIu29JV+uWTC0= github.com/prometheus/alertmanager v0.22.2/go.mod h1:rYinOWxFuCnNssc3iOjn2oMTlhLaPcUuqV5yk5JKUAE= github.com/prometheus/alertmanager v0.25.0 h1:vbXKUR6PYRiZPRIKfmXaG+dmCKG52RtPL4Btl8hQGvg= github.com/prometheus/alertmanager v0.25.0/go.mod h1:MEZ3rFVHqKZsw7IcNS/m4AWZeXThmJhumpiWR4eHU/w= +github.com/prometheus/alertmanager v0.26.0 h1:uOMJWfIwJguc3NaM3appWNbbrh6G/OjvaHMk22aBBYc= +github.com/prometheus/alertmanager v0.26.0/go.mod h1:rVcnARltVjavgVaNnmevxK7kOn7IZavyf0KNgHkbEpU= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829/go.mod h1:p2iRAGwDERtqlqzRXnrOVns+ignqQo//hLXqYxZYVNs= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= @@ -514,6 +521,8 @@ github.com/prometheus/client_golang v1.10.0/go.mod h1:WJM3cc3yu7XKBKa/I8WeZm+V3e github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= github.com/prometheus/client_golang v1.14.0 h1:nJdhIvne2eSX/XRAFV9PcvFFRbrjbcTUj0VP62TMhnw= github.com/prometheus/client_golang v1.14.0/go.mod h1:8vpkKitgIVNcqrRBWh1C4TIUQgYNtG/XQE4E/Zae36Y= +github.com/prometheus/client_golang v1.15.1 h1:8tXpTmJbyH5lydzFPoxSIJ0J46jdh3tylbvM1xCv0LI= +github.com/prometheus/client_golang v1.15.1/go.mod h1:e9yaBhRPU2pPNsZwE+JdQl0KEt1N9XgF6zxWmaC0xOk= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= 
github.com/prometheus/client_model v0.0.0-20190115171406-56726106282f/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= @@ -522,6 +531,8 @@ github.com/prometheus/client_model v0.1.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6T github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4= github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= +github.com/prometheus/client_model v0.4.0 h1:5lQXD3cAg1OXBf4Wq03gTrXHeaV0TQvGfUooCfx1yqY= +github.com/prometheus/client_model v0.4.0/go.mod h1:oMQmHW1/JoDwqLtg57MGgP/Fb1CJEYF2imWWhWtMkYU= github.com/prometheus/common v0.2.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.7.0/go.mod h1:DjGbpBbp5NYNiECxcL/VnbXCCaQpKd3tt26CguLLsqA= @@ -533,6 +544,8 @@ github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9 github.com/prometheus/common v0.29.0/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls= github.com/prometheus/common v0.38.0 h1:VTQitp6mXTdUoCmDMugDVOJ1opi6ADftKfp/yeqTR/E= github.com/prometheus/common v0.38.0/go.mod h1:MBXfmBQZrK5XpbCkjofnXs96LD2QQ7fEq4C0xjC/yec= +github.com/prometheus/common v0.44.0 h1:+5BrQJwiBB9xsMygAB3TNvpQKOwlkc25LbISbrdOOfY= +github.com/prometheus/common v0.44.0/go.mod h1:ofAIvZbQ1e/nugmZGz4/qCb9Ap1VoSTIO7x0VV9VvuY= github.com/prometheus/common/sigv4 v0.1.0 h1:qoVebwtwwEhS85Czm2dSROY5fTo2PAPEVdDeppTwGX4= github.com/prometheus/common/sigv4 v0.1.0/go.mod h1:2Jkxxk9yYvCkE5G1sQT7GuEXm57JrvHu9k5YwTjsNtI= github.com/prometheus/exporter-toolkit v0.5.1/go.mod h1:OCkM4805mmisBhLmVFw858QYi3v0wKdY6/UxrT0pZVg= @@ -544,6 +557,8 @@ github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4O github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.8.0 h1:ODq8ZFEaYeCaZOJlZZdJA2AbQR98dSHSM1KW/You5mo= github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0uaxHdg830/4= +github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI= +github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= github.com/rogpeppe/go-internal v1.1.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= @@ -552,6 +567,7 @@ github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFR github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= github.com/rogpeppe/go-internal v1.8.0 h1:FCbCCtXNOY3UtUuHUYaghJg4y7Fd14rXifAYUAtL9R8= github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/rs/cors v1.7.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU= github.com/russross/blackfriday/v2 v2.0.1/go.mod 
h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= @@ -738,6 +754,8 @@ golang.org/x/net v0.9.0 h1:aWJ/m6xSmxWBx+V0XRHTlrYrPG56jKsLdTFmsSsCzOM= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0 h1:ugBLEUaxABaB5AJqW9enI0ACdci2RUd4eP51NTBvuJ8= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -746,6 +764,8 @@ golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4Iltr golang.org/x/oauth2 v0.0.0-20210514164344-f6687ab2804c/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1 h1:lxqLZaMad/dJHMFZH0NiNpiEZI/nhgWhe4wgzpE+MuA= golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= +golang.org/x/oauth2 v0.8.0 h1:6dkIjl3j3LtZ/O3sTgZTMsLKSftL/B8Zgq4huOIIUu8= +golang.org/x/oauth2 v0.8.0/go.mod h1:yr7u4HXZRm1R1kBWqr/xKNqewf0plRYoB7sla+BCIXE= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -827,6 +847,8 @@ golang.org/x/sys v0.7.0 h1:3jlCCIQZPdOYu1h8BkNvLz8Kgwtae2cagcG/VamtZRU= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -851,6 +873,8 @@ golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.10.0 h1:UpjohKhiEgNc0CSauXmwYftY1+LlaC75SJwh0SgCX58= golang.org/x/text v0.10.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1005,6 +1029,8 @@ google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp0 google.golang.org/protobuf v1.26.0/go.mod 
h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.1 h1:d0NfwRgPtno5B1Wa6L2DAG+KivqkdutMf1UhdNx175w= google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= +google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -1050,6 +1076,8 @@ k8s.io/api v0.27.1 h1:Z6zUGQ1Vd10tJ+gHcNNNgkV5emCyW+v2XTmn+CLjSd0= k8s.io/api v0.27.1/go.mod h1:z5g/BpAiD+f6AArpqNjkY+cji8ueZDU/WV1jcj5Jk4E= k8s.io/api v0.27.2 h1:+H17AJpUMvl+clT+BPnKf0E3ksMAzoBBg7CntpSuADo= k8s.io/api v0.27.2/go.mod h1:ENmbocXfBT2ADujUXcBhHV55RIT31IIEvkntP6vZKS4= +k8s.io/api v0.28.1 h1:i+0O8k2NPBCPYaMB+uCkseEbawEt/eFaiRqUx8aB108= +k8s.io/api v0.28.1/go.mod h1:uBYwID+66wiL28Kn2tBjBYQdEU0Xk0z5qF8bIBqk/Dg= k8s.io/apimachinery v0.25.4 h1:CtXsuaitMESSu339tfhVXhQrPET+EiWnIY1rcurKnAc= k8s.io/apimachinery v0.25.4/go.mod h1:jaF9C/iPNM1FuLl7Zuy5b9v+n35HGSh6AQ4HYRkCqwo= k8s.io/apimachinery v0.26.1 h1:8EZ/eGJL+hY/MYCNwhmDzVqq2lPl3N3Bo8rvweJwXUQ= @@ -1058,6 +1086,8 @@ k8s.io/apimachinery v0.27.1 h1:EGuZiLI95UQQcClhanryclaQE6xjg1Bts6/L3cD7zyc= k8s.io/apimachinery v0.27.1/go.mod h1:5ikh59fK3AJ287GUvpUsryoMFtH9zj/ARfWCo3AyXTM= k8s.io/apimachinery v0.27.2 h1:vBjGaKKieaIreI+oQwELalVG4d8f3YAMNpWLzDXkxeg= k8s.io/apimachinery v0.27.2/go.mod h1:XNfZ6xklnMCOGGFNqXG7bUrQCoR04dh/E7FprV6pb+E= +k8s.io/apimachinery v0.28.1 h1:EJD40og3GizBSV3mkIoXQBsws32okPOy+MkRyzh6nPY= +k8s.io/apimachinery v0.28.1/go.mod h1:X0xh/chESs2hP9koe+SdIAcXWcQ+RM5hy0ZynB+yEvw= k8s.io/klog/v2 v2.80.1 h1:atnLQ121W371wYYFawwYx1aEY2eUfs4l3J72wtgAwV4= k8s.io/klog/v2 v2.80.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= k8s.io/klog/v2 v2.90.0 h1:VkTxIV/FjRXn1fgNNcKGM8cfmL1Z33ZjXRTVxKCoF5M= @@ -1072,6 +1102,8 @@ k8s.io/utils v0.0.0-20230406110748-d93618cff8a2 h1:qY1Ad8PODbnymg2pRbkyMT/ylpTrC k8s.io/utils v0.0.0-20230406110748-d93618cff8a2/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= k8s.io/utils v0.0.0-20230505201702-9f6742963106 h1:EObNQ3TW2D+WptiYXlApGNLVy0zm/JIBVY9i+M4wpAU= k8s.io/utils v0.0.0-20230505201702-9f6742963106/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/utils v0.0.0-20230726121419-3b25d923346b h1:sgn3ZU783SCgtaSJjpcVVlRqd6GSnlTLKgpAAttJvpI= +k8s.io/utils v0.0.0-20230726121419-3b25d923346b/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= @@ -1081,6 +1113,8 @@ sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMm sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E= +sigs.k8s.io/structured-merge-diff/v4 v4.3.0 h1:UZbZAZfX0wV2zr7YZorDz6GXROfDFj6LvqCRm4VUVKk= +sigs.k8s.io/structured-merge-diff/v4 v4.3.0/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= 
sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= diff --git a/test/tests/providers/capi/capz/capi-cluster.rules.test.yml b/test/tests/providers/capi/capz/capi-cluster.rules.test.yml index b6a970175..ff2c8c1a6 100644 --- a/test/tests/providers/capi/capz/capi-cluster.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-cluster.rules.test.yml @@ -21,7 +21,7 @@ tests: - exp_labels: area: kaas cancel_if_outside_working_hours: "true" - severity: notify + severity: page team: phoenix topic: managementcluster name: clippaxy diff --git a/test/tests/providers/global/inhibit.prometheus-agent.rules.test.yml b/test/tests/providers/global/inhibit.prometheus-agent.rules.test.yml index a3c738166..bc83d6ef2 100644 --- a/test/tests/providers/global/inhibit.prometheus-agent.rules.test.yml +++ b/test/tests/providers/global/inhibit.prometheus-agent.rules.test.yml @@ -5,24 +5,29 @@ rule_files: tests: - interval: 1m input_series: - - series: 'prometheus_build_info{app="prometheus",cluster_id="gauss",instance="localhost:9090"}' + # - cluster 1: "clu01" + - series: 'kube_namespace_created{app="kube-state-metrics", cluster_id="myinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="myinstall", instance="100.64.25.34:8080", job="kube-state-metrics", namespace="clu01-prometheus", node="ip-10-0-5-14.eu-central-1.compute.internal", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-f7b868f49-ngvr8", service="prometheus-operator-app-kube-state-metrics"}' + values: '1671707388+0x40' + # - cluster 2: "clu02" + - series: 'kube_namespace_created{app="kube-state-metrics", cluster_id="myinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="myinstall", instance="100.64.25.34:8080", job="kube-state-metrics", namespace="clu02-prometheus", node="ip-10-0-5-14.eu-central-1.compute.internal", organization="giantswarm", pipeline="stable", pod="prometheus-operator-app-kube-state-metrics-f7b868f49-ngvr8", service="prometheus-operator-app-kube-state-metrics"}' + values: '1671707388+0x40' + # - cluster 3: "myinstall", the install name + - series: 'kube_namespace_created{app="kube-state-metrics", cluster_id="myinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="myinstall", instance="100.64.25.34:8080", job="kube-state-metrics", namespace="myinstall-prometheus", node="ip-10-0-5-14.eu-central-1.compute.internal", organization="giantswarm", pipeline="stable", pod="prometheus-operator-app-kube-state-metrics-f7b868f49-ngvr8", service="prometheus-operator-app-kube-state-metrics"}' + values: "1671707388+0x40" + # prometheus-agent app info for "clu01" + - series: 'app_operator_app_info{app="prometheus-agent", app_version="2.40.5", catalog="giantswarm-playground", cluster_id="myinstall", cluster_missing="false", cluster_type="management_cluster", customer="giantswarm", deployed_version="0.1.7", endpoint="web", installation="myinstall", instance="app-exporter", job="app-exporter", name="prometheus-agent", namespace="clu01", node="ip-10-0-5-141.eu-central-1.compute.internal", organization="giantswarm", pipeline="stable", pod="app-exporter-6865c9c648-sg5vg", service="app-exporter", 
status="deployed", team="atlas", upgrade_available="false", version="0.1.7", version_mismatch="false"}' values: "1+0x40" - - series: 'kube_statefulset_created{namespace="kube-system",cluster_id="gauss",statefulset="prometheus-prometheus-agent"}' - values: "1+0x20 0+0x20" - - series: 'kube_statefulset_created{namespace="kube-system",cluster_id="gauss",statefulset="prometheus-prometheus-agent-shard-1"}' - values: "1+0x20 0+0x20" alert_rule_test: + #- alertname: InhibitionClusterIsNotRunningPrometheusAgent + # eval_time: 1m - alertname: InhibitionClusterIsNotRunningPrometheusAgent - eval_time: 1m - - alertname: InhibitionClusterIsNotRunningPrometheusAgent - eval_time: 22m + eval_time: 10m exp_alerts: - exp_labels: area: empowerment team: atlas topic: monitoring cluster_is_not_running_prometheus_agent: "true" - cluster_id: "gauss" + cluster_id: "clu02" exp_annotations: - description: "Cluster (gauss) is not running Prometheus Agent." - + description: "Cluster (clu02) is not running Prometheus Agent." diff --git a/test/tests/providers/global/kube-state-metrics.rules.test.yml b/test/tests/providers/global/kube-state-metrics.rules.test.yml new file mode 100644 index 000000000..8f5891193 --- /dev/null +++ b/test/tests/providers/global/kube-state-metrics.rules.test.yml @@ -0,0 +1,198 @@ +--- +rule_files: +- kube-state-metrics.rules.yml + +tests: + # KubeStateMetricsDown tests + # Tests to be run: + # - no "up" metrics + # - "up" metrics with servicemonitor discovery (ports 8080 and 8081) + # - "up" metric for port 8080 is OK, but port 8081 is set to 0 + # - "up" metric for port 8080 is set to 0, but port 8080 is OK + # - "up" metrics with label discovery (random port) + # - "up" is ok, but we don't have enough metrics + - name: "KSMDown with servicemonitor discovery" + interval: 1m + input_series: + # Tests for servicemonitor discovery + # - 00:00 Start with no metrics + # - 00:30 Both ports up and enough metrics + # - 01:00 Port 8080 goes down + # - 01:30 All is up again + # - 02:00 Port 8081 goes down + # - 02:30 all is up again + # - 03:00 we don't have enough metrics + # - 03:30 all is up again + - series: 'up{app="kube-state-metrics", cluster_id="testinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="testinstall", instance="192.0.2.10:8080", job="kube-state-metrics", namespace="kube-system", node="ip-10-0-1-1.eu-west-1.compute.internal", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-72vzx", provider="aws", region="eu-west-1", service="prometheus-operator-app-kube-state-metrics", service_priority="highest"}' + values: "_x30 1x30 0x30 1x30 1x30 1x30 1x30 1x30" + - series: 'up{app="kube-state-metrics", cluster_id="testinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="metrics", installation="testinstall", instance="192.0.2.10:8081", job="kube-state-metrics", namespace="kube-system", node="ip-10-0-1-1.eu-west-1.compute.internal", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-72vzx", provider="aws", region="eu-west-1", service="prometheus-operator-app-kube-state-metrics", service_priority="highest"}' + values: "_x30 1x30 1x30 1x30 0x30 1x30 1x30 1x30" + - series: 'testmetric2{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric3{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric4{app="kube-state-metrics"}' + 
+        values: "0x1000"
+      - series: 'testmetric5{app="kube-state-metrics"}'
+        values: "0x1000"
+      - series: 'testmetric6{app="kube-state-metrics"}'
+        values: "0x1000"
+      - series: 'testmetric7{app="kube-state-metrics"}'
+        values: "0x1000"
+      - series: 'testmetric8{app="kube-state-metrics"}'
+        values: "0x1000"
+      - series: 'testmetric9{app="kube-state-metrics"}'
+        values: "_x30 1x30 1x30 1x30 1x30 1x30 _x30 1x30"
+    alert_rule_test:
+      # - 00:00 Start with no metrics
+      - alertname: KubeStateMetricsDown
+        eval_time: 25m
+        exp_alerts:
+          - exp_labels:
+              area: "kaas"
+              cancel_if_apiserver_down: "true"
+              cancel_if_cluster_has_no_workers: "true"
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_kubelet_down: "true"
+              cancel_if_outside_working_hours: "false"
+              inhibit_kube_state_metrics_down: "true"
+              cancel_if_prometheus_agent_down: "true"
+              severity: "page"
+              team: "atlas"
+              topic: "observability"
+            exp_annotations:
+              description: "KubeStateMetrics () is down."
+              opsrecipe: "kube-state-metrics-down/"
+      # - 00:30 Both ports up and enough metrics
+      - alertname: KubeStateMetricsDown
+        eval_time: 55m
+      # - 01:00 Port 8080 goes down
+      - alertname: KubeStateMetricsDown
+        eval_time: 85m
+        exp_alerts:
+          - exp_labels:
+              area: "kaas"
+              cancel_if_apiserver_down: "true"
+              cancel_if_cluster_has_no_workers: "true"
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_kubelet_down: "true"
+              cancel_if_outside_working_hours: "false"
+              inhibit_kube_state_metrics_down: "true"
+              cancel_if_prometheus_agent_down: "true"
+              severity: "page"
+              team: "atlas"
+              topic: "observability"
+            exp_annotations:
+              description: "KubeStateMetrics () is down."
+              opsrecipe: "kube-state-metrics-down/"
+      # - 01:30 All is up again
+      - alertname: KubeStateMetricsDown
+        eval_time: 115m
+      # - 02:00 Port 8081 goes down
+      - alertname: KubeStateMetricsDown
+        eval_time: 145m
+      # - 02:30 all is up again
+      - alertname: KubeStateMetricsDown
+        eval_time: 175m
+      # - 03:00 we don't have enough metrics
+      - alertname: KubeStateMetricsDown
+        eval_time: 205m
+        exp_alerts:
+          - exp_labels:
+              area: "kaas"
+              cancel_if_apiserver_down: "true"
+              cancel_if_cluster_has_no_workers: "true"
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_kubelet_down: "true"
+              cancel_if_outside_working_hours: "false"
+              inhibit_kube_state_metrics_down: "true"
+              cancel_if_prometheus_agent_down: "true"
+              severity: "page"
+              team: "atlas"
+              topic: "observability"
+            exp_annotations:
+              description: "KubeStateMetrics () is down."
+              opsrecipe: "kube-state-metrics-down/"
+      # - 03:30 all is up again
+      - alertname: KubeStateMetricsDown
+        eval_time: 235m
+
+
+  # Tests for label-discovery targets
+  - name: "KSMDown with label discovery"
+    interval: 1m
+    input_series:
+      # - 00:00 Start with no metrics
+      # - 00:30 all goes up
+      # - 01:00 up goes down
+      # - 01:30 All is up again
+      - series: 'up{app="kube-state-metrics", cluster_id="testvintage", cluster_type="workload_cluster", customer="giantswarm", installation="testinstall", instance="10.0.2.4:10301", job="test-prometheus/workload-test/0", namespace="kube-system", node="ip-10-1-0-3.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-v2-3-0-67b5fdc5d4-78mhf", provider="aws", service_priority="highest"}'
+        values: "_x30 1x30 0x30 1x30"
+      - series: 'testmetric2{app="kube-state-metrics"}'
+        values: "0x1000"
+      - series: 'testmetric3{app="kube-state-metrics"}'
+        values: "0x1000"
+      - series: 'testmetric4{app="kube-state-metrics"}'
+        values: "0x1000"
+      - series: 'testmetric5{app="kube-state-metrics"}'
+        values: "0x1000"
+      - series: 'testmetric6{app="kube-state-metrics"}'
+        values: "0x1000"
+      - series: 'testmetric7{app="kube-state-metrics"}'
+        values: "0x1000"
+      - series: 'testmetric8{app="kube-state-metrics"}'
+        values: "0x1000"
+      - series: 'testmetric9{app="kube-state-metrics"}'
+        values: "0x1000"
+      - series: 'testmetric10{app="kube-state-metrics"}'
+        values: "0x1000"
+    alert_rule_test:
+      # - 00:00 Start with no metrics
+      - alertname: KubeStateMetricsDown
+        eval_time: 25m
+        exp_alerts:
+          - exp_labels:
+              area: "kaas"
+              cancel_if_apiserver_down: "true"
+              cancel_if_cluster_has_no_workers: "true"
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_kubelet_down: "true"
+              cancel_if_outside_working_hours: "false"
+              inhibit_kube_state_metrics_down: "true"
+              cancel_if_prometheus_agent_down: "true"
+              severity: "page"
+              team: "atlas"
+              topic: "observability"
+            exp_annotations:
+              description: "KubeStateMetrics () is down."
+              opsrecipe: "kube-state-metrics-down/"
+      # - 00:30 all goes up
+      - alertname: KubeStateMetricsDown
+        eval_time: 55m
+      # - 01:00 up goes down
+      - alertname: KubeStateMetricsDown
+        eval_time: 85m
+        exp_alerts:
+          - exp_labels:
+              area: "kaas"
+              cancel_if_apiserver_down: "true"
+              cancel_if_cluster_has_no_workers: "true"
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_kubelet_down: "true"
+              cancel_if_outside_working_hours: "false"
+              inhibit_kube_state_metrics_down: "true"
+              cancel_if_prometheus_agent_down: "true"
+              severity: "page"
+              team: "atlas"
+              topic: "observability"
+            exp_annotations:
+              description: "KubeStateMetrics () is down."
+              opsrecipe: "kube-state-metrics-down/"
+      # - 01:30 All is up again
+      - alertname: KubeStateMetricsDown
+        eval_time: 115m
diff --git a/test/tests/providers/global/loki.all.rules.test.yml b/test/tests/providers/global/loki.all.rules.test.yml
index 9d4762354..07c55c458 100644
--- a/test/tests/providers/global/loki.all.rules.test.yml
+++ b/test/tests/providers/global/loki.all.rules.test.yml
@@ -27,7 +27,7 @@ tests:
               cancel_if_cluster_status_creating: "true"
               cancel_if_cluster_status_deleting: "true"
               cancel_if_cluster_status_updating: "true"
-              cancel_if_outside_working_hours: "false"
+              cancel_if_outside_working_hours: "true"
               cancel_if_scrape_timeout: "true"
               job: zj88t-prometheus/workload-zj88t/0
               namespace: loki
@@ -53,7 +53,7 @@ tests:
               cancel_if_cluster_status_creating: "true"
               cancel_if_cluster_status_deleting: "true"
               cancel_if_cluster_status_updating: "true"
-              cancel_if_outside_working_hours: "false"
+              cancel_if_outside_working_hours: "true"
               cancel_if_scrape_timeout: "true"
               job: zj88t-prometheus/workload-zj88t/0
               namespace: loki
diff --git a/test/tests/providers/global/prometheus-agent.rules.test.yml b/test/tests/providers/global/prometheus-agent.rules.test.yml
index d23164296..7ed7ff3a0 100644
--- a/test/tests/providers/global/prometheus-agent.rules.test.yml
+++ b/test/tests/providers/global/prometheus-agent.rules.test.yml
@@ -3,17 +3,14 @@ rule_files:
 - prometheus-agent.rules.yml
 
 tests:
+  # Tests for `PrometheusAgentFailing` alert
   - interval: 1m
     input_series:
-      - series: 'up{instance="prometheus-agent",cluster_type="workload_cluster",cluster_id="gauss",installation="gauss"}'
-        values: "_x10 _x20 0+0x100 1+0x100"
-      - series: 'kube_statefulset_created{namespace="kube-system",statefulset="prometheus-prometheus-agent",cluster_id="gauss",installation="gauss"}'
-        values: "_x10 0+0x20 1+0x100 1+0x100"
+      - series: 'up{instance="prometheus-agent",cluster_type="workload_cluster",cluster_id="gauss",installation="myinstall"}'
+        values: "_x60 0+0x60 1+0x60"
     alert_rule_test:
       - alertname: PrometheusAgentFailing
-        eval_time: 10m
-      - alertname: PrometheusAgentFailing
-        eval_time: 25m
+        eval_time: 30m
         exp_alerts:
           - exp_labels:
               area: empowerment
@@ -21,16 +18,87 @@ tests:
               team: atlas
               topic: observability
               inhibit_prometheus_agent_down: "true"
+              instance: prometheus-agent
               cancel_if_cluster_is_not_running_prometheus_agent: "true"
               cancel_if_cluster_status_creating: "true"
               cancel_if_cluster_status_deleting: "true"
-              cancel_if_outside_working_hours: "true"
             exp_annotations:
               dashboard: "promRW001/prometheus-remote-write"
               description: "Prometheus agent remote write is failing."
               opsrecipe: "prometheus-agent-remote-write-failed/"
               summary: "Prometheus agent fails to send samples to remote write endpoint."
       - alertname: PrometheusAgentFailing
-        eval_time: 65m
+        eval_time: 90m
+        exp_alerts:
+          - exp_labels:
+              area: empowerment
+              cluster_id: gauss
+              cluster_type: workload_cluster
+              severity: page
+              team: atlas
+              topic: observability
+              inhibit_prometheus_agent_down: "true"
+              installation: myinstall
+              instance: prometheus-agent
+              cancel_if_cluster_is_not_running_prometheus_agent: "true"
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+            exp_annotations:
+              dashboard: "promRW001/prometheus-remote-write"
+              description: "Prometheus agent remote write is failing."
+              opsrecipe: "prometheus-agent-remote-write-failed/"
+              summary: "Prometheus agent fails to send samples to remote write endpoint."
       - alertname: PrometheusAgentFailing
-        eval_time: 165m
+        eval_time: 150m
+  # Tests for `PrometheusAgentShardsMissing` alert
+  - interval: 1m
+    input_series:
+      - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}'
+        values: "10000+0x180"
+      - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}'
+        values: "10000+0x180"
+      - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}'
+        values: "10000+0x180"
+      - series: 'prometheus_operator_spec_shards{cluster_id="test01", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}'
+        values: '3+0x60 5+0x60 3+0x60'
+      - series: 'prometheus_operator_spec_replicas{cluster_id="test01", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}'
+        values: '1+0x180'
+    alert_rule_test:
+      - alertname: PrometheusAgentShardsMissing
+        eval_time: 40m
+      - alertname: PrometheusAgentShardsMissing
+        eval_time: 100m
+        exp_alerts:
+          - exp_labels:
+              area: empowerment
+              severity: page
+              team: atlas
+              topic: observability
+              inhibit_prometheus_agent_down: "true"
+              cancel_if_cluster_is_not_running_prometheus_agent: "true"
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_outside_working_hours: "true"
+            exp_annotations:
+              description: "Prometheus agent is missing shards."
+              opsrecipe: "prometheus-agent-missing-shards/"
+              summary: "Prometheus agent is missing shards."
+      - alertname: PrometheusAgentShardsMissing
+        eval_time: 125m
+        exp_alerts:
+          - exp_labels:
+              area: empowerment
+              severity: page
+              team: atlas
+              topic: observability
+              inhibit_prometheus_agent_down: "true"
+              cancel_if_cluster_is_not_running_prometheus_agent: "true"
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_outside_working_hours: "true"
+            exp_annotations:
+              description: "Prometheus agent is missing shards."
+              opsrecipe: "prometheus-agent-missing-shards/"
+              summary: "Prometheus agent is missing shards."
+      - alertname: PrometheusAgentShardsMissing
+        eval_time: 130m
diff --git a/test/tests/providers/global/silence-operator.rules.test.yml b/test/tests/providers/global/silence-operator.rules.test.yml
index f66c2c6f3..f6556027d 100644
--- a/test/tests/providers/global/silence-operator.rules.test.yml
+++ b/test/tests/providers/global/silence-operator.rules.test.yml
@@ -18,6 +18,7 @@ tests:
               area: "empowerment"
               cancel_if_outside_working_hours: "true"
               controller: silence-controller
+              installation: "myinstall"
               severity: "page"
               team: "atlas"
               topic: "observability"
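To exercise the new and updated unit tests locally, promtool can run them directly — a minimal sketch, assuming the rendered rule files (e.g. kube-state-metrics.rules.yml) have already been generated next to the test files by the repo's own templating step (that step is not shown here and is an assumption):

    # Run one of the updated promtool unit test files locally.
    # Assumes kube-state-metrics.rules.yml already exists in the same directory as the test file.
    promtool test rules test/tests/providers/global/kube-state-metrics.rules.test.yml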