From 1d6acde6bbc63b3f767261bf646abc99f788f361 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Thu, 24 Aug 2023 16:04:32 +0200 Subject: [PATCH 01/27] Update module github.com/prometheus/alertmanager to v0.26.0 (#891) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- test/hack/checkLabels/go.mod | 18 +++++++++--------- test/hack/checkLabels/go.sum | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/test/hack/checkLabels/go.mod b/test/hack/checkLabels/go.mod index 2f7c71952..8211e986f 100644 --- a/test/hack/checkLabels/go.mod +++ b/test/hack/checkLabels/go.mod @@ -9,10 +9,10 @@ require ( sigs.k8s.io/yaml v1.3.0 ) -require github.com/prometheus/alertmanager v0.25.0 +require github.com/prometheus/alertmanager v0.26.0 require ( - github.com/aws/aws-sdk-go v1.44.156 // indirect + github.com/aws/aws-sdk-go v1.44.317 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/go-kit/kit v0.10.0 // indirect @@ -26,24 +26,24 @@ require ( github.com/jmespath/go-jmespath v0.4.0 // indirect github.com/jpillora/backoff v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/kr/pretty v0.3.0 // indirect + github.com/kr/pretty v0.3.1 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect github.com/pkg/errors v0.9.1 // indirect - github.com/prometheus/client_golang v1.14.0 // indirect - github.com/prometheus/client_model v0.3.0 // indirect - github.com/prometheus/common v0.38.0 // indirect + github.com/prometheus/client_golang v1.15.1 // indirect + github.com/prometheus/client_model v0.4.0 // indirect + github.com/prometheus/common v0.44.0 // indirect 
github.com/prometheus/common/sigv4 v0.1.0 // indirect - github.com/prometheus/procfs v0.8.0 // indirect + github.com/prometheus/procfs v0.9.0 // indirect github.com/rogpeppe/go-internal v1.10.0 // indirect golang.org/x/net v0.10.0 // indirect - golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1 // indirect + golang.org/x/oauth2 v0.8.0 // indirect golang.org/x/sys v0.8.0 // indirect golang.org/x/text v0.10.0 // indirect google.golang.org/appengine v1.6.7 // indirect - google.golang.org/protobuf v1.28.1 // indirect + google.golang.org/protobuf v1.30.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect k8s.io/api v0.27.2 // indirect diff --git a/test/hack/checkLabels/go.sum b/test/hack/checkLabels/go.sum index 8d3c616b8..7d5c8257d 100644 --- a/test/hack/checkLabels/go.sum +++ b/test/hack/checkLabels/go.sum @@ -67,6 +67,8 @@ github.com/aws/aws-sdk-go v1.34.28/go.mod h1:H7NKnBqNVzoTJpGfLrQkkD+ytBA93eiDYi/ github.com/aws/aws-sdk-go v1.38.35/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro= github.com/aws/aws-sdk-go v1.44.156 h1:3RhbBTZ87HoI5OP2JjcKdd5qOnyo9YOAW8+Bb/h0vSE= github.com/aws/aws-sdk-go v1.44.156/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= +github.com/aws/aws-sdk-go v1.44.317 h1:+8XWrLmGMwPPXSRSLPzhgcGnzJ2mYkgkrcB9C/GnSOU= +github.com/aws/aws-sdk-go v1.44.317/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= github.com/aws/aws-sdk-go-v2 v0.18.0/go.mod h1:JWVYvqSMppoMJC0x5wdwiImzgXTI9FuZwxzkQq9wy+g= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= @@ -391,6 +393,7 @@ github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFB github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= github.com/kr/pretty v0.3.0/go.mod 
h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= @@ -505,6 +508,8 @@ github.com/prometheus/alertmanager v0.22.2 h1:JrDZalSEMb2/2bqGAhls6ZnvOxbC5jMIu2 github.com/prometheus/alertmanager v0.22.2/go.mod h1:rYinOWxFuCnNssc3iOjn2oMTlhLaPcUuqV5yk5JKUAE= github.com/prometheus/alertmanager v0.25.0 h1:vbXKUR6PYRiZPRIKfmXaG+dmCKG52RtPL4Btl8hQGvg= github.com/prometheus/alertmanager v0.25.0/go.mod h1:MEZ3rFVHqKZsw7IcNS/m4AWZeXThmJhumpiWR4eHU/w= +github.com/prometheus/alertmanager v0.26.0 h1:uOMJWfIwJguc3NaM3appWNbbrh6G/OjvaHMk22aBBYc= +github.com/prometheus/alertmanager v0.26.0/go.mod h1:rVcnARltVjavgVaNnmevxK7kOn7IZavyf0KNgHkbEpU= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829/go.mod h1:p2iRAGwDERtqlqzRXnrOVns+ignqQo//hLXqYxZYVNs= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= @@ -514,6 +519,8 @@ github.com/prometheus/client_golang v1.10.0/go.mod h1:WJM3cc3yu7XKBKa/I8WeZm+V3e github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= github.com/prometheus/client_golang v1.14.0 h1:nJdhIvne2eSX/XRAFV9PcvFFRbrjbcTUj0VP62TMhnw= github.com/prometheus/client_golang v1.14.0/go.mod h1:8vpkKitgIVNcqrRBWh1C4TIUQgYNtG/XQE4E/Zae36Y= +github.com/prometheus/client_golang v1.15.1 h1:8tXpTmJbyH5lydzFPoxSIJ0J46jdh3tylbvM1xCv0LI= +github.com/prometheus/client_golang v1.15.1/go.mod h1:e9yaBhRPU2pPNsZwE+JdQl0KEt1N9XgF6zxWmaC0xOk= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= 
github.com/prometheus/client_model v0.0.0-20190115171406-56726106282f/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= @@ -522,6 +529,8 @@ github.com/prometheus/client_model v0.1.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6T github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4= github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= +github.com/prometheus/client_model v0.4.0 h1:5lQXD3cAg1OXBf4Wq03gTrXHeaV0TQvGfUooCfx1yqY= +github.com/prometheus/client_model v0.4.0/go.mod h1:oMQmHW1/JoDwqLtg57MGgP/Fb1CJEYF2imWWhWtMkYU= github.com/prometheus/common v0.2.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.7.0/go.mod h1:DjGbpBbp5NYNiECxcL/VnbXCCaQpKd3tt26CguLLsqA= @@ -533,6 +542,8 @@ github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9 github.com/prometheus/common v0.29.0/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls= github.com/prometheus/common v0.38.0 h1:VTQitp6mXTdUoCmDMugDVOJ1opi6ADftKfp/yeqTR/E= github.com/prometheus/common v0.38.0/go.mod h1:MBXfmBQZrK5XpbCkjofnXs96LD2QQ7fEq4C0xjC/yec= +github.com/prometheus/common v0.44.0 h1:+5BrQJwiBB9xsMygAB3TNvpQKOwlkc25LbISbrdOOfY= +github.com/prometheus/common v0.44.0/go.mod h1:ofAIvZbQ1e/nugmZGz4/qCb9Ap1VoSTIO7x0VV9VvuY= github.com/prometheus/common/sigv4 v0.1.0 h1:qoVebwtwwEhS85Czm2dSROY5fTo2PAPEVdDeppTwGX4= github.com/prometheus/common/sigv4 v0.1.0/go.mod h1:2Jkxxk9yYvCkE5G1sQT7GuEXm57JrvHu9k5YwTjsNtI= github.com/prometheus/exporter-toolkit v0.5.1/go.mod h1:OCkM4805mmisBhLmVFw858QYi3v0wKdY6/UxrT0pZVg= @@ -544,6 +555,8 @@ github.com/prometheus/procfs v0.1.3/go.mod 
h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4O github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.8.0 h1:ODq8ZFEaYeCaZOJlZZdJA2AbQR98dSHSM1KW/You5mo= github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0uaxHdg830/4= +github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI= +github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= github.com/rogpeppe/go-internal v1.1.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= @@ -552,6 +565,7 @@ github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFR github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= github.com/rogpeppe/go-internal v1.8.0 h1:FCbCCtXNOY3UtUuHUYaghJg4y7Fd14rXifAYUAtL9R8= github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/rs/cors v1.7.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= @@ -746,6 +760,8 @@ golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4Iltr golang.org/x/oauth2 v0.0.0-20210514164344-f6687ab2804c/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1 h1:lxqLZaMad/dJHMFZH0NiNpiEZI/nhgWhe4wgzpE+MuA= golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= 
+golang.org/x/oauth2 v0.8.0 h1:6dkIjl3j3LtZ/O3sTgZTMsLKSftL/B8Zgq4huOIIUu8= +golang.org/x/oauth2 v0.8.0/go.mod h1:yr7u4HXZRm1R1kBWqr/xKNqewf0plRYoB7sla+BCIXE= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -1005,6 +1021,8 @@ google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp0 google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.1 h1:d0NfwRgPtno5B1Wa6L2DAG+KivqkdutMf1UhdNx175w= google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= +google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= From d6bbc0c91f082aa4ffb2d56f67ba3a11fa32e1a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Nicol?= Date: Thu, 24 Aug 2023 17:42:18 +0200 Subject: [PATCH 02/27] add unit tests for KSMDown (#892) Co-authored-by: Herve Nicol <12008875+hervenicol@users.noreply.github.com> Co-authored-by: Quentin Bisson --- CHANGELOG.md | 4 + .../templates/alerting-rules/up.all.rules.yml | 2 +- test/conf/promtool_ignore | 1 - .../providers/global/up.all.rules.test.yml | 193 ++++++++++++++++++ 4 files changed, 198 insertions(+), 2 deletions(-) create mode 100644 test/tests/providers/global/up.all.rules.test.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 
c20ce3fab..14d71e272 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Unit tests for KubeStateMetricsDown + ## [2.127.0] - 2023-08-21 ### Changed diff --git a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml index afd099527..0ab25784b 100644 --- a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml @@ -72,7 +72,7 @@ spec: cancel_if_cluster_has_no_workers: "true" inhibit_kube_state_metrics_down: "true" cancel_if_kubelet_down: "true" - cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} + cancel_if_outside_working_hours: "false" severity: page team: atlas topic: observability diff --git a/test/conf/promtool_ignore b/test/conf/promtool_ignore index 5393cce32..3112fea9d 100644 --- a/test/conf/promtool_ignore +++ b/test/conf/promtool_ignore @@ -71,7 +71,6 @@ templates/alerting-rules/systemd.workload-cluster.rules.yml templates/alerting-rules/tiller.all.rules.yml templates/alerting-rules/tiller.workload-cluster.rules.yml templates/alerting-rules/timesync.rules.yml -templates/alerting-rules/up.all.rules.yml templates/alerting-rules/up.management-cluster.rules.yml templates/alerting-rules/vault.rules.yml templates/recording-rules/grafana-cloud.rules.yml diff --git a/test/tests/providers/global/up.all.rules.test.yml b/test/tests/providers/global/up.all.rules.test.yml new file mode 100644 index 000000000..88ed88926 --- /dev/null +++ b/test/tests/providers/global/up.all.rules.test.yml @@ -0,0 +1,193 @@ +--- +rule_files: +- up.all.rules.yml + +tests: + # KubeStateMetricsDown tests + # Tests to be run: + # - no "up" metrics + # - "up" metrics with servicemonitor discovery (ports 8080 and 8081) + # - "up" metric for port 8080 is OK, but port 8081 is set to 0 + # - "up" metric for 
port 8080 is set to 0, but port 8081 is OK + # - "up" metrics with label discovery (random port) + # - "up" is ok, but we don't have enough metrics + - name: "KSMDown with servicemonitor discovery" + interval: 1m + input_series: + # Tests for servicemonitor discovery + # - 00:00 Start with no metrics + # - 00:30 Both ports up and enough metrics + # - 01:00 Port 8080 goes down + # - 01:30 All is up again + # - 02:00 Port 8081 goes down + # - 02:30 all is up again + # - 03:00 we don't have enough metrics + # - 03:30 all is up again + - series: 'up{app="kube-state-metrics", cluster_id="testinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="testinstall", instance="192.0.2.10:8080", job="kube-state-metrics", namespace="kube-system", node="ip-10-0-1-1.eu-west-1.compute.internal", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-72vzx", provider="aws", region="eu-west-1", service="prometheus-operator-app-kube-state-metrics", service_priority="highest"}' + values: "_x30 1x30 0x30 1x30 1x30 1x30 1x30 1x30" + - series: 'up{app="kube-state-metrics", cluster_id="testinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="metrics", installation="testinstall", instance="192.0.2.10:8081", job="kube-state-metrics", namespace="kube-system", node="ip-10-0-1-1.eu-west-1.compute.internal", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-72vzx", provider="aws", region="eu-west-1", service="prometheus-operator-app-kube-state-metrics", service_priority="highest"}' + values: "_x30 1x30 1x30 1x30 0x30 1x30 1x30 1x30" + - series: 'testmetric2{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric3{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric4{app="kube-state-metrics"}' + values: "0x1000" + - series: 
'testmetric5{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric6{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric7{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric8{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric9{app="kube-state-metrics"}' + values: "_x30 1x30 1x30 1x30 1x30 1x30 _x30 1x30" + alert_rule_test: + # - 00:00 Start with no metrics + - alertname: KubeStateMetricsDown + eval_time: 25m + exp_alerts: + - exp_labels: + area: "kaas" + cancel_if_apiserver_down: "true" + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "false" + inhibit_kube_state_metrics_down: "true" + severity: "page" + team: "atlas" + topic: "observability" + exp_annotations: + description: "KubeStateMetrics () is down." + opsrecipe: "kube-state-metrics-down/" + # - 00:30 Both ports up and enough metrics + - alertname: KubeStateMetricsDown + eval_time: 55m + # - 01:00 Port 8080 goes down + - alertname: KubeStateMetricsDown + eval_time: 85m + exp_alerts: + - exp_labels: + area: "kaas" + cancel_if_apiserver_down: "true" + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "false" + inhibit_kube_state_metrics_down: "true" + severity: "page" + team: "atlas" + topic: "observability" + exp_annotations: + description: "KubeStateMetrics () is down." 
+ opsrecipe: "kube-state-metrics-down/" + # - 01:30 All is up again + - alertname: KubeStateMetricsDown + eval_time: 115m + # - 02:00 Port 8081 goes down + - alertname: KubeStateMetricsDown + eval_time: 145m + # - 02:30 all is up again + - alertname: KubeStateMetricsDown + eval_time: 175m + # - 03:00 we don't have enough metrics + - alertname: KubeStateMetricsDown + eval_time: 205m + exp_alerts: + - exp_labels: + area: "kaas" + cancel_if_apiserver_down: "true" + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "false" + inhibit_kube_state_metrics_down: "true" + severity: "page" + team: "atlas" + topic: "observability" + exp_annotations: + description: "KubeStateMetrics () is down." + opsrecipe: "kube-state-metrics-down/" + # - 03:30 all is up again + - alertname: KubeStateMetricsDown + eval_time: 235m + + + # Tests for label-discovery targets + - name: "KSMDown with label discovery" + interval: 1m + input_series: + # - 00:00 Start with no metrics + # - 00:30 all goes up + # - 01:00 up goes down + # - 01:30 All is up again + - series: 'up{app="kube-state-metrics", cluster_id="testvintage", cluster_type="workload_cluster", customer="giantswarm", installation="testinstall", instance="10.0.2.4:10301", job="test-prometheus/workload-test/0", namespace="kube-system", node="ip-10-1-0-3.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-v2-3-0-67b5fdc5d4-78mhf", provider="aws", service_priority="highest"}' + values: "_x30 1x30 0x30 1x30" + - series: 'testmetric2{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric3{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric4{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric5{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric6{app="kube-state-metrics"}' + values: "0x1000" + - series: 
'testmetric7{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric8{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric9{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric10{app="kube-state-metrics"}' + values: "0x1000" + alert_rule_test: + # - 00:00 Start with no metrics + - alertname: KubeStateMetricsDown + eval_time: 25m + exp_alerts: + - exp_labels: + area: "kaas" + cancel_if_apiserver_down: "true" + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "false" + inhibit_kube_state_metrics_down: "true" + severity: "page" + team: "atlas" + topic: "observability" + exp_annotations: + description: "KubeStateMetrics () is down." + opsrecipe: "kube-state-metrics-down/" + # - 00:30 all goes up + - alertname: KubeStateMetricsDown + eval_time: 55m + # - 01:00 up goes down + - alertname: KubeStateMetricsDown + eval_time: 85m + exp_alerts: + - exp_labels: + area: "kaas" + cancel_if_apiserver_down: "true" + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "false" + inhibit_kube_state_metrics_down: "true" + severity: "page" + team: "atlas" + topic: "observability" + exp_annotations: + description: "KubeStateMetrics () is down." 
+ opsrecipe: "kube-state-metrics-down/" + # - 01:30 All is up again + - alertname: KubeStateMetricsDown + eval_time: 115m From 34285df3073ba475f699cbfb42db9448d9a7edef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Nicol?= Date: Thu, 31 Aug 2023 21:25:49 +0200 Subject: [PATCH 03/27] Loki alerts only during working hours (#893) Co-authored-by: Herve Nicol <12008875+hervenicol@users.noreply.github.com> --- CHANGELOG.md | 4 ++++ .../templates/alerting-rules/loki.all.rules.yml | 4 ++-- test/tests/providers/global/loki.all.rules.test.yml | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14d71e272..820771b70 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Unit tests for KubeStateMetricsDown +### Changed + +- Loki alerts only during working hours + ## [2.127.0] - 2023-08-21 ### Changed diff --git a/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml index e3a198d18..54399dc3f 100644 --- a/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml @@ -27,7 +27,7 @@ spec: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_scrape_timeout: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: "true" severity: page team: atlas topic: observability @@ -44,7 +44,7 @@ spec: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_scrape_timeout: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: "true" severity: page team: atlas topic: observability diff --git a/test/tests/providers/global/loki.all.rules.test.yml b/test/tests/providers/global/loki.all.rules.test.yml index 9d4762354..07c55c458 100644 --- 
a/test/tests/providers/global/loki.all.rules.test.yml +++ b/test/tests/providers/global/loki.all.rules.test.yml @@ -27,7 +27,7 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: "true" cancel_if_scrape_timeout: "true" job: zj88t-prometheus/workload-zj88t/0 namespace: loki @@ -53,7 +53,7 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: "true" cancel_if_scrape_timeout: "true" job: zj88t-prometheus/workload-zj88t/0 namespace: loki From 97234c1f99b91d429197fed609c4940feb02c8d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Nicol?= Date: Mon, 4 Sep 2023 13:08:18 +0200 Subject: [PATCH 04/27] PrometheusAgentFailing does not rely on KSM metrics anymore (#897) Co-authored-by: Herve Nicol <12008875+hervenicol@users.noreply.github.com> --- CHANGELOG.md | 1 + .../alerting-rules/prometheus-agent.rules.yml | 3 +- .../global/prometheus-agent.rules.test.yml | 35 ++++++++++++++----- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 820771b70..98cd41ce0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Loki alerts only during working hours +- `PrometheusAgentFailing` does not rely on KSM metrics anymore ## [2.127.0] - 2023-08-21 diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml index 3946e893b..d9414c212 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml @@ -18,7 +18,8 @@ 
spec: summary: Prometheus agent fails to send samples to remote write endpoint. opsrecipe: prometheus-agent-remote-write-failed/ dashboard: promRW001/prometheus-remote-write - expr: count(absent_over_time(up{instance="prometheus-agent"}[10m])) and count((present_over_time(kube_statefulset_created{namespace="kube-system",statefulset=~"prometheus-prometheus-agent.*"}[10m]))) + # expr: count(absent_over_time(up{instance="prometheus-agent"}[10m])) + expr: up{instance="prometheus-agent"} == 0 or absent(up{instance="prometheus-agent"}) == 1 for: 10m labels: area: empowerment diff --git a/test/tests/providers/global/prometheus-agent.rules.test.yml b/test/tests/providers/global/prometheus-agent.rules.test.yml index d23164296..e065c9bc5 100644 --- a/test/tests/providers/global/prometheus-agent.rules.test.yml +++ b/test/tests/providers/global/prometheus-agent.rules.test.yml @@ -5,15 +5,11 @@ rule_files: tests: - interval: 1m input_series: - - series: 'up{instance="prometheus-agent",cluster_type="workload_cluster",cluster_id="gauss",installation="gauss"}' - values: "_x10 _x20 0+0x100 1+0x100" - - series: 'kube_statefulset_created{namespace="kube-system",statefulset="prometheus-prometheus-agent",cluster_id="gauss",installation="gauss"}' - values: "_x10 0+0x20 1+0x100 1+0x100" + - series: 'up{instance="prometheus-agent",cluster_type="workload_cluster",cluster_id="gauss",installation="myinstall"}' + values: "_x60 0+0x60 1+0x60" alert_rule_test: - alertname: PrometheusAgentFailing - eval_time: 10m - - alertname: PrometheusAgentFailing - eval_time: 25m + eval_time: 30m exp_alerts: - exp_labels: area: empowerment @@ -21,6 +17,7 @@ tests: team: atlas topic: observability inhibit_prometheus_agent_down: "true" + instance: prometheus-agent cancel_if_cluster_is_not_running_prometheus_agent: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -31,6 +28,26 @@ tests: opsrecipe: "prometheus-agent-remote-write-failed/" summary: "Prometheus agent 
fails to send samples to remote write endpoint." - alertname: PrometheusAgentFailing - eval_time: 65m + eval_time: 90m + exp_alerts: + - exp_labels: + area: empowerment + cluster_id: gauss + cluster_type: workload_cluster + severity: page + team: atlas + topic: observability + inhibit_prometheus_agent_down: "true" + installation: myinstall + instance: prometheus-agent + cancel_if_cluster_is_not_running_prometheus_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + exp_annotations: + dashboard: "promRW001/prometheus-remote-write" + description: "Prometheus agent remote write is failing." + opsrecipe: "prometheus-agent-remote-write-failed/" + summary: "Prometheus agent fails to send samples to remote write endpoint." - alertname: PrometheusAgentFailing - eval_time: 165m + eval_time: 150m From 65c906ec2c06d13bc5e0e6d41bb649d3e798d8b7 Mon Sep 17 00:00:00 2001 From: ArchitectBot <61872893+architectbot@users.noreply.github.com> Date: Mon, 4 Sep 2023 14:19:24 +0200 Subject: [PATCH 05/27] Align files (#894) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: github-actions Co-authored-by: Hervé Nicol --- .../workflows/zz_generated.check_values_schema.yaml | 2 +- .github/workflows/zz_generated.create_release.yaml | 12 ++++++------ .../workflows/zz_generated.create_release_pr.yaml | 8 ++++---- .github/workflows/zz_generated.gitleaks.yaml | 2 +- Makefile | 2 +- Makefile.gen.app.mk | 2 +- renovate.json | 3 ++- 7 files changed, 16 insertions(+), 15 deletions(-) diff --git a/.github/workflows/zz_generated.check_values_schema.yaml b/.github/workflows/zz_generated.check_values_schema.yaml index 436f44c35..c22e5de9b 100644 --- a/.github/workflows/zz_generated.check_values_schema.yaml +++ b/.github/workflows/zz_generated.check_values_schema.yaml @@ -1,6 +1,6 @@ # DO NOT EDIT. 
Generated with: # -# devctl@6.5.0 +# devctl@6.7.0 # name: 'Values and schema' on: diff --git a/.github/workflows/zz_generated.create_release.yaml b/.github/workflows/zz_generated.create_release.yaml index 5c3f8e903..784c89ec5 100644 --- a/.github/workflows/zz_generated.create_release.yaml +++ b/.github/workflows/zz_generated.create_release.yaml @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.5.0 +# devctl@6.7.0 # name: Create Release on: @@ -15,7 +15,7 @@ on: jobs: debug_info: name: Debug info - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Print github context JSON run: | @@ -24,7 +24,7 @@ jobs: EOF gather_facts: name: Gather facts - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 outputs: project_go_path: ${{ steps.get_project_go_path.outputs.path }} ref_version: ${{ steps.ref_version.outputs.refversion }} @@ -84,7 +84,7 @@ jobs: echo "refversion=${refversion}" >> $GITHUB_OUTPUT update_project_go: name: Update project.go - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 if: ${{ needs.gather_facts.outputs.version != '' && needs.gather_facts.outputs.project_go_path != '' && needs.gather_facts.outputs.ref_version != 'true' }} needs: - gather_facts @@ -146,7 +146,7 @@ jobs: hub pull-request -f -m "${{ env.title }}" -b ${{ env.base }} -h ${{ env.branch }} -r ${{ github.actor }} create_release: name: Create release - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: - gather_facts if: ${{ needs.gather_facts.outputs.version }} @@ -194,7 +194,7 @@ jobs: create-release-branch: name: Create release branch - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: - gather_facts if: ${{ needs.gather_facts.outputs.version }} diff --git a/.github/workflows/zz_generated.create_release_pr.yaml b/.github/workflows/zz_generated.create_release_pr.yaml index 207cea03f..3dbc0db40 100644 --- a/.github/workflows/zz_generated.create_release_pr.yaml +++ b/.github/workflows/zz_generated.create_release_pr.yaml @@ -1,6 +1,6 @@ # DO NOT EDIT. 
Generated with: # -# devctl@6.5.0 +# devctl@6.7.0 # name: Create Release PR on: @@ -30,7 +30,7 @@ on: jobs: debug_info: name: Debug info - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Print github context JSON run: | @@ -39,7 +39,7 @@ jobs: EOF gather_facts: name: Gather facts - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 outputs: repo_name: ${{ steps.gather_facts.outputs.repo_name }} branch: ${{ steps.gather_facts.outputs.branch }} @@ -136,7 +136,7 @@ jobs: fi create_release_pr: name: Create release PR - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: - gather_facts if: ${{ needs.gather_facts.outputs.skip != 'true' }} diff --git a/.github/workflows/zz_generated.gitleaks.yaml b/.github/workflows/zz_generated.gitleaks.yaml index 85cb3288a..860559f96 100644 --- a/.github/workflows/zz_generated.gitleaks.yaml +++ b/.github/workflows/zz_generated.gitleaks.yaml @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.5.0 +# devctl@6.7.0 # name: gitleaks diff --git a/Makefile b/Makefile index 2c6db5b13..f307dee41 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.5.0 +# devctl@6.7.0 # include Makefile.*.mk diff --git a/Makefile.gen.app.mk b/Makefile.gen.app.mk index fbd08071f..a3ff3a0b3 100644 --- a/Makefile.gen.app.mk +++ b/Makefile.gen.app.mk @@ -1,6 +1,6 @@ # DO NOT EDIT. 
Generated with: # -# devctl@6.5.0 +# devctl@6.7.0 # ##@ App diff --git a/renovate.json b/renovate.json index f4415e61e..be326a054 100644 --- a/renovate.json +++ b/renovate.json @@ -6,7 +6,8 @@ "dependencyDashboard": true, "ignorePaths": [ ".github/workflows/zz_generated.*", - ".github/workflows/codeql-analysis.yml" + ".github/workflows/codeql-analysis.yml", + ".github/workflows/pre_commit_*.yaml" ], "ignoreDeps": [ "architect", From f34c34dfe3cb80726a9d507d200d7ce365b62847 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Nicol?= Date: Tue, 5 Sep 2023 10:12:01 +0200 Subject: [PATCH 06/27] Prometheus-agent inhibition rework, run on the MC (#896) * Prometheus-agent inhibition rework, run on the MC * tests: name install * fix tests broken by named cluster --------- Co-authored-by: Herve Nicol <12008875+hervenicol@users.noreply.github.com> --- CHANGELOG.md | 1 + .../inhibit.prometheus-agent.rules.yml | 23 +++++++++++++--- test/hack/bin/template-chart.sh | 1 + .../inhibit.prometheus-agent.rules.test.yml | 27 +++++++++++-------- .../global/silence-operator.rules.test.yml | 1 + 5 files changed, 38 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 98cd41ce0..db3120b3e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Loki alerts only during working hours - `PrometheusAgentFailing` does not rely on KSM metrics anymore +- Prometheus-agent inhibition rework, run on the MC ## [2.127.0] - 2023-08-21 diff --git a/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml b/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml index 437c09f36..cd9fc2056 100644 --- a/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml @@ -12,14 +12,29 @@ spec: - name: inhibit.prometheus-agent rules: # this 
inhibition fires when a cluster is not running prometheus-agent. - # If we have prometheus-agent statefulset, it means prometheus-agent is installed on this cluster - # so, raise an inhibition unless prometheus-agent runs on the cluster + # we retrieve the list of existing cluster IDs from `kube_namespace_created` + # excluding the MC's one, because it's always using prometheus-agent and namespace is not named after cluster name + # then compare it with the list of deployed prometheus-agents from `app_operator_app_info` # - # Will produce data (and inhibitions) on MC/WC. + # Will only produce data (and inhibitions) on MC because it's where app_operator is running + # but that's enough to have the inhibitions on the installation-global alertmanager - alert: InhibitionClusterIsNotRunningPrometheusAgent annotations: description: '{{`Cluster ({{ $labels.cluster_id }}) is not running Prometheus Agent.`}}' - expr: (count by (cluster_id) (prometheus_build_info{app="prometheus"}) unless count by (cluster_id) (kube_statefulset_created{namespace="kube-system",statefulset=~"prometheus-prometheus-agent.*"} > 0)) + expr: |- + count( + label_replace( + kube_namespace_created{namespace!="{{ .Values.managementCluster.name }}-prometheus", namespace=~".+-prometheus"}, + "cluster_id", "$1", "namespace", "(.+)-prometheus" + ) + ) by (cluster_id) + unless + count( + label_replace( + app_operator_app_info{app="prometheus-agent"}, + "cluster_id", "$1", "namespace", "(.*)" + ) + ) by (cluster_id) labels: cluster_is_not_running_prometheus_agent: "true" area: empowerment diff --git a/test/hack/bin/template-chart.sh b/test/hack/bin/template-chart.sh index a9ab7dda6..ce82ecfd9 100755 --- a/test/hack/bin/template-chart.sh +++ b/test/hack/bin/template-chart.sh @@ -17,6 +17,7 @@ main() { "$GIT_WORKDIR"/helm/prometheus-rules \ --set="managementCluster.provider.flavor=${BASH_REMATCH[1]}" \ --set="managementCluster.provider.kind=${BASH_REMATCH[2]}" \ + --set="managementCluster.name=myinstall" \ 
--output-dir "$GIT_WORKDIR"/test/hack/output/"$provider" done } diff --git a/test/tests/providers/global/inhibit.prometheus-agent.rules.test.yml b/test/tests/providers/global/inhibit.prometheus-agent.rules.test.yml index a3c738166..bc83d6ef2 100644 --- a/test/tests/providers/global/inhibit.prometheus-agent.rules.test.yml +++ b/test/tests/providers/global/inhibit.prometheus-agent.rules.test.yml @@ -5,24 +5,29 @@ rule_files: tests: - interval: 1m input_series: - - series: 'prometheus_build_info{app="prometheus",cluster_id="gauss",instance="localhost:9090"}' + # - cluster 1: "clu01" + - series: 'kube_namespace_created{app="kube-state-metrics", cluster_id="myinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="myinstall", instance="100.64.25.34:8080", job="kube-state-metrics", namespace="clu01-prometheus", node="ip-10-0-5-14.eu-central-1.compute.internal", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-f7b868f49-ngvr8", service="prometheus-operator-app-kube-state-metrics"}' + values: '1671707388+0x40' + # - cluster 2: "clu02" + - series: 'kube_namespace_created{app="kube-state-metrics", cluster_id="myinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="myinstall", instance="100.64.25.34:8080", job="kube-state-metrics", namespace="clu02-prometheus", node="ip-10-0-5-14.eu-central-1.compute.internal", organization="giantswarm", pipeline="stable", pod="prometheus-operator-app-kube-state-metrics-f7b868f49-ngvr8", service="prometheus-operator-app-kube-state-metrics"}' + values: '1671707388+0x40' + # - cluster 3: "myinstall", the install name + - series: 'kube_namespace_created{app="kube-state-metrics", cluster_id="myinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="myinstall", 
instance="100.64.25.34:8080", job="kube-state-metrics", namespace="myinstall-prometheus", node="ip-10-0-5-14.eu-central-1.compute.internal", organization="giantswarm", pipeline="stable", pod="prometheus-operator-app-kube-state-metrics-f7b868f49-ngvr8", service="prometheus-operator-app-kube-state-metrics"}' + values: "1671707388+0x40" + # prometheus-agent app info for "clu01" + - series: 'app_operator_app_info{app="prometheus-agent", app_version="2.40.5", catalog="giantswarm-playground", cluster_id="myinstall", cluster_missing="false", cluster_type="management_cluster", customer="giantswarm", deployed_version="0.1.7", endpoint="web", installation="myinstall", instance="app-exporter", job="app-exporter", name="prometheus-agent", namespace="clu01", node="ip-10-0-5-141.eu-central-1.compute.internal", organization="giantswarm", pipeline="stable", pod="app-exporter-6865c9c648-sg5vg", service="app-exporter", status="deployed", team="atlas", upgrade_available="false", version="0.1.7", version_mismatch="false"}' values: "1+0x40" - - series: 'kube_statefulset_created{namespace="kube-system",cluster_id="gauss",statefulset="prometheus-prometheus-agent"}' - values: "1+0x20 0+0x20" - - series: 'kube_statefulset_created{namespace="kube-system",cluster_id="gauss",statefulset="prometheus-prometheus-agent-shard-1"}' - values: "1+0x20 0+0x20" alert_rule_test: + #- alertname: InhibitionClusterIsNotRunningPrometheusAgent + # eval_time: 1m - alertname: InhibitionClusterIsNotRunningPrometheusAgent - eval_time: 1m - - alertname: InhibitionClusterIsNotRunningPrometheusAgent - eval_time: 22m + eval_time: 10m exp_alerts: - exp_labels: area: empowerment team: atlas topic: monitoring cluster_is_not_running_prometheus_agent: "true" - cluster_id: "gauss" + cluster_id: "clu02" exp_annotations: - description: "Cluster (gauss) is not running Prometheus Agent." - + description: "Cluster (clu02) is not running Prometheus Agent." 
diff --git a/test/tests/providers/global/silence-operator.rules.test.yml b/test/tests/providers/global/silence-operator.rules.test.yml index f66c2c6f3..f6556027d 100644 --- a/test/tests/providers/global/silence-operator.rules.test.yml +++ b/test/tests/providers/global/silence-operator.rules.test.yml @@ -18,6 +18,7 @@ tests: area: "empowerment" cancel_if_outside_working_hours: "true" controller: silence-controller + installation: "myinstall" severity: "page" team: "atlas" topic: "observability" From cd8956569359ee8a900ac9783d10ca7441ab30a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Nicol?= Date: Tue, 5 Sep 2023 10:51:53 +0200 Subject: [PATCH 07/27] ManagementClusterApp alerts also check default catalog (#898) Co-authored-by: Herve Nicol <12008875+hervenicol@users.noreply.github.com> --- CHANGELOG.md | 1 + helm/prometheus-rules/templates/alerting-rules/app.rules.yml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index db3120b3e..0e05e8796 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Loki alerts only during working hours - `PrometheusAgentFailing` does not rely on KSM metrics anymore - Prometheus-agent inhibition rework, run on the MC +- `ManagementClusterApp` alerts now check for default catalog as well ## [2.127.0] - 2023-08-21 diff --git a/helm/prometheus-rules/templates/alerting-rules/app.rules.yml b/helm/prometheus-rules/templates/alerting-rules/app.rules.yml index 935126daa..842050904 100644 --- a/helm/prometheus-rules/templates/alerting-rules/app.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/app.rules.yml @@ -15,7 +15,7 @@ spec: annotations: description: '{{`Management Cluster App {{ $labels.name }}, version {{ $labels.version }} is {{if $labels.status }} in {{ $labels.status }} state. {{else}} not installed. 
{{end}}`}}' opsrecipe: app-failed/ - expr: app_operator_app_info{status!~"(?i:(deployed|cordoned))", catalog=~"control-plane-.*",team!~"^$|noteam"} + expr: app_operator_app_info{status!~"(?i:(deployed|cordoned))", catalog=~"(control-plane-.*|default)",team!~"^$|noteam", namespace=~".*gianstswarm"} for: 30m labels: area: managedservices @@ -30,7 +30,7 @@ spec: annotations: description: 'Current version of {{`App {{ $labels.name }} is {{ $labels.deployed_version }} but it should be {{ $labels.version }}.`}}' opsrecipe: app-pending-update/ - expr: app_operator_app_info{catalog=~"control-plane-.*", deployed_version!="", status="deployed", version_mismatch="true" ,team!~"^$|noteam"} + expr: app_operator_app_info{catalog=~"(control-plane-.*|default)", deployed_version!="", status="deployed", version_mismatch="true" ,team!~"^$|noteam", namespace=~".*gianstswarm"} for: 40m labels: area: managedservices From cecd17907eb78b1b0356b2a2fa02e7f0477724d5 Mon Sep 17 00:00:00 2001 From: Taylor Bot Date: Tue, 5 Sep 2023 10:02:25 +0100 Subject: [PATCH 08/27] Release v2.128.0 (#899) --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e05e8796..710f56248 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [2.128.0] - 2023-09-05 + ### Added - Unit tests for KubeStateMetricsDown @@ -2142,7 +2144,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add existing rules from https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2 -[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.127.0...HEAD +[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.128.0...HEAD +[2.128.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.127.0...v2.128.0 [2.127.0]: 
https://github.com/giantswarm/prometheus-rules/compare/v2.126.1...v2.127.0 [2.126.1]: https://github.com/giantswarm/prometheus-rules/compare/v2.126.0...v2.126.1 [2.126.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.125.0...v2.126.0 From eba652de3db108a689d587045c96655cd954557b Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Sun, 10 Sep 2023 20:20:28 +0200 Subject: [PATCH 09/27] Update actions/checkout action to v4 (#900) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- .github/workflows/alert_tests.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/alert_tests.yaml b/.github/workflows/alert_tests.yaml index ce5eb77b6..ecd515b38 100644 --- a/.github/workflows/alert_tests.yaml +++ b/.github/workflows/alert_tests.yaml @@ -7,7 +7,7 @@ jobs: promtool-unit-tests: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: "0" - name: run promtool unit tests @@ -15,7 +15,7 @@ jobs: inhibition-tests: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: "0" - name: run inhibition tests From 4f89687f235bf235b967b28a8c8dd465a51b0f56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Nicol?= Date: Mon, 11 Sep 2023 09:41:17 +0200 Subject: [PATCH 10/27] fix PrometheusAgentShardsMissing (#901) Co-authored-by: Herve Nicol <12008875+hervenicol@users.noreply.github.com> --- CHANGELOG.md | 5 +++ .../alerting-rules/prometheus-agent.rules.yml | 4 +-- .../global/prometheus-agent.rules.test.yml | 36 +++++++++++++++++++ 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 710f56248..814fcda31 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Unit tests for 
`PrometheusAgentShardsMissing` +- fixes for `PrometheusAgentShardsMissing` + ## [2.128.0] - 2023-09-05 ### Added diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml index d9414c212..caed5dd04 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml @@ -41,7 +41,7 @@ spec: count( ## number of remotes that are not mimir or grafana-cloud prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir"} - ) != ( + ) != sum( ## number of shards defined in the Prometheus CR prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} or ( @@ -51,7 +51,7 @@ spec: prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} ) ) - for: 30m + for: 10m labels: area: empowerment severity: page diff --git a/test/tests/providers/global/prometheus-agent.rules.test.yml b/test/tests/providers/global/prometheus-agent.rules.test.yml index e065c9bc5..7f2336445 100644 --- a/test/tests/providers/global/prometheus-agent.rules.test.yml +++ b/test/tests/providers/global/prometheus-agent.rules.test.yml @@ -3,6 +3,7 @@ rule_files: - prometheus-agent.rules.yml tests: + # Tests for `PrometheusAgentFailing` alert - interval: 1m input_series: - series: 'up{instance="prometheus-agent",cluster_type="workload_cluster",cluster_id="gauss",installation="myinstall"}' @@ -51,3 +52,38 @@ tests: summary: "Prometheus agent fails to send samples to remote write endpoint." 
- alertname: PrometheusAgentFailing eval_time: 150m + # Tests for `PrometheusAgentShardsMissing` alert + - interval: 1m + input_series: + - series: 'up{instance="prometheus-agent", cluster_type="workload_cluster", cluster_id="gauss",installation="myinstall", team="atlas"}' + values: "_x60 0+0x60 1+0x60" + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + values: "10000+0x120" + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + values: "10000+0x120" + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + values: "10000+0x120" + - series: 'prometheus_operator_spec_shards{cluster_id="test01", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' + values: '3+0x60 5+0x60' + - series: 'prometheus_operator_spec_replicas{cluster_id="test01", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", 
pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' + values: '1+0x120' + alert_rule_test: + - alertname: PrometheusAgentShardsMissing + eval_time: 40m + - alertname: PrometheusAgentShardsMissing + eval_time: 100m + exp_alerts: + - exp_labels: + area: empowerment + severity: page + team: atlas + topic: observability + inhibit_prometheus_agent_down: "true" + cancel_if_cluster_is_not_running_prometheus_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + exp_annotations: + description: "Prometheus agent is missing shards." + opsrecipe: "prometheus-agent-missing-shards/" + summary: "Prometheus agent is missing shards." From 2bc9873424d12ac7e02d1b865f11db6ce0841690 Mon Sep 17 00:00:00 2001 From: Taylor Bot Date: Mon, 11 Sep 2023 12:12:31 +0100 Subject: [PATCH 11/27] Release v2.129.0 (#902) --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 814fcda31..a9bb2c7da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [2.129.0] - 2023-09-11 + ### Changed - Unit tests for `PrometheusAgentShardsMissing` @@ -2149,7 +2151,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add existing rules from https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2 -[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.128.0...HEAD +[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.129.0...HEAD +[2.129.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.128.0...v2.129.0 [2.128.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.127.0...v2.128.0 [2.127.0]: 
https://github.com/giantswarm/prometheus-rules/compare/v2.126.1...v2.127.0 [2.126.1]: https://github.com/giantswarm/prometheus-rules/compare/v2.126.0...v2.126.1 From 5d6fe7c46a3e9fdc610397975f11ad824e8cfaa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C3=ADas=20Charri=C3=A8re?= Date: Tue, 12 Sep 2023 09:23:47 +0200 Subject: [PATCH 12/27] Refactor kyverno_policy_deployment_status_team aggregation metric (#895) * Refactor kyverno_policy_deployment_status_team aggregation metric This refactor removes the need for a third query (app_operator_app_info) which usually brings mismatching data (suffix `-app` missing from app labels). In order to do that, it uses the `application_giantswarm_io_team` label, which is generally present in our workload. Signed-off-by: Matias Charriere * fix missing comma Signed-off-by: Matias Charriere --------- Signed-off-by: Matias Charriere --- CHANGELOG.md | 4 + .../recording-rules/grafana-cloud.rules.yml | 100 +++++++----------- 2 files changed, 44 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a9bb2c7da..e5b001e7d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Refactor the Kyverno policy reports recording rule to include missing apps from Team Overview dashboard. 
+ ## [2.129.0] - 2023-09-11 ### Changed diff --git a/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml b/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml index 365e0e62f..e1bdfc241 100644 --- a/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml @@ -318,18 +318,14 @@ spec: policy!="check-deprecated-apis-1-25", cluster_type="management_cluster", kind=~"Deployment" - }, "deployment", ",", "name") - ) by (deployment, category, policy, status) - * on(deployment) group_left(team, app) - sum( - sum( - label_join(kube_deployment_labels{}, "app", ",", "label_app_kubernetes_io_name") - ) by (deployment, app) - * on(app) group_left(team) - sum( - app_operator_app_info{namespace=~".*giantswarm", team!="noteam"} - ) by (app, team) - ) by (team, deployment, app), + }, "deployment", ",", "name") + ) by (deployment, category, policy, status) + * on(deployment) group_left(team, app) sum( + label_join(label_join(kube_deployment_labels{ + cluster_type="management_cluster", + label_application_giantswarm_io_team!="" + }, "app", ",", "label_app_kubernetes_io_name"), "team", ",", "label_application_giantswarm_io_team") + ) by (team, app, deployment), "name", ",", "deployment") record: aggregation:kyverno_policy_deployment_status_team # Kyverno policy workload status by team - DaemonSets @@ -340,18 +336,14 @@ spec: policy!="check-deprecated-apis-1-25", cluster_type="management_cluster", kind=~"DaemonSet" - }, "daemonset", ",", "name") - ) by (daemonset, category, policy, status) - * on(daemonset) group_left(team, app) - sum( - sum( - label_join(kube_daemonset_labels{}, "app", ",", "label_app_kubernetes_io_name") - ) by (daemonset, app) - * on(app) group_left(team) - sum( - app_operator_app_info{namespace=~".*giantswarm", team!="noteam"} - ) by (app, team) - ) by (team, daemonset, app), + }, "daemonset", ",", "name") + ) by (daemonset, category, 
policy, status) + * on(daemonset) group_left(team, app) sum( + label_join(label_join(kube_daemonset_labels{ + cluster_type="management_cluster", + label_application_giantswarm_io_team!="" + }, "app", ",", "label_app_kubernetes_io_name"), "team", ",", "label_application_giantswarm_io_team") + ) by (team, app, daemonset), "name", ",", "daemonset") record: aggregation:kyverno_policy_daemonset_status_team # Kyverno policy workload status by team - StatefulSets @@ -362,18 +354,14 @@ spec: policy!="check-deprecated-apis-1-25", cluster_type="management_cluster", kind=~"StatefulSet" - }, "statefulset", ",", "name") - ) by (statefulset, category, policy, status) - * on(statefulset) group_left(team, app) - sum( - sum( - label_join(kube_statefulset_labels{}, "app", ",", "label_app_kubernetes_io_name") - ) by (statefulset, app) - * on(app) group_left(team) - sum( - app_operator_app_info{namespace=~".*giantswarm", team!="noteam"} - ) by (app, team) - ) by (team, statefulset, app), + }, "statefulset", ",", "name") + ) by (statefulset, category, policy, status) + * on(statefulset) group_left(team, app) sum( + label_join(label_join(kube_statefulset_labels{ + cluster_type="management_cluster", + label_application_giantswarm_io_team!="" + }, "app", ",", "label_app_kubernetes_io_name"), "team", ",", "label_application_giantswarm_io_team") + ) by (team, app, statefulset), "name", ",", "statefulset") record: aggregation:kyverno_policy_statefulset_status_team # Kyverno policy workload status by team - Job @@ -384,18 +372,14 @@ spec: policy!="check-deprecated-apis-1-25", cluster_type="management_cluster", kind=~"Job" - }, "job", ",", "name") - ) by (job, category, policy, status) - * on(job) group_left(team, app) - sum( - sum( - label_join(kube_job_labels{}, "app", ",", "label_app_kubernetes_io_name") - ) by (job, app) - * on(app) group_left(team) - sum( - app_operator_app_info{namespace=~".*giantswarm", team!="noteam"} - ) by (app, team) - ) by (team, job, app), + }, "job", ",", "name") 
+ ) by (job, category, policy, status) + * on(job) group_left(team, app) sum( + label_join(label_join(kube_job_labels{ + cluster_type="management_cluster", + label_application_giantswarm_io_team!="" + }, "app", ",", "label_app_kubernetes_io_name"), "team", ",", "label_application_giantswarm_io_team") + ) by (team, app, job), "name", ",", "job") record: aggregation:kyverno_policy_job_status_team # Kyverno policy workload status by team - CronJob @@ -406,18 +390,14 @@ spec: policy!="check-deprecated-apis-1-25", cluster_type="management_cluster", kind=~"CronJob" - }, "cronjob", ",", "name") - ) by (cronjob, category, policy, status) - * on(cronjob) group_left(team, app) - sum( - sum( - label_join(kube_cronjob_labels{}, "app", ",", "label_app_kubernetes_io_name") - ) by (cronjob, app) - * on(app) group_left(team) - sum( - app_operator_app_info{namespace=~".*giantswarm", team!="noteam"} - ) by (app, team) - ) by (team, cronjob, app), + }, "cronjob", ",", "name") + ) by (cronjob, category, policy, status) + * on(cronjob) group_left(team, app) sum( + label_join(label_join(kube_cronjob_labels{ + cluster_type="management_cluster", + label_application_giantswarm_io_team!="" + }, "app", ",", "label_app_kubernetes_io_name"), "team", ",", "label_application_giantswarm_io_team") + ) by (team, app, cronjob), "name", ",", "cronjob") record: aggregation:kyverno_policy_cronjob_status_team - name: starboard.grafana-cloud.recording From 43b582b3c2ef8f49e1c10b0a806549d1bc8d384e Mon Sep 17 00:00:00 2001 From: Jose Armesto Date: Tue, 12 Sep 2023 10:24:30 +0200 Subject: [PATCH 13/27] Change ClusterUnhealthyPhase severity to page (#903) --- CHANGELOG.md | 1 + .../templates/alerting-rules/capi-cluster.rules.yml | 2 +- test/tests/providers/capi/capz/capi-cluster.rules.test.yml | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e5b001e7d..74f0941ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to 
[Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Refactor the Kyverno policy reports recording rule to include missing apps from Team Overview dashboard. +- Change `ClusterUnhealthyPhase` severity to page, so that we get paged when a cluster is not working properly. ## [2.129.0] - 2023-09-11 diff --git a/helm/prometheus-rules/templates/alerting-rules/capi-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/capi-cluster.rules.yml index 227767b6d..982844ab1 100644 --- a/helm/prometheus-rules/templates/alerting-rules/capi-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/capi-cluster.rules.yml @@ -15,7 +15,7 @@ spec: labels: area: kaas cancel_if_outside_working_hours: {{include "workingHoursOnly" .}} - severity: notify + severity: page team: {{include "providerTeam" .}} topic: managementcluster annotations: diff --git a/test/tests/providers/capi/capz/capi-cluster.rules.test.yml b/test/tests/providers/capi/capz/capi-cluster.rules.test.yml index b6a970175..ff2c8c1a6 100644 --- a/test/tests/providers/capi/capz/capi-cluster.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-cluster.rules.test.yml @@ -21,7 +21,7 @@ tests: - exp_labels: area: kaas cancel_if_outside_working_hours: "true" - severity: notify + severity: page team: phoenix topic: managementcluster name: clippaxy From 218a66b0b8cb6a8dbe34fbe78b8598cfcf9ab1f7 Mon Sep 17 00:00:00 2001 From: Taylor Bot Date: Tue, 12 Sep 2023 09:29:18 +0100 Subject: [PATCH 14/27] Release v2.130.0 (#904) --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 74f0941ec..c4c36574a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [2.130.0] - 2023-09-12 + ### Changed - Refactor the Kyverno policy reports recording rule to include missing apps from Team Overview dashboard. 
@@ -2156,7 +2158,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add existing rules from https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2 -[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.129.0...HEAD +[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.130.0...HEAD +[2.130.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.129.0...v2.130.0 [2.129.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.128.0...v2.129.0 [2.128.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.127.0...v2.128.0 [2.127.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.126.1...v2.127.0 From 40af0000fdcc78def45e02449478b01a40223e81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C3=ADas=20Charri=C3=A8re?= Date: Tue, 12 Sep 2023 11:42:21 +0200 Subject: [PATCH 15/27] remove DNSRequestDurationTooSlow alert (#905) Signed-off-by: Matias Charriere --- CHANGELOG.md | 4 ++++ .../templates/alerting-rules/coredns.rules.yml | 14 -------------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c4c36574a..5364edc76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Remove `DNSRequestDurationTooSlow` in favor of SLO alerting. 
+ ## [2.130.0] - 2023-09-12 ### Changed diff --git a/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml b/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml index 127deaf10..af5454208 100644 --- a/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml @@ -35,17 +35,3 @@ spec: topic: dns annotations: description: '{{`CoreDNS Deployment {{ $labels.namespace}}/{{ $labels.deployment }} has been scaled to its maximum replica count for too long.`}}' - - alert: DNSRequestDurationTooSlow - expr: histogram_quantile(0.99, sum(irate(coredns_dns_request_duration_seconds_bucket{app="coredns"}[5m])) by (le)) > 1 - for: 15m - labels: - area: empowerment - severity: page - team: cabbage - topic: dns - annotations: - description: '{{`CoreDNS requests are taking more than 1 second to be responded.`}}' - opsrecipe: dns-request-duration-too-slow/ - dashboard: Yu9tkufmk/dns - - From fa81e7d0b6cb3674c25661886bb2d1754769af21 Mon Sep 17 00:00:00 2001 From: Taylor Bot Date: Tue, 12 Sep 2023 10:49:03 +0100 Subject: [PATCH 16/27] Release v2.131.0 (#906) --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5364edc76..b76d4adaf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [2.131.0] - 2023-09-12 + ### Changed - Remove `DNSRequestDurationTooSlow` in favor of SLO alerting. 
@@ -2162,7 +2164,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add existing rules from https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2 -[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.130.0...HEAD +[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.131.0...HEAD +[2.131.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.130.0...v2.131.0 [2.130.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.129.0...v2.130.0 [2.129.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.128.0...v2.129.0 [2.128.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.127.0...v2.128.0 From 0b35e79cc28e0152aad4cd8d10db37e126d6bde2 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Thu, 14 Sep 2023 09:57:06 +0200 Subject: [PATCH 17/27] Update module github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring to v0.68.0 (#907) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- test/hack/checkLabels/go.mod | 16 ++++++++-------- test/hack/checkLabels/go.sum | 16 ++++++++++++++++ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/test/hack/checkLabels/go.mod b/test/hack/checkLabels/go.mod index 8211e986f..5bf987e66 100644 --- a/test/hack/checkLabels/go.mod +++ b/test/hack/checkLabels/go.mod @@ -5,7 +5,7 @@ go 1.19 require ( // Try to keep version in sync with our prometheus rule CRD version. 
// see https://github.com/giantswarm/prometheus-operator-crd/blob/master/helm/prometheus-operator-crd/Chart.yaml#L11 - github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.67.1 + github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.68.0 sigs.k8s.io/yaml v1.3.0 ) @@ -38,18 +38,18 @@ require ( github.com/prometheus/common/sigv4 v0.1.0 // indirect github.com/prometheus/procfs v0.9.0 // indirect github.com/rogpeppe/go-internal v1.10.0 // indirect - golang.org/x/net v0.10.0 // indirect + golang.org/x/net v0.15.0 // indirect golang.org/x/oauth2 v0.8.0 // indirect - golang.org/x/sys v0.8.0 // indirect - golang.org/x/text v0.10.0 // indirect + golang.org/x/sys v0.12.0 // indirect + golang.org/x/text v0.13.0 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/protobuf v1.30.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect - k8s.io/api v0.27.2 // indirect - k8s.io/apimachinery v0.27.2 // indirect + k8s.io/api v0.28.1 // indirect + k8s.io/apimachinery v0.28.1 // indirect k8s.io/klog/v2 v2.100.1 // indirect - k8s.io/utils v0.0.0-20230505201702-9f6742963106 // indirect + k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.3.0 // indirect ) diff --git a/test/hack/checkLabels/go.sum b/test/hack/checkLabels/go.sum index 7d5c8257d..4dd053dd3 100644 --- a/test/hack/checkLabels/go.sum +++ b/test/hack/checkLabels/go.sum @@ -504,6 +504,8 @@ github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.66.0 h github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.66.0/go.mod h1:KZHvrby65G+rA4V/vMTUXDV22TI+GgLIrCigYClpjzk= github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.67.1 h1:u1Mw9irznvsBPxQxjUmCel1ufP3UgzA1CILj7/2tpNw= 
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.67.1/go.mod h1:KZHvrby65G+rA4V/vMTUXDV22TI+GgLIrCigYClpjzk= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.68.0 h1:yl9ceUSUBo9woQIO+8eoWpcxZkdZgm89g+rVvu37TUw= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.68.0/go.mod h1:9Uuu3pEU2jB8PwuqkHvegQ0HV/BlZRJUyfTYAqfdVF8= github.com/prometheus/alertmanager v0.22.2 h1:JrDZalSEMb2/2bqGAhls6ZnvOxbC5jMIu29JV+uWTC0= github.com/prometheus/alertmanager v0.22.2/go.mod h1:rYinOWxFuCnNssc3iOjn2oMTlhLaPcUuqV5yk5JKUAE= github.com/prometheus/alertmanager v0.25.0 h1:vbXKUR6PYRiZPRIKfmXaG+dmCKG52RtPL4Btl8hQGvg= @@ -752,6 +754,8 @@ golang.org/x/net v0.9.0 h1:aWJ/m6xSmxWBx+V0XRHTlrYrPG56jKsLdTFmsSsCzOM= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0 h1:ugBLEUaxABaB5AJqW9enI0ACdci2RUd4eP51NTBvuJ8= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -843,6 +847,8 @@ golang.org/x/sys v0.7.0 h1:3jlCCIQZPdOYu1h8BkNvLz8Kgwtae2cagcG/VamtZRU= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term 
v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -867,6 +873,8 @@ golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.10.0 h1:UpjohKhiEgNc0CSauXmwYftY1+LlaC75SJwh0SgCX58= golang.org/x/text v0.10.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1068,6 +1076,8 @@ k8s.io/api v0.27.1 h1:Z6zUGQ1Vd10tJ+gHcNNNgkV5emCyW+v2XTmn+CLjSd0= k8s.io/api v0.27.1/go.mod h1:z5g/BpAiD+f6AArpqNjkY+cji8ueZDU/WV1jcj5Jk4E= k8s.io/api v0.27.2 h1:+H17AJpUMvl+clT+BPnKf0E3ksMAzoBBg7CntpSuADo= k8s.io/api v0.27.2/go.mod h1:ENmbocXfBT2ADujUXcBhHV55RIT31IIEvkntP6vZKS4= +k8s.io/api v0.28.1 h1:i+0O8k2NPBCPYaMB+uCkseEbawEt/eFaiRqUx8aB108= +k8s.io/api v0.28.1/go.mod h1:uBYwID+66wiL28Kn2tBjBYQdEU0Xk0z5qF8bIBqk/Dg= k8s.io/apimachinery v0.25.4 h1:CtXsuaitMESSu339tfhVXhQrPET+EiWnIY1rcurKnAc= k8s.io/apimachinery v0.25.4/go.mod h1:jaF9C/iPNM1FuLl7Zuy5b9v+n35HGSh6AQ4HYRkCqwo= k8s.io/apimachinery v0.26.1 h1:8EZ/eGJL+hY/MYCNwhmDzVqq2lPl3N3Bo8rvweJwXUQ= @@ -1076,6 +1086,8 @@ k8s.io/apimachinery v0.27.1 h1:EGuZiLI95UQQcClhanryclaQE6xjg1Bts6/L3cD7zyc= k8s.io/apimachinery v0.27.1/go.mod h1:5ikh59fK3AJ287GUvpUsryoMFtH9zj/ARfWCo3AyXTM= k8s.io/apimachinery 
v0.27.2 h1:vBjGaKKieaIreI+oQwELalVG4d8f3YAMNpWLzDXkxeg= k8s.io/apimachinery v0.27.2/go.mod h1:XNfZ6xklnMCOGGFNqXG7bUrQCoR04dh/E7FprV6pb+E= +k8s.io/apimachinery v0.28.1 h1:EJD40og3GizBSV3mkIoXQBsws32okPOy+MkRyzh6nPY= +k8s.io/apimachinery v0.28.1/go.mod h1:X0xh/chESs2hP9koe+SdIAcXWcQ+RM5hy0ZynB+yEvw= k8s.io/klog/v2 v2.80.1 h1:atnLQ121W371wYYFawwYx1aEY2eUfs4l3J72wtgAwV4= k8s.io/klog/v2 v2.80.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= k8s.io/klog/v2 v2.90.0 h1:VkTxIV/FjRXn1fgNNcKGM8cfmL1Z33ZjXRTVxKCoF5M= @@ -1090,6 +1102,8 @@ k8s.io/utils v0.0.0-20230406110748-d93618cff8a2 h1:qY1Ad8PODbnymg2pRbkyMT/ylpTrC k8s.io/utils v0.0.0-20230406110748-d93618cff8a2/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= k8s.io/utils v0.0.0-20230505201702-9f6742963106 h1:EObNQ3TW2D+WptiYXlApGNLVy0zm/JIBVY9i+M4wpAU= k8s.io/utils v0.0.0-20230505201702-9f6742963106/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/utils v0.0.0-20230726121419-3b25d923346b h1:sgn3ZU783SCgtaSJjpcVVlRqd6GSnlTLKgpAAttJvpI= +k8s.io/utils v0.0.0-20230726121419-3b25d923346b/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= @@ -1099,6 +1113,8 @@ sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMm sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E= +sigs.k8s.io/structured-merge-diff/v4 v4.3.0 h1:UZbZAZfX0wV2zr7YZorDz6GXROfDFj6LvqCRm4VUVKk= +sigs.k8s.io/structured-merge-diff/v4 v4.3.0/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.1.0/go.mod 
h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= From 8f301886d269ede97c2b1763e53817b7ac4b196b Mon Sep 17 00:00:00 2001 From: ArchitectBot <61872893+architectbot@users.noreply.github.com> Date: Thu, 14 Sep 2023 20:50:42 +0200 Subject: [PATCH 18/27] Align files (#908) Co-authored-by: github-actions --- .github/workflows/zz_generated.check_values_schema.yaml | 2 +- .github/workflows/zz_generated.create_release.yaml | 2 +- .github/workflows/zz_generated.create_release_pr.yaml | 2 +- .github/workflows/zz_generated.gitleaks.yaml | 2 +- Makefile | 2 +- Makefile.gen.app.mk | 2 +- renovate.json | 5 +++-- 7 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.github/workflows/zz_generated.check_values_schema.yaml b/.github/workflows/zz_generated.check_values_schema.yaml index c22e5de9b..c450aeeaa 100644 --- a/.github/workflows/zz_generated.check_values_schema.yaml +++ b/.github/workflows/zz_generated.check_values_schema.yaml @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.7.0 +# devctl@6.9.0 # name: 'Values and schema' on: diff --git a/.github/workflows/zz_generated.create_release.yaml b/.github/workflows/zz_generated.create_release.yaml index 784c89ec5..57c5dd5ca 100644 --- a/.github/workflows/zz_generated.create_release.yaml +++ b/.github/workflows/zz_generated.create_release.yaml @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.7.0 +# devctl@6.9.0 # name: Create Release on: diff --git a/.github/workflows/zz_generated.create_release_pr.yaml b/.github/workflows/zz_generated.create_release_pr.yaml index 3dbc0db40..6f07166ea 100644 --- a/.github/workflows/zz_generated.create_release_pr.yaml +++ b/.github/workflows/zz_generated.create_release_pr.yaml @@ -1,6 +1,6 @@ # DO NOT EDIT. 
Generated with: # -# devctl@6.7.0 +# devctl@6.9.0 # name: Create Release PR on: diff --git a/.github/workflows/zz_generated.gitleaks.yaml b/.github/workflows/zz_generated.gitleaks.yaml index 860559f96..2c70a482b 100644 --- a/.github/workflows/zz_generated.gitleaks.yaml +++ b/.github/workflows/zz_generated.gitleaks.yaml @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.7.0 +# devctl@6.9.0 # name: gitleaks diff --git a/Makefile b/Makefile index f307dee41..6b6025aaa 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.7.0 +# devctl@6.9.0 # include Makefile.*.mk diff --git a/Makefile.gen.app.mk b/Makefile.gen.app.mk index a3ff3a0b3..0929f9089 100644 --- a/Makefile.gen.app.mk +++ b/Makefile.gen.app.mk @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.7.0 +# devctl@6.9.0 # ##@ App diff --git a/renovate.json b/renovate.json index be326a054..111e73b0a 100644 --- a/renovate.json +++ b/renovate.json @@ -10,9 +10,10 @@ ".github/workflows/pre_commit_*.yaml" ], "ignoreDeps": [ + "actions/setup-go", "architect", - "zricethezav/gitleaks-action", - "actions/setup-go" + "github.com/imdario/mergo", + "zricethezav/gitleaks-action" ], "regexManagers": [ { From 11e48bdbad69e63c7006c55210f3485e38db8dbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Nicol?= Date: Fri, 15 Sep 2023 10:51:11 +0200 Subject: [PATCH 19/27] keep agent alerts for 5min after it's solved (#909) * keep agent alerts for 5min after it's solved --------- Co-authored-by: Herve Nicol <12008875+hervenicol@users.noreply.github.com> --- CHANGELOG.md | 4 ++ .../alerting-rules/prometheus-agent.rules.yml | 39 ++++++++++++------- .../global/prometheus-agent.rules.test.yml | 31 +++++++++++---- 3 files changed, 54 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b76d4adaf..5be1f877f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## 
[Unreleased] +### Changed + +- `PrometheusAgentFailing` and `PrometheusAgentShardsMissing`: keep alerts for 5min after it's solved + ## [2.131.0] - 2023-09-12 ### Changed diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml index caed5dd04..ebeafb6cd 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml @@ -19,7 +19,15 @@ spec: opsrecipe: prometheus-agent-remote-write-failed/ dashboard: promRW001/prometheus-remote-write # expr: count(absent_over_time(up{instance="prometheus-agent"}[10m])) - expr: up{instance="prometheus-agent"} == 0 or absent(up{instance="prometheus-agent"}) == 1 + expr: |- + max_over_time( + sum by (cluster_type, cluster_id, installation, instance, service) + ( + up{instance="prometheus-agent"} == 0 + or + absent(up{instance="prometheus-agent"}) == 1 + )[5m:] + ) for: 10m labels: area: empowerment @@ -38,19 +46,24 @@ spec: summary: Prometheus agent is missing shards. 
opsrecipe: prometheus-agent-missing-shards/ expr: |- - count( - ## number of remotes that are not mimir or grafana-cloud - prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir"} - ) != sum( - ## number of shards defined in the Prometheus CR - prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - or ( - # if there is only 1 shard, there is no shard metric so we use the replicas metric - absent(prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}) - and on(controller, name) - prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + max_over_time(sum( + count( + ## number of remotes that are not mimir or grafana-cloud + prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir"} ) - ) + != + sum( + ## number of shards defined in the Prometheus CR + prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} + or + ( + # if there is only 1 shard, there is no shard metric so we use the replicas metric + absent(prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}) + and on(controller, name) + prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) + ) + )[5m:]) for: 10m labels: area: empowerment diff --git a/test/tests/providers/global/prometheus-agent.rules.test.yml b/test/tests/providers/global/prometheus-agent.rules.test.yml index 7f2336445..7db4b646c 100644 --- a/test/tests/providers/global/prometheus-agent.rules.test.yml +++ b/test/tests/providers/global/prometheus-agent.rules.test.yml @@ -55,18 +55,16 @@ tests: # Tests for `PrometheusAgentShardsMissing` alert - interval: 1m input_series: - - series: 'up{instance="prometheus-agent", cluster_type="workload_cluster", cluster_id="gauss",installation="myinstall", team="atlas"}' - values: "_x60 0+0x60 1+0x60" - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", 
container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x120" + values: "10000+0x180" - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x120" + values: "10000+0x180" - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x120" + values: "10000+0x180" - series: 'prometheus_operator_spec_shards{cluster_id="test01", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' - values: '3+0x60 5+0x60' + values: '3+0x60 5+0x60 3+0x60' - series: 'prometheus_operator_spec_replicas{cluster_id="test01", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' - values: '1+0x120' + values: '1+0x180' alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m @@ -87,3 +85,22 @@ tests: description: "Prometheus agent is missing shards." 
opsrecipe: "prometheus-agent-missing-shards/" summary: "Prometheus agent is missing shards." + - alertname: PrometheusAgentShardsMissing + eval_time: 125m + exp_alerts: + - exp_labels: + area: empowerment + severity: page + team: atlas + topic: observability + inhibit_prometheus_agent_down: "true" + cancel_if_cluster_is_not_running_prometheus_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + exp_annotations: + description: "Prometheus agent is missing shards." + opsrecipe: "prometheus-agent-missing-shards/" + summary: "Prometheus agent is missing shards." + - alertname: PrometheusAgentShardsMissing + eval_time: 130m From 94b8caca76b6f78a1281e84b0f82cc0d2e68ba5b Mon Sep 17 00:00:00 2001 From: Taylor Bot Date: Fri, 15 Sep 2023 09:57:29 +0100 Subject: [PATCH 20/27] Release v2.132.0 (#910) --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5be1f877f..3fda50a2b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [2.132.0] - 2023-09-15 + ### Changed - `PrometheusAgentFailing` and `PrometheusAgentShardsMissing`: keep alerts for 5min after it's solved @@ -2168,7 +2170,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add existing rules from https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2 -[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.131.0...HEAD +[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.132.0...HEAD +[2.132.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.131.0...v2.132.0 [2.131.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.130.0...v2.131.0 [2.130.0]: 
https://github.com/giantswarm/prometheus-rules/compare/v2.129.0...v2.130.0 [2.129.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.128.0...v2.129.0 From 737a5e989ff13a86f868eb825ea0e7d71e8842fd Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 19 Sep 2023 11:04:26 +0200 Subject: [PATCH 21/27] Add missing prometheusagentfailing inhibition (#911) --- CHANGELOG.md | 6 +++++- .../alerting-rules/aws.management-cluster.rules.yml | 2 +- .../templates/alerting-rules/up.all.rules.yml | 3 +++ .../templates/alerting-rules/vault.rules.yml | 1 + test/tests/providers/global/up.all.rules.test.yml | 5 +++++ 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3fda50a2b..b6d93a9ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Add missing prometheus-agent inhibition to `KubeStateMetricsDown` alert +- Change time duration before `ManagementClusterDeploymentMissingAWS` pages because it is dependant on the `PrometheusAgentFailing` alert. + ## [2.132.0] - 2023-09-15 ### Changed @@ -168,7 +173,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [2.115.0] - 2023-07-20 - ### Added - New alert `KubeStateMetricsSlow` that inhibits KSM related alerts. 
diff --git a/helm/prometheus-rules/templates/alerting-rules/aws.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/aws.management-cluster.rules.yml index f25741587..956f72321 100644 --- a/helm/prometheus-rules/templates/alerting-rules/aws.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/aws.management-cluster.rules.yml @@ -149,7 +149,7 @@ spec: description: '{{`Deployment {{ $labels.deployment }} is missing.`}}' opsrecipe: management-cluster-deployment-is-missing/ expr: absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="aws-admission-controller"}) - for: 5m + for: 15m labels: area: kaas cancel_if_prometheus_agent_down: "true" diff --git a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml index 0ab25784b..e2990b11a 100644 --- a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml @@ -25,6 +25,7 @@ spec: cancel_if_kubelet_down: "true" cancel_if_cluster_has_no_workers: "true" cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} + cancel_if_prometheus_agent_down: "true" severity: notify team: honeybadger topic: releng @@ -41,6 +42,7 @@ spec: cancel_if_kubelet_down: "true" cancel_if_cluster_has_no_workers: "true" cancel_if_outside_working_hours: "true" + cancel_if_prometheus_agent_down: "true" severity: page team: atlas topic: observability @@ -73,6 +75,7 @@ spec: inhibit_kube_state_metrics_down: "true" cancel_if_kubelet_down: "true" cancel_if_outside_working_hours: "false" + cancel_if_prometheus_agent_down: "true" severity: page team: atlas topic: observability diff --git a/helm/prometheus-rules/templates/alerting-rules/vault.rules.yml b/helm/prometheus-rules/templates/alerting-rules/vault.rules.yml index 13cd2a260..1707c4360 100644 --- a/helm/prometheus-rules/templates/alerting-rules/vault.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/vault.rules.yml @@ -57,6 +57,7 @@ spec: labels: area: kaas cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} + cancel_if_prometheus_agent_down: "true" severity: page team: {{ include "providerTeam" . 
}} topic: vault diff --git a/test/tests/providers/global/up.all.rules.test.yml b/test/tests/providers/global/up.all.rules.test.yml index 88ed88926..ca4d0fbe2 100644 --- a/test/tests/providers/global/up.all.rules.test.yml +++ b/test/tests/providers/global/up.all.rules.test.yml @@ -57,6 +57,7 @@ tests: cancel_if_kubelet_down: "true" cancel_if_outside_working_hours: "false" inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" severity: "page" team: "atlas" topic: "observability" @@ -79,6 +80,7 @@ tests: cancel_if_kubelet_down: "true" cancel_if_outside_working_hours: "false" inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" severity: "page" team: "atlas" topic: "observability" @@ -107,6 +109,7 @@ tests: cancel_if_kubelet_down: "true" cancel_if_outside_working_hours: "false" inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" severity: "page" team: "atlas" topic: "observability" @@ -160,6 +163,7 @@ tests: cancel_if_kubelet_down: "true" cancel_if_outside_working_hours: "false" inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" severity: "page" team: "atlas" topic: "observability" @@ -182,6 +186,7 @@ tests: cancel_if_kubelet_down: "true" cancel_if_outside_working_hours: "false" inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" severity: "page" team: "atlas" topic: "observability" From 98f70c8f26d1cbc5e7c3169d26a6e660798e3c2f Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 19 Sep 2023 13:59:58 +0200 Subject: [PATCH 22/27] Fix prometheus agent alerts (#913) * Fix Prometheus agent failing alerts Signed-off-by: QuentinBisson * Add outside_working_hours_inhibition back to agent shard alert Signed-off-by: QuentinBisson --------- Signed-off-by: QuentinBisson --- CHANGELOG.md | 6 +++++- .../templates/alerting-rules/prometheus-agent.rules.yml | 1 - test/tests/providers/global/prometheus-agent.rules.test.yml | 2 -- 3 
files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b6d93a9ea..79f9b7dd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Add missing prometheus-agent inhibition to `KubeStateMetricsDown` alert -- Change time duration before `ManagementClusterDeploymentMissingAWS` pages because it is dependant on the `PrometheusAgentFailing` alert. +- Change time duration before `ManagementClusterDeploymentMissingAWS` pages because it is dependant on the `PrometheusAgentFailing` alert. + +### Fixed + +- Remove `cancel_if_outside_working_hours` from PrometheusAgent alerts. ## [2.132.0] - 2023-09-15 diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml index ebeafb6cd..042e2175a 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml @@ -38,7 +38,6 @@ spec: cancel_if_cluster_is_not_running_prometheus_agent: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" ## Page Atlas if prometheus agent is missing shards to send samples to MC prometheus. 
- alert: PrometheusAgentShardsMissing annotations: diff --git a/test/tests/providers/global/prometheus-agent.rules.test.yml b/test/tests/providers/global/prometheus-agent.rules.test.yml index 7db4b646c..7ed7ff3a0 100644 --- a/test/tests/providers/global/prometheus-agent.rules.test.yml +++ b/test/tests/providers/global/prometheus-agent.rules.test.yml @@ -22,7 +22,6 @@ tests: cancel_if_cluster_is_not_running_prometheus_agent: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" exp_annotations: dashboard: "promRW001/prometheus-remote-write" description: "Prometheus agent remote write is failing." @@ -44,7 +43,6 @@ tests: cancel_if_cluster_is_not_running_prometheus_agent: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" exp_annotations: dashboard: "promRW001/prometheus-remote-write" description: "Prometheus agent remote write is failing." 
From 42721e29f55b4b00796d44bb3edb8be94fb5104c Mon Sep 17 00:00:00 2001 From: Taylor Bot Date: Tue, 19 Sep 2023 16:06:06 +0200 Subject: [PATCH 23/27] Release v2.133.0 (#914) --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 79f9b7dd4..79037e0df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [2.133.0] - 2023-09-19 + ### Changed - Add missing prometheus-agent inhibition to `KubeStateMetricsDown` alert @@ -2178,7 +2180,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add existing rules from https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2 -[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.132.0...HEAD +[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.133.0...HEAD +[2.133.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.132.0...v2.133.0 [2.132.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.131.0...v2.132.0 [2.131.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.130.0...v2.131.0 [2.130.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.129.0...v2.130.0 From 47fcf662a233a919271528bece2467cb5ec57058 Mon Sep 17 00:00:00 2001 From: Zirko <64951262+QuantumEnigmaa@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:19:41 +0200 Subject: [PATCH 24/27] split ksm alerts in 2 separate ones (#912) * split ksm alerts in 2 separate ones * move new alert to the adequate file * moved KSMDown alert to the adequate file * fix rules * fix chart * minor fixes * lowered down time to trigger to 20m * changelog --- CHANGELOG.md | 4 ++ .../kube-state-metrics.rules.yml | 49 +++++++++++++++++++ .../templates/alerting-rules/up.all.rules.yml | 33 ------------- ....yml => kube-state-metrics.rules.test.yml} | 2 +- 4 files 
changed, 54 insertions(+), 34 deletions(-) rename test/tests/providers/global/{up.all.rules.test.yml => kube-state-metrics.rules.test.yml} (99%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 79037e0df..66218a9d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Split `KubeStateMetricsDown` alert into 2 alerts : `KubeStateMetricsDown` and `KubeStateMetricsNotRetrievingMetrics` + ## [2.133.0] - 2023-09-19 ### Changed diff --git a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml index 39a2fd571..e635ae988 100644 --- a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml @@ -10,6 +10,34 @@ spec: groups: - name: kube-state-metrics rules: + - alert: KubeStateMetricsDown + annotations: + description: '{{`KubeStateMetrics ({{ $labels.instance }}) is down.`}}' + opsrecipe: kube-state-metrics-down/ + expr: |- + ( + # modern clusters + label_replace(up{app="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",instance=~".*:8080"} == 1) + ) + and + ( + # vintage clusters without servicemonitor + label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1) + ) + for: 15m + labels: + area: kaas + cancel_if_apiserver_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_has_no_workers: "true" + inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "false" + severity: page 
+ team: atlas + topic: observability - alert: KubeStateMetricsSlow annotations: description: '{{`KubeStateMetrics ({{ $labels.instance }}) is too slow.`}}' @@ -28,6 +56,27 @@ spec: severity: page team: atlas topic: observability + - alert: KubeStateMetricsNotRetrievingMetrics + annotations: + description: '{{`KubeStateMetrics ({{ $labels.instance }}) is not retrieving metrics.`}}' + opsrecipe: kube-state-metrics-down/ + expr: |- + # When it looks up but we don't have metrics + count({app="kube-state-metrics"}) < 10 + for: 20m + labels: + area: kaas + cancel_if_apiserver_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_has_no_workers: "true" + inhibit_kube_state_metrics_down: "true" + cancel_if_kubelet_down: "true" + cancel_if_kube_state_metrics_down: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability - alert: KubeConfigMapCreatedMetricMissing annotations: description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' diff --git a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml index e2990b11a..edc29786e 100644 --- a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml @@ -46,36 +46,3 @@ spec: severity: page team: atlas topic: observability - - alert: KubeStateMetricsDown - annotations: - description: '{{`KubeStateMetrics ({{ $labels.instance }}) is down.`}}' - opsrecipe: kube-state-metrics-down/ - expr: |- - ( - # modern clusters - label_replace(up{app="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",instance=~".*:8080"} == 1) - ) - and - ( - # vintage clusters without servicemonitor - label_replace(up{app="kube-state-metrics",container=""}, "ip", 
"$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1) - ) - or - ( - # When it looks up but we don't have metrics - count({app="kube-state-metrics"}) < 10 - ) - for: 15m - labels: - area: kaas - cancel_if_apiserver_down: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_has_no_workers: "true" - inhibit_kube_state_metrics_down: "true" - cancel_if_kubelet_down: "true" - cancel_if_outside_working_hours: "false" - cancel_if_prometheus_agent_down: "true" - severity: page - team: atlas - topic: observability diff --git a/test/tests/providers/global/up.all.rules.test.yml b/test/tests/providers/global/kube-state-metrics.rules.test.yml similarity index 99% rename from test/tests/providers/global/up.all.rules.test.yml rename to test/tests/providers/global/kube-state-metrics.rules.test.yml index ca4d0fbe2..8f5891193 100644 --- a/test/tests/providers/global/up.all.rules.test.yml +++ b/test/tests/providers/global/kube-state-metrics.rules.test.yml @@ -1,6 +1,6 @@ --- rule_files: -- up.all.rules.yml +- kube-state-metrics.rules.yml tests: # KubeStateMetricsDown tests From 302972e851e6257a5e201d4b4b99c7bbeb4155ca Mon Sep 17 00:00:00 2001 From: Taylor Bot Date: Thu, 21 Sep 2023 10:45:44 +0200 Subject: [PATCH 25/27] Release v2.134.0 (#915) --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 66218a9d1..75f90401d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [2.134.0] - 2023-09-21 + ### Changed - Split `KubeStateMetricsDown` alert into 2 alerts : `KubeStateMetricsDown` and `KubeStateMetricsNotRetrievingMetrics` @@ -2184,7 +2186,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add existing rules from 
https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2 -[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.133.0...HEAD +[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.134.0...HEAD +[2.134.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.133.0...v2.134.0 [2.133.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.132.0...v2.133.0 [2.132.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.131.0...v2.132.0 [2.131.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.130.0...v2.131.0 From 0bf0707d3ce9c52951e8d2439f0fe803f5c82283 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 26 Sep 2023 13:57:22 +0200 Subject: [PATCH 26/27] Improve PrometheusAgent inhibition to avoid flapping (#916) --- CHANGELOG.md | 4 ++++ .../alerting-rules/inhibit.prometheus-agent.rules.yml | 10 ++++++---- .../alerting-rules/prometheus-meta-operator.rules.yml | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 75f90401d..af8655f28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- Improve InhibitionClusterIsNotRunningPrometheusAgent to keep paging if the kube-state-metrics metric is missing for 5 minutes (avoid flapping of inhibitions). 
+ ## [2.134.0] - 2023-09-21 ### Changed diff --git a/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml b/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml index cd9fc2056..2fe54e1af 100644 --- a/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml @@ -24,15 +24,17 @@ spec: expr: |- count( label_replace( - kube_namespace_created{namespace!="{{ .Values.managementCluster.name }}-prometheus", namespace=~".+-prometheus"}, - "cluster_id", "$1", "namespace", "(.+)-prometheus" + sum_over_time( + kube_namespace_created{namespace!="{{ .Values.managementCluster.name }}-prometheus", namespace=~".+-prometheus"}[5m] + ), "cluster_id", "$1", "namespace", "(.+)-prometheus" ) ) by (cluster_id) unless count( label_replace( - app_operator_app_info{app="prometheus-agent"}, - "cluster_id", "$1", "namespace", "(.*)" + sum_over_time( + app_operator_app_info{app="prometheus-agent"}[5m] + ), "cluster_id", "$1", "namespace", "(.*)" ) ) by (cluster_id) labels: diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-meta-operator.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-meta-operator.rules.yml index 2d4f73433..446a397c3 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-meta-operator.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-meta-operator.rules.yml @@ -68,7 +68,7 @@ spec: area: "empowerment" cancel_if_mc_kube_state_metrics_down: "false" cancel_if_cluster_status_creating: "true" - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: true installation: {{ .Values.managementCluster.name }} severity: "page" team: "atlas" From fe1411dc750160a41cf6a269dadda84fa9d58774 Mon Sep 17 00:00:00 2001 From: Taylor Bot Date: Tue, 26 Sep 2023 08:00:10 -0400 Subject: [PATCH 27/27] Release v2.134.1 (#917) --- 
CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index af8655f28..6b34ae95b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [2.134.1] - 2023-09-26 + ### Fixed - Improve InhibitionClusterIsNotRunningPrometheusAgent to keep paging if the kube-state-metrics metric is missing for 5 minutes (avoid flapping of inhibitions). @@ -2190,7 +2192,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add existing rules from https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2 -[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.134.0...HEAD +[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.134.1...HEAD +[2.134.1]: https://github.com/giantswarm/prometheus-rules/compare/v2.134.0...v2.134.1 [2.134.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.133.0...v2.134.0 [2.133.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.132.0...v2.133.0 [2.132.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.131.0...v2.132.0