Skip to content

Commit

Permalink
Add InhibitionClusterWithoutWorkerNodes for CAPA
Browse files Browse the repository at this point in the history
  • Loading branch information
hervenicol committed Oct 21, 2024
1 parent ebd1682 commit 08fe221
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 0 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

- Add `InhibitionClusterWithoutWorkerNodes` inhibition alert for CAPA.

### Changed

- Modify `KyvernoWebhookHasNoAvailableReplicas` to check specifically for Kyverno resource webhook.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{{- if eq .Values.managementCluster.provider.kind "capa" }}
# Inhibition rules for CAPA; this whole resource is rendered only when the
# management cluster provider kind is "capa".
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  creationTimestamp: null
  labels:
    {{- include "labels.common" . | nindent 4 }}
    cluster_type: "management_cluster"
  name: capa.inhibitions.rules
  namespace: {{ .Values.namespace }}
spec:
  groups:
  - name: capa.inhibitions
    rules:
    # Fires for every cluster whose control plane is ready but which has no
    # machine pool requesting at least one replica.
    - alert: InhibitionClusterWithoutWorkerNodes
      annotations:
        description: '{{`Cluster ({{ $labels.cluster_id }}) has no worker nodes.`}}'
      # Left-hand side: clusters whose ControlPlaneReady condition is True,
      # with cluster_id overwritten by the value of the "name" label.
      # Right-hand side: cluster_ids owning at least one machine pool whose
      # spec replicas is greater than 0.
      # "unless" keeps only the left-hand clusters with no match on the right,
      # which includes clusters that have no machine pool series at all.
      expr: |-
        label_replace(
          capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
          "cluster_id",
          "$1",
          "name",
          "(.*)"
        ) == 1
        unless on (cluster_id) (
          sum(capi_machinepool_spec_replicas{} > 0) by (cluster_id)
        )
      labels:
        area: kaas
        # NOTE(review): presumably the source label matched by an Alertmanager
        # inhibit rule (e.g. cancel_if_cluster_has_no_workers); confirm there.
        has_worker_nodes: "false"
        team: phoenix
        topic: status
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
---
# promtool unit tests for the CAPA inhibition rules.
rule_files:
  - capa.inhibition.rules.yml

tests:
  # Tests for `InhibitionClusterWithoutWorkerNodes` inhibition alert
  - interval: 1m
    input_series:
      # Control plane of cluster "golem" is ready for the whole test.
      - series: 'capi_cluster_status_condition{cluster_id="golem", cluster_type="management_cluster", name="golem", pipeline="testing", status="True", type="ControlPlaneReady"}'
        values: "1+0x300"
      # Machine pool replicas: no samples for the first hour, then 0 replicas
      # for about an hour, then 3 replicas.
      - series: 'capi_machinepool_spec_replicas{cluster_id="golem", cluster_name="golem", cluster_type="management_cluster", customer="giantswarm", installation="golem", organization="giantswarm", pipeline="testing", provider="capa"}'
        values: "_x60 0x60 3x60"
    alert_rule_test:
      # No machine pool series at all: the cluster has no workers, so the
      # inhibition alert fires.
      - alertname: InhibitionClusterWithoutWorkerNodes
        eval_time: 30m
        exp_alerts:
          - exp_labels:
              area: kaas
              cluster_id: golem
              cluster_type: management_cluster
              has_worker_nodes: "false"
              name: golem
              pipeline: testing
              status: "True"
              team: phoenix
              topic: status
              type: ControlPlaneReady
            exp_annotations:
              description: "Cluster (golem) has no worker nodes."
      # Machine pool exists but requests 0 replicas: the alert still fires.
      - alertname: InhibitionClusterWithoutWorkerNodes
        eval_time: 90m
        exp_alerts:
          - exp_labels:
              area: kaas
              cluster_id: golem
              cluster_type: management_cluster
              has_worker_nodes: "false"
              name: golem
              pipeline: testing
              status: "True"
              team: phoenix
              topic: status
              type: ControlPlaneReady
            exp_annotations:
              description: "Cluster (golem) has no worker nodes."
      # Machine pool requests 3 replicas: the alert must not fire
      # (no exp_alerts means no firing alert is expected).
      - alertname: InhibitionClusterWithoutWorkerNodes
        eval_time: 150m

0 comments on commit 08fe221

Please sign in to comment.