Remove kvm provider (#1058)
QuentinBisson authored Mar 8, 2024
1 parent a8be2b1 commit deb38ee
Showing 18 changed files with 18 additions and 349 deletions.
9 changes: 5 additions & 4 deletions CHANGELOG.md
@@ -18,10 +18,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Removed

-- Remove Azure provider.
-- Remove Tiller related alerts.
-- Remove GCP alerts and configs.
-- Remove Openstack alerts and configs.
+- Remove `kvm` provider alerts.
+- Remove `azure` provider alerts.
+- Remove `tiller` alerts.
+- Remove `gcp` provider alerts.
+- Remove `openstack` provider alerts.

## [2.153.1] - 2024-02-28

2 changes: 1 addition & 1 deletion helm/prometheus-rules/templates/_helpers.tpl
@@ -29,7 +29,7 @@ giantswarm.io/service-type: {{ .Values.serviceType }}
{{- end -}}

{{- define "providerTeam" -}}
-{{- if has .Values.managementCluster.provider.kind (list "kvm" "cloud-director" "vsphere") -}}
+{{- if has .Values.managementCluster.provider.kind (list "cloud-director" "vsphere") -}}
rocket
{{- else if has .Values.managementCluster.provider.kind (list "capa" "capz") -}}
{{- /* hydra alerts merged into phoenix business hours on-call */ -}}
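With `kvm` gone from the list, the helper routes only `cloud-director` and `vsphere` management clusters to the rocket team. A minimal sketch of how the helper is consumed (the `team:` labels in the rule templates further down in this commit use this exact include), assuming a hypothetical values file:

# Hypothetical values.yaml excerpt:
managementCluster:
  provider:
    kind: vsphere        # "cloud-director" also resolves to rocket; "kvm" no longer does

# Consumed in a rule template as:
#   labels:
#     team: {{ include "providerTeam" . }}    # renders "rocket" for the values above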
32 changes: 0 additions & 32 deletions helm/prometheus-rules/templates/alerting-rules/calico.rules.yml
@@ -37,35 +37,3 @@ spec:
severity: notify
team: {{ include "providerTeam" . }}
topic: kubernetes
{{- if eq .Values.managementCluster.provider.kind "kvm" }}
- alert: CalicoNodeFailingToSaveIptables
annotations:
description: '{{`calico-node {{ $labels.namespace}}/{{ $labels.pod_name }} on {{ $labels.cluster_id}}/{{ $labels.cluster }} is failing to save iptables rules.`}}'
opsrecipe: calico-iptables-failing/
expr: increase(felix_iptables_save_errors[10m]) > 0
for: 10m
labels:
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
severity: notify
team: {{ include "providerTeam" . }}
topic: kubernetes
- alert: CalicoNodeFailingToRestoreIptables
annotations:
description: '{{`calico-node {{ $labels.namespace}}/{{ $labels.pod_name }} on {{ $labels.cluster_id}}/{{ $labels.cluster }} is failing to restore iptables rules.`}}'
opsrecipe: calico-iptables-failing/
expr: increase(felix_iptables_restore_errors[10m]) > 0
for: 10m
labels:
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
severity: notify
team: {{ include "providerTeam" . }}
topic: kubernetes
{{- end }}
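Both deleted alerts shared one pattern: `increase(<felix counter>[10m]) > 0` combined with `for: 10m`, so a single transient iptables error never notified; errors had to recur over a sustained window. The shape, as a standalone sketch (hypothetical rule, not part of the chart):

# Sustained-error pattern used by the removed calico alerts:
- alert: CalicoIptablesErrorsSketch
  expr: increase(felix_iptables_save_errors[10m]) > 0   # any new save errors in the last 10m
  for: 10m                                              # condition must also hold for 10m before firing
  labels:
    severity: notify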
@@ -13,18 +13,6 @@ spec:
groups:
- name: certificate.management-cluster
rules:
- alert: ManagementClusterKVMCertificateWillExpireInLessThanOneMonth
annotations:
description: '{{`Certificate {{ $labels.path }} on {{ $labels.node }} will expire in less than a month.`}}'
opsrecipe: renew-certificates/
expr: (cert_exporter_not_after{cluster_type="management_cluster", provider="kvm", path!="/etc/kubernetes/ssl/service-account-crt.pem"} - time()) < 4 * 7 * 24 * 60 * 60
for: 5m
labels:
area: kaas
cancel_if_outside_working_hours: "true"
severity: page
team: rocket
topic: security
- alert: ManagementClusterCertificateIsMissing
annotations:
description: '{{`Cannot renew Certificate for Secret {{ $labels.namespace }}/{{ $labels.certificatename }} because it is missing.`}}'
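For reference, `cert_exporter_not_after` exposes a certificate's expiry as a Unix timestamp, so subtracting `time()` yields seconds remaining, and `4 * 7 * 24 * 60 * 60` is four weeks (2,419,200 seconds). The removed alert's pattern as a sketch, with a hypothetical path label:

# Expiry-window pattern from the removed KVM certificate alert:
- alert: CertificateExpiresSoonSketch
  expr: (cert_exporter_not_after{path="/etc/kubernetes/ssl/example-crt.pem"} - time()) < 4 * 7 * 24 * 60 * 60
  for: 5m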
@@ -150,32 +150,6 @@ spec:
team: phoenix
topic: managementcluster
{{- end }}
{{- if eq .Values.managementCluster.provider.kind "kvm" }}
- alert: KVMManagementClusterDeploymentScaledDownToZero
annotations:
description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} on KVM has been scaled down to zero for a prolonged period of time.`}}'
expr: kube_deployment_status_replicas_available{cluster_type="management_cluster", deployment=~"([a-z]*)-operator([a-z,-]*)",provider="kvm"} + kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"([a-z]*)-operator([a-z,-]*)",provider="kvm"} == 0
for: 4h
labels:
area: kaas
severity: notify
team: rocket
topic: managementcluster
- alert: DeploymentNotSatisfiedRocket
annotations:
description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
opsrecipe: deployment-not-satisfied/
expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"worker-.+|master-.+"} > 0
for: 30m
labels:
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
severity: page
team: rocket
topic: managementcluster
{{- end }}
- alert: DeploymentNotSatisfiedCabbage
annotations:
description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
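The removed scaled-to-zero alert leaned on a kube-state-metrics identity: available plus unavailable replicas roughly covers every replica the deployment desires, so the sum is 0 only when the deployment is scaled to nothing. The pattern as a sketch against a hypothetical deployment:

# Scaled-to-zero pattern from the removed KVM alert:
- alert: DeploymentScaledDownToZeroSketch
  expr: |
    kube_deployment_status_replicas_available{deployment="example-operator"}
    + kube_deployment_status_replicas_unavailable{deployment="example-operator"}
    == 0
  for: 4h    # tolerate short scale-downs; only prolonged ones notify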
@@ -58,21 +58,6 @@ spec:
team: {{ include "providerTeam" . }}
topic: etcd-backup
{{- end }}
- alert: ManagementClusterNotBackedUp24h
annotations:
description: '{{`{{ $labels.cluster_id }} management cluster''s ETCD backup was unsuccessful.`}}'
opsrecipe: etcd-backup-failed/
expr: time() - etcd_backup_latest_success{etcd_version="V2",tenant_cluster_id="Control Plane", provider="kvm"} > 60*60*24 or time() - etcd_backup_latest_success{etcd_version="V3",tenant_cluster_id="Control Plane"} > 60*60*24
for: 5m
labels:
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: {{ include "providerTeam" . }}
topic: etcd-backup
- alert: ETCDBackupMetricsMissing
annotations:
description: '{{`ETCD backup metrics are missing`}}'
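The removed branch used a standard staleness check, with the V2/KVM and V3 cases joined by `or`: a backup counts as failed once its last-success timestamp is older than 24 hours (`60*60*24` = 86,400 seconds). The core pattern, with a hypothetical metric selector:

# Staleness pattern from the removed ManagementClusterNotBackedUp24h alert:
- alert: BackupTooOldSketch
  expr: time() - etcd_backup_latest_success{etcd_version="V3"} > 60*60*24   # last success more than 24h ago
  for: 5m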
@@ -59,17 +59,6 @@ spec:
cluster_status_deleting: "true"
team: phoenix
topic: status
{{- if eq .Values.managementCluster.provider.kind "kvm" }}
- alert: InhibitionKVMClusterHasNoRunningCPNodes
annotations:
description: '{{`KVM Cluster {{ $labels.cluster_id }} has nodes down.`}}'
expr: label_replace(up{app=~"master|worker", cluster_type="management_cluster"} == 0, "cluster_id", "$1", "namespace", "(.*)")
labels:
area: kaas
nodes_down: "true"
team: phoenix
topic: status
{{- end }}
{{- if eq .Values.managementCluster.provider.kind "aws" }}
- alert: InhibitionClusterWithNoNodePools
annotations:
@@ -1,4 +1,4 @@
-{{- if and (ne .Values.managementCluster.provider.kind "kvm") ( eq .Values.managementCluster.provider.flavor "vintage") }}
+{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:

This file was deleted.

This file was deleted.

@@ -62,7 +62,7 @@ spec:
annotations:
description: '{{`CR version {{ $labels.version }} in cluster {{ $labels.cluster_id }} is reconciled by multiple apps including {{ $labels.app }}.`}}'
opsrecipe: multiple-operators-running-same-version/
-expr: sum(label_replace(giantswarm_build_info{app=~"kvm-operator|ignition-operator|cert-operator|node-operator"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, version) > 1
+expr: sum(label_replace(giantswarm_build_info{app=~"ignition-operator|cert-operator|node-operator"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, version) > 1
for: 5m
labels:
area: kaas
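For context, the expression copies each operator's `reconciled_version` label into `version` via `label_replace`, then sums the matching `giantswarm_build_info` series per `(app, cluster_id, version)` group; a result above 1 means more than one build-info series shares the same reconciled version. A trimmed sketch of the expression this commit keeps, minus `kvm-operator`:

# Duplicate-reconciler pattern (sketch):
- alert: MultipleOperatorsSameVersionSketch
  expr: |
    sum(
      label_replace(giantswarm_build_info{app=~"ignition-operator|cert-operator|node-operator"},
                    "version", "$1", "reconciled_version", "(.+)")
    ) by (app, cluster_id, version) > 1
  for: 5m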
@@ -14,7 +14,7 @@ spec:
annotations:
description: '{{`NodeExporter Collector {{ $labels.collector }} on {{ $labels.instance }} is failed.`}}'
opsrecipe: node-exporter-device-error/
-expr: node_scrape_collector_success{collector!~"conntrack|bonding|hwmon|powersupplyclass|mdadm|nfs|nfsd|tapestats|fibrechannel|nvme{{ if eq .Values.managementCluster.provider.kind "kvm" }}|pressure{{ end }}"} == 0
+expr: node_scrape_collector_success{collector!~"conntrack|bonding|hwmon|powersupplyclass|mdadm|nfs|nfsd|tapestats|fibrechannel|nvme"} == 0
for: 5m
labels:
area: kaas
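One behavioral note worth flagging: the old template excluded the `pressure` collector from this alert only on KVM, so after this change a failing `pressure` collector matches the alert on every provider. The matcher now renders to a fixed negative regex; a sketch of the rendered rule (alert name hypothetical, since the hunk does not show it):

# Rendered form of the collector alert after templating:
- alert: NodeExporterCollectorFailedSketch
  expr: node_scrape_collector_success{collector!~"conntrack|bonding|hwmon|powersupplyclass|mdadm|nfs|nfsd|tapestats|fibrechannel|nvme"} == 0
  for: 5m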