From 356738a9462d4d627e88ac0bea47d1a16337d047 Mon Sep 17 00:00:00 2001
From: Smaine Kahlouch
Date: Fri, 1 Nov 2024 17:21:45 +0100
Subject: [PATCH] chore(oncall): replace rabbitmq cr with a helmrelease

---
Review notes (text between the three-dash line and the first diff is not
applied by `git am` / `git apply`):
  * platform-private-gateway.yaml: the external-dns hostname annotation listed
    oncall-rabbitmq.priv.${domain_name} twice; the duplicate entry is dropped.
  * helmrelease-rabbitmq.yaml: fixed the "RabbmitMQ" typo in the OutOfMemory
    alert description.
  * NOTE(review): both HelmReleases set `ingress.enabled: true` while this
    patch also adds Gateway API HTTPRoutes for the same services -- confirm
    that both exposure paths are intended.
  * NOTE(review): `\n` inside the `|` block scalars of the OutOfMemory and
    TooManyConnections descriptions is a literal backslash-n, not a newline --
    confirm this is the desired rendering.
  * NOTE(review): the AppRole roleId is hard-coded and, per its own comment,
    changes on every platform rebuild -- consider sourcing it from a variable.

 .../platform-private-gateway-certificate.yaml |   2 +
 .../base/gapi/platform-private-gateway.yaml   |   2 +-
 .../externalsecret-rabbitmq.yaml              |  22 +---
 .../grafana-oncall/helmrelease-oncall.yaml    |   4 +-
 .../grafana-oncall/helmrelease-rabbitmq.yaml  | 117 ++++++++++++++++++
 .../base/grafana-oncall/httproute-oncall.yaml |  15 +++
 .../grafana-oncall/httproute-rabbitmq.yaml    |  15 +++
 .../base/grafana-oncall/kustomization.yaml    |   4 +-
 .../base/grafana-oncall/rabbitmq.yaml         |   9 --
 .../cert-manager/vault-clusterissuer.yaml     |   2 +-
 10 files changed, 160 insertions(+), 32 deletions(-)
 create mode 100644 observability/base/grafana-oncall/helmrelease-rabbitmq.yaml
 create mode 100644 observability/base/grafana-oncall/httproute-oncall.yaml
 create mode 100644 observability/base/grafana-oncall/httproute-rabbitmq.yaml
 delete mode 100644 observability/base/grafana-oncall/rabbitmq.yaml

diff --git a/infrastructure/base/gapi/platform-private-gateway-certificate.yaml b/infrastructure/base/gapi/platform-private-gateway-certificate.yaml
index 5940fe04..0cc8d4de 100644
--- a/infrastructure/base/gapi/platform-private-gateway-certificate.yaml
+++ b/infrastructure/base/gapi/platform-private-gateway-certificate.yaml
@@ -15,6 +15,8 @@ spec:
     - hubble-${cluster_name}.priv.${domain_name}
     - vmalertmanager-${cluster_name}.priv.${domain_name}
     - vmagent-${cluster_name}.priv.${domain_name}
+    - oncall.priv.${domain_name}
+    - oncall-rabbitmq.priv.${domain_name}
   issuerRef:
     name: vault
     kind: ClusterIssuer
diff --git a/infrastructure/base/gapi/platform-private-gateway.yaml b/infrastructure/base/gapi/platform-private-gateway.yaml
index 159ace78..3ac51e66 100644
--- a/infrastructure/base/gapi/platform-private-gateway.yaml
+++ b/infrastructure/base/gapi/platform-private-gateway.yaml
@@ -10,7 +10,7 @@ spec:
       service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: instance
       service.beta.kubernetes.io/aws-load-balancer-scheme: "internal"
       service.beta.kubernetes.io/aws-load-balancer-type: "external"
-      external-dns.alpha.kubernetes.io/hostname: "harbor.priv.${domain_name},grafana.priv.${domain_name},vm.priv.${domain_name},headlamp.priv.${domain_name},hubble-${cluster_name}.priv.${domain_name},vmalertmanager-${cluster_name}.priv.${domain_name},vmagent-${cluster_name}.priv.${domain_name}"
+      external-dns.alpha.kubernetes.io/hostname: "harbor.priv.${domain_name},grafana.priv.${domain_name},vm.priv.${domain_name},headlamp.priv.${domain_name},hubble-${cluster_name}.priv.${domain_name},vmalertmanager-${cluster_name}.priv.${domain_name},vmagent-${cluster_name}.priv.${domain_name},oncall.priv.${domain_name},oncall-rabbitmq.priv.${domain_name}"
   listeners:
     - name: http
       hostname: "*.priv.${domain_name}"
diff --git a/observability/base/grafana-oncall/externalsecret-rabbitmq.yaml b/observability/base/grafana-oncall/externalsecret-rabbitmq.yaml
index f75efb58..817eb6f2 100644
--- a/observability/base/grafana-oncall/externalsecret-rabbitmq.yaml
+++ b/observability/base/grafana-oncall/externalsecret-rabbitmq.yaml
@@ -8,24 +8,10 @@ spec:
     name: clustersecretstore
   refreshInterval: 1h
   target:
-    name: oncall-rabbitmq
     creationPolicy: Owner
     deletionPolicy: Retain
-    template:
-      engineVersion: v2
-      type: Opaque
-      data: # We can create kubernetes secrets with the desired format.
-        default_user.conf: |
-          default_user = "{{ .username }}"
-          default_pass = "{{ .password }}"
-        username: "{{ .username }}"
-        password: "{{ .password }}"
-  data:
-    - secretKey: username
-      remoteRef:
-        key: observability/grafana/oncall-rabbitmq
-        property: username
-    - secretKey: password
-      remoteRef:
+    name: oncall-rabbitmq
+  dataFrom:
+    - extract:
+        conversionStrategy: Default
         key: observability/grafana/oncall-rabbitmq
-        property: password
diff --git a/observability/base/grafana-oncall/helmrelease-oncall.yaml b/observability/base/grafana-oncall/helmrelease-oncall.yaml
index 625fc162..cb18cc8f 100644
--- a/observability/base/grafana-oncall/helmrelease-oncall.yaml
+++ b/observability/base/grafana-oncall/helmrelease-oncall.yaml
@@ -51,7 +51,7 @@ spec:
         signingSecretKey: "signing_secret"
 
     ingress:
-      enabled: false
+      enabled: true
 
     ingress-nginx:
       enabled: false
@@ -91,7 +91,7 @@ spec:
     externalRedis:
       host: oncall-valkey-master
       port: 6379
-      username: user
+      username: default
       existingSecret: "oncall-valkey"
       passwordKey: password
 
diff --git a/observability/base/grafana-oncall/helmrelease-rabbitmq.yaml b/observability/base/grafana-oncall/helmrelease-rabbitmq.yaml
new file mode 100644
index 00000000..72d2b0b5
--- /dev/null
+++ b/observability/base/grafana-oncall/helmrelease-rabbitmq.yaml
@@ -0,0 +1,117 @@
+# Based on https://grafana.com/docs/grafana-oncall/latest/setup/install/helm/install-scalable/
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: oncall-rabbitmq
+spec:
+  releaseName: oncall-rabbitmq
+  driftDetection:
+    mode: enabled
+  chart:
+    spec:
+      chart: rabbitmq
+      sourceRef:
+        kind: HelmRepository
+        name: bitnami
+        namespace: flux-system
+      version: "15.0.3"
+  interval: 5m0s
+  timeout: 15m
+  install:
+    remediation:
+      retries: 3
+  values:
+    auth:
+      username: oncall
+      existingPasswordSecret: "oncall-rabbitmq"
+      existingSecretPasswordKey: "password"
+      existingErlangSecret: "oncall-rabbitmq"
+      existingSecretErlangKey: "erlang-cookie-secret"
+
+    # Todo: enable TLS
+    tls:
+      enabled: false
+
+    replicaCount: 1
+
+    resourcesPreset: "nano"
+
+    persistence:
+      storageClass: "gp3"
+      size: 8Gi
+
+    ingress:
+      ## To be replaced with gapi
+      enabled: true
+
+    networkPolicy:
+      ## To be replaced with cilium network policy
+      enabled: true
+
+    metrics:
+      enabled: true
+      serviceMonitor:
+        default:
+          enabled: true
+        perObject:
+          enabled: true
+        detailed:
+          enabled: true
+
+      prometheusRule:
+        enabled: true
+        namespace: "observability"
+        rules:
+          - alert: RabbitmqDown
+            expr: rabbitmq_up{service="{{ template "common.names.fullname" . }}"} == 0
+            for: 5m
+            labels:
+              severity: error
+            annotations:
+              summary: Rabbitmq down (instance {{ "{{ $labels.instance }}" }})
+              description: RabbitMQ node down
+          - alert: ClusterDown
+            expr: |
+              sum(rabbitmq_running{service="{{ template "common.names.fullname" . }}"})
+              < {{ .Values.replicaCount }}
+            for: 5m
+            labels:
+              severity: error
+            annotations:
+              summary: Cluster down (instance {{ "{{ $labels.instance }}" }})
+              description: |
+                Less than {{ .Values.replicaCount }} nodes running in RabbitMQ cluster
+                VALUE = {{ "{{ $value }}" }}
+          - alert: ClusterPartition
+            expr: rabbitmq_partitions{service="{{ template "common.names.fullname" . }}"} > 0
+            for: 5m
+            labels:
+              severity: error
+            annotations:
+              summary: Cluster partition (instance {{ "{{ $labels.instance }}" }})
+              description: |
+                Cluster partition
+                VALUE = {{ "{{ $value }}" }}
+          - alert: OutOfMemory
+            expr: |
+              rabbitmq_node_mem_used{service="{{ template "common.names.fullname" . }}"}
+              / rabbitmq_node_mem_limit{service="{{ template "common.names.fullname" . }}"}
+              * 100 > 90
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: Out of memory (instance {{ "{{ $labels.instance }}" }})
+              description: |
+                Memory available for RabbitMQ is low (< 10%)\n VALUE = {{ "{{ $value }}" }}
+                LABELS: {{ "{{ $labels }}" }}
+          - alert: TooManyConnections
+            expr: rabbitmq_connectionsTotal{service="{{ template "common.names.fullname" . }}"} > 1000
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: Too many connections (instance {{ "{{ $labels.instance }}" }})
+              description: |
+                RabbitMQ instance has too many connections (> 1000)
+                VALUE = {{ "{{ $value }}" }}\n LABELS: {{ "{{ $labels }}" }}
diff --git a/observability/base/grafana-oncall/httproute-oncall.yaml b/observability/base/grafana-oncall/httproute-oncall.yaml
new file mode 100644
index 00000000..1db70941
--- /dev/null
+++ b/observability/base/grafana-oncall/httproute-oncall.yaml
@@ -0,0 +1,15 @@
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: oncall
+  namespace: observability
+spec:
+  parentRefs:
+    - name: platform-private
+      namespace: infrastructure
+  hostnames:
+    - "oncall.priv.${domain_name}"
+  rules:
+    - backendRefs:
+        - name: oncall-engine
+          port: 8080
diff --git a/observability/base/grafana-oncall/httproute-rabbitmq.yaml b/observability/base/grafana-oncall/httproute-rabbitmq.yaml
new file mode 100644
index 00000000..1e263ca8
--- /dev/null
+++ b/observability/base/grafana-oncall/httproute-rabbitmq.yaml
@@ -0,0 +1,15 @@
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: oncall-rabbitmq
+  namespace: observability
+spec:
+  parentRefs:
+    - name: platform-private
+      namespace: infrastructure
+  hostnames:
+    - "oncall-rabbitmq.priv.${domain_name}"
+  rules:
+    - backendRefs:
+        - name: oncall-rabbitmq
+          port: 15672
diff --git a/observability/base/grafana-oncall/kustomization.yaml b/observability/base/grafana-oncall/kustomization.yaml
index b68177d7..c637c729 100644
--- a/observability/base/grafana-oncall/kustomization.yaml
+++ b/observability/base/grafana-oncall/kustomization.yaml
@@ -9,6 +9,8 @@ resources:
   - externalsecret-sqlinstance-password.yaml
   - externalsecret-valkey.yaml
   - helmrelease-oncall.yaml
+  - helmrelease-rabbitmq.yaml
   - helmrelease-valkey.yaml
-  - rabbitmq.yaml
+  - httproute-oncall.yaml
+  - httproute-rabbitmq.yaml
   - sqlinstance.yaml
diff --git a/observability/base/grafana-oncall/rabbitmq.yaml b/observability/base/grafana-oncall/rabbitmq.yaml
deleted file mode 100644
index 76c505e0..00000000
--- a/observability/base/grafana-oncall/rabbitmq.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-apiVersion: rabbitmq.com/v1beta1
-kind: RabbitmqCluster
-metadata:
-  name: oncall-rabbitmq
-spec:
-  replicas: 1
-  secretBackend:
-    externalSecret:
-      name: "oncall-rabbitmq"
diff --git a/security/base/cert-manager/vault-clusterissuer.yaml b/security/base/cert-manager/vault-clusterissuer.yaml
index 82bdd775..24dd2531 100644
--- a/security/base/cert-manager/vault-clusterissuer.yaml
+++ b/security/base/cert-manager/vault-clusterissuer.yaml
@@ -11,7 +11,7 @@ spec:
     auth:
       appRole:
         path: approle
-        roleId: 0da9ee27-c2ea-ec82-51ea-f1f1e06597d2 # !! This value changes each time I recreate the whole platform
+        roleId: 8954a68e-3308-619f-44be-e6d3d282c754 # !! This value changes each time I recreate the whole platform
         secretRef:
           name: cert-manager-vault-approle
           key: secret_id