Skip to content

Commit

Permalink
chore(oncall): replace rabbitmq cr with a helmrelease
Browse files Browse the repository at this point in the history
  • Loading branch information
SmaineTF1 committed Nov 1, 2024
1 parent 705ad7e commit 356738a
Show file tree
Hide file tree
Showing 10 changed files with 160 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ spec:
- hubble-${cluster_name}.priv.${domain_name}
- vmalertmanager-${cluster_name}.priv.${domain_name}
- vmagent-${cluster_name}.priv.${domain_name}
- oncall.priv.${domain_name}
- oncall-rabbitmq.priv.${domain_name}
issuerRef:
name: vault
kind: ClusterIssuer
Expand Down
2 changes: 1 addition & 1 deletion infrastructure/base/gapi/platform-private-gateway.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ spec:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: instance
service.beta.kubernetes.io/aws-load-balancer-scheme: "internal"
service.beta.kubernetes.io/aws-load-balancer-type: "external"
external-dns.alpha.kubernetes.io/hostname: "harbor.priv.${domain_name},grafana.priv.${domain_name},vm.priv.${domain_name},headlamp.priv.${domain_name},hubble-${cluster_name}.priv.${domain_name},vmalertmanager-${cluster_name}.priv.${domain_name},vmagent-${cluster_name}.priv.${domain_name}"
external-dns.alpha.kubernetes.io/hostname: "harbor.priv.${domain_name},grafana.priv.${domain_name},vm.priv.${domain_name},headlamp.priv.${domain_name},hubble-${cluster_name}.priv.${domain_name},vmalertmanager-${cluster_name}.priv.${domain_name},vmagent-${cluster_name}.priv.${domain_name},oncall.priv.${domain_name},oncall-rabbitmq.priv.${domain_name}"
listeners:
- name: http
hostname: "*.priv.${domain_name}"
Expand Down
22 changes: 4 additions & 18 deletions observability/base/grafana-oncall/externalsecret-rabbitmq.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,10 @@ spec:
name: clustersecretstore
refreshInterval: 1h
target:
name: oncall-rabbitmq
creationPolicy: Owner
deletionPolicy: Retain
template:
engineVersion: v2
type: Opaque
data: # We can create kubernetes secrets with the desired format.
default_user.conf: |
default_user = "{{ .username }}"
default_pass = "{{ .password }}"
username: "{{ .username }}"
password: "{{ .password }}"
data:
- secretKey: username
remoteRef:
key: observability/grafana/oncall-rabbitmq
property: username
- secretKey: password
remoteRef:
name: oncall-rabbitmq
dataFrom:
- extract:
conversionStrategy: Default
key: observability/grafana/oncall-rabbitmq
property: password
4 changes: 2 additions & 2 deletions observability/base/grafana-oncall/helmrelease-oncall.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ spec:
signingSecretKey: "signing_secret"

ingress:
enabled: false
enabled: true

ingress-nginx:
enabled: false
Expand Down Expand Up @@ -91,7 +91,7 @@ spec:
externalRedis:
host: oncall-valkey-master
port: 6379
username: user
username: default
existingSecret: "oncall-valkey"
passwordKey: password

Expand Down
117 changes: 117 additions & 0 deletions observability/base/grafana-oncall/helmrelease-rabbitmq.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
---
# Flux HelmRelease that deploys the RabbitMQ broker used by Grafana OnCall,
# installed from the Bitnami `rabbitmq` chart.
# Based on https://grafana.com/docs/grafana-oncall/latest/setup/install/helm/install-scalable/
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: oncall-rabbitmq
spec:
  releaseName: oncall-rabbitmq
  driftDetection:
    mode: enabled
  chart:
    spec:
      chart: rabbitmq
      sourceRef:
        kind: HelmRepository
        name: bitnami
        namespace: flux-system
      # Quoted so the chart version is always handled as a string.
      version: "15.0.3"
  interval: 5m0s
  timeout: 15m
  install:
    remediation:
      retries: 3
  values:
    # Credentials and the Erlang cookie are both read from the existing
    # `oncall-rabbitmq` secret rather than being set inline.
    auth:
      username: oncall
      existingPasswordSecret: "oncall-rabbitmq"
      existingSecretPasswordKey: "password"
      existingErlangSecret: "oncall-rabbitmq"
      existingSecretErlangKey: "erlang-cookie-secret"

      # TODO: enable TLS
      # NOTE(review): nested under `auth` on the assumption that the chart
      # exposes this setting as `auth.tls.enabled` - confirm against the
      # chart's values.yaml.
      tls:
        enabled: false

    replicaCount: 1

    resourcesPreset: "nano"

    persistence:
      storageClass: "gp3"
      size: 8Gi

    ingress:
      ## To be replaced with gapi
      enabled: true

    networkPolicy:
      ## To be replaced with cilium network policy
      enabled: true

    metrics:
      enabled: true
      serviceMonitor:
        default:
          enabled: true
        perObject:
          enabled: true
        detailed:
          enabled: true

      # Alerting rules. `{{ "{{ ... }}" }}` is a Go-template string literal, so
      # Helm emits a plain `{{ ... }}` that is left for Prometheus to expand.
      # NOTE(review): assumes the chart renders these rule strings as
      # templates (they reference `common.names.fullname` and `.Values`).
      prometheusRule:
        enabled: true
        namespace: "observability"
        rules:
          - alert: RabbitmqDown
            expr: rabbitmq_up{service="{{ template "common.names.fullname" . }}"} == 0
            for: 5m
            labels:
              severity: error
            annotations:
              summary: Rabbitmq down (instance {{ "{{ $labels.instance }}" }})
              description: RabbitMQ node down
          - alert: ClusterDown
            expr: |
              sum(rabbitmq_running{service="{{ template "common.names.fullname" . }}"})
              < {{ .Values.replicaCount }}
            for: 5m
            labels:
              severity: error
            annotations:
              summary: Cluster down (instance {{ "{{ $labels.instance }}" }})
              description: |
                Less than {{ .Values.replicaCount }} nodes running in RabbitMQ cluster
                VALUE = {{ "{{ $value }}" }}
          - alert: ClusterPartition
            expr: rabbitmq_partitions{service="{{ template "common.names.fullname" . }}"} > 0
            for: 5m
            labels:
              severity: error
            annotations:
              summary: Cluster partition (instance {{ "{{ $labels.instance }}" }})
              description: |
                Cluster partition
                VALUE = {{ "{{ $value }}" }}
          - alert: OutOfMemory
            expr: |
              rabbitmq_node_mem_used{service="{{ template "common.names.fullname" . }}"}
              / rabbitmq_node_mem_limit{service="{{ template "common.names.fullname" . }}"}
              * 100 > 90
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: Out of memory (instance {{ "{{ $labels.instance }}" }})
              # Real line breaks are used here: inside a `|` literal block a
              # `\n` sequence is kept as two literal characters.
              description: |
                Memory available for RabbitMQ is low (< 10%)
                VALUE = {{ "{{ $value }}" }}
                LABELS: {{ "{{ $labels }}" }}
          - alert: TooManyConnections
            expr: rabbitmq_connectionsTotal{service="{{ template "common.names.fullname" . }}"} > 1000
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: Too many connections (instance {{ "{{ $labels.instance }}" }})
              # Real line breaks are used here: inside a `|` literal block a
              # `\n` sequence is kept as two literal characters.
              description: |
                RabbitMQ instance has too many connections (> 1000)
                VALUE = {{ "{{ $value }}" }}
                LABELS: {{ "{{ $labels }}" }}
15 changes: 15 additions & 0 deletions observability/base/grafana-oncall/httproute-oncall.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
# HTTPRoute for Grafana OnCall: attaches to the `platform-private` Gateway in
# the `infrastructure` namespace and forwards requests for the OnCall
# hostname to the `oncall-engine` backend on port 8080.
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: oncall
  namespace: observability
spec:
  # The Gateway lives in a different namespace than this route.
  parentRefs:
    - name: platform-private
      namespace: infrastructure
  # `${domain_name}` is a placeholder substituted before the manifest is applied.
  hostnames:
    - "oncall.priv.${domain_name}"
  # Single rule with no match conditions and one backend.
  rules:
    - backendRefs:
        - name: oncall-engine
          port: 8080
15 changes: 15 additions & 0 deletions observability/base/grafana-oncall/httproute-rabbitmq.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
# HTTPRoute for the OnCall RabbitMQ instance: attaches to the
# `platform-private` Gateway in the `infrastructure` namespace and forwards
# requests for the RabbitMQ hostname to the `oncall-rabbitmq` backend on
# port 15672.
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: oncall-rabbitmq
  namespace: observability
spec:
  # The Gateway lives in a different namespace than this route.
  parentRefs:
    - name: platform-private
      namespace: infrastructure
  # `${domain_name}` is a placeholder substituted before the manifest is applied.
  hostnames:
    - "oncall-rabbitmq.priv.${domain_name}"
  # Single rule with no match conditions and one backend.
  # NOTE(review): 15672 is presumably the RabbitMQ management port - confirm
  # against the Service created by the chart.
  rules:
    - backendRefs:
        - name: oncall-rabbitmq
          port: 15672
4 changes: 3 additions & 1 deletion observability/base/grafana-oncall/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ resources:
- externalsecret-sqlinstance-password.yaml
- externalsecret-valkey.yaml
- helmrelease-oncall.yaml
- helmrelease-rabbitmq.yaml
- helmrelease-valkey.yaml
- rabbitmq.yaml
- httproute-oncall.yaml
- httproute-rabbitmq.yaml
- sqlinstance.yaml
9 changes: 0 additions & 9 deletions observability/base/grafana-oncall/rabbitmq.yaml

This file was deleted.

2 changes: 1 addition & 1 deletion security/base/cert-manager/vault-clusterissuer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ spec:
auth:
appRole:
path: approle
roleId: 0da9ee27-c2ea-ec82-51ea-f1f1e06597d2 # !! This value changes each time I recreate the whole platform
roleId: 8954a68e-3308-619f-44be-e6d3d282c754 # !! This value changes each time I recreate the whole platform
secretRef:
name: cert-manager-vault-approle
key: secret_id

0 comments on commit 356738a

Please sign in to comment.