Skip to content

Commit

Permalink
merge with main
Browse files Browse the repository at this point in the history
  • Loading branch information
mferracc committed Sep 23, 2024
2 parents 6884bac + b138d44 commit 03ae17b
Show file tree
Hide file tree
Showing 15 changed files with 2,399 additions and 22 deletions.
5 changes: 4 additions & 1 deletion .env_template
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
POSTGRES_USER=''
POSTGRES_PASSWORD=''
POSTGRES_DB=''
SERV_IP=''
SERV_IP=''
OAUTH_CLIENT_UID=''
OAUTH_CLIENT_SECRET=''
DOCKSOCKUID=$(id -u)
4 changes: 4 additions & 0 deletions .github/workflows/docker-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,12 @@ jobs:
echo POSTGRES_PASSWORD="${{ secrets.POSTGRES_PASSWORD }}";
echo POSTGRES_DB="${{ secrets.POSTGRES_DB }}";
echo PGDATA="$PWD/db/data";
echo DOCKSOCKUID="$(id -u)";
} >> .env
- name: Create discord webhook file for alertmanager
run: echo "${{ secrets.DISCORD_WEBHOOK }}" > ./monitoring/alertmanager/very-secret-webhook-url

- name: Install mkcert
run: |
sudo apt-get install -y libnss3-tools
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -212,3 +212,5 @@ ssl
**/ssl
*.crt
*.key

very-secret-webhook-url
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ BROWSER = firefox

SHELL = bash

CONTAINERS = back_auth back_user back_game back_aipi back_friends front db prometheus grafana node_exporter blackbox_exporter redis minio
CONTAINERS = back_auth back_user back_game back_aipi front db prometheus grafana alertmanager cadvisor node_exporter blackbox_exporter redis back_friends minio

COMPOSE_PATH = docker-compose.yml

Expand Down
2 changes: 1 addition & 1 deletion back/transcendence_django/transcendence_django/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
OAUTH_REDIRECT_URI = f"https://{SERV_IP}:4200/oauth-callback"
OAUTH_TOKEN_URL = "https://api.intra.42.fr/oauth/token"

ALLOWED_HOSTS = [SERV_IP, "0.0.0.0", "minio", "back-aipi", "back-user"]
ALLOWED_HOSTS = [SERV_IP, "0.0.0.0", "back-auth", "back-game", "back-aipi", "back-user", "minio", "back-friends"]

# Application definition

Expand Down
1 change: 1 addition & 0 deletions back/transcendence_django/transcendence_django/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,5 @@
path("game/", include("back_game.app_settings.urls")),
path("aipi/", include("back_aipi.urls")),
path("friends/", include("back_friends.urls")),
path("health/", include("health_check.urls")),
]
73 changes: 70 additions & 3 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ services:
ports:
- "8000:443"
networks:
- transcendence
transcendence:
aliases:
- back-auth
restart: unless-stopped
env_file: .env

Expand Down Expand Up @@ -163,13 +165,16 @@ services:
volumes:
- prometheus-data:/prometheus
- ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
- ./monitoring/alertmanager/alert.rules/alerts.rules.yml:/alertmanager/alert.rules/alerts.rules.yml
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=365d'
ports:
- "9090:9090"
depends_on:
alertmanager:
condition: service_healthy
blackbox:
condition: service_healthy
node_exporter:
Expand All @@ -188,6 +193,7 @@ services:
container_name: grafana
environment:
- GF_PATHS_CONFIG=/etc/grafana/grafana.ini
- DS_PROMETHEUS=prometheus
user: root
deploy:
replicas: 1
Expand All @@ -211,8 +217,63 @@ services:
networks:
- transcendence

cadvisor:
  # cAdvisor service definition: restricted to Docker containers
  # (--docker_only) with a 5s housekeeping interval, published on 8080.
  container_name: cadvisor
  # NOTE(review): `latest` is a floating tag — consider pinning a release.
  image: 'gcr.io/cadvisor/cadvisor:latest'
  restart: unless-stopped
  user: root
  group_add:
    # presumably the host GID that owns the Docker socket — TODO confirm
    - 999
  # NOTE(review): privileged mode is combined with explicit capabilities and
  # an unconfined seccomp profile below; check whether all three are needed.
  privileged: true
  cap_add:
    - SYSLOG
    - SYS_ADMIN
  security_opt:
    - 'seccomp:unconfined'
  devices:
    - '/dev/kmsg:/dev/kmsg'
  command:
    # - --logtostderr --v=4
    - "--housekeeping_interval=5s"
    - "--docker_only"
  volumes:
    # DOCKSOCKUID is mandatory: interpolation aborts with the message below
    # when the variable is unset or empty.
    - '/run/user/${DOCKSOCKUID:?Please export the id at which the docker sock can be accessed}:/var/run:ro'
    - '/sys:/sys:ro'
    - '/var/lib/docker/:/var/lib/docker:ro'
    - '/etc/machine-id:/etc/machine-id:ro'
    - '/:/rootfs:ro'
    - '/dev/disk/:/dev/disk:ro'
  healthcheck:
    # Probe the local web endpoint without downloading the body.
    test:
      - 'CMD'
      - 'wget'
      - '--no-verbose'
      - '--tries=1'
      - '--spider'
      - 'http://localhost:8080'
    start_period: 5s
    interval: 10s
    timeout: 1s
    retries: 5
  ports:
    - '8080:8080'
  networks:
    - transcendence

alertmanager:
  # Alertmanager service definition: configuration is read from the
  # bind-mounted ./monitoring/alertmanager directory, UI/API on 9093.
  container_name: alertmanager
  # current stable version 0.27.0 cannot parse webhook_url_file
  image: 'prom/alertmanager:main'
  command:
    - "--config.file=/etc/alertmanager/alertmanager.yml"
  volumes:
    - './monitoring/alertmanager/:/etc/alertmanager/'
  healthcheck:
    # Probe the local web endpoint without downloading the body.
    test:
      - 'CMD'
      - 'wget'
      - '--no-verbose'
      - '--tries=1'
      - '--spider'
      - 'http://localhost:9093'
    start_period: 5s
    interval: 5s
    timeout: 1s
    retries: 10
  ports:
    - '9093:9093'
  networks:
    - transcendence

blackbox:
image: prom/blackbox-exporter
image: prom/blackbox-exporter:master
container_name: blackbox_exporter
deploy:
replicas: 1
Expand All @@ -225,7 +286,13 @@ services:
volumes:
- ./monitoring/blackbox:/config
networks:
- transcendence
transcendence:
aliases:
- blackbox
command:
- '--config.file=/config/blackbox.yml'
ports:
- "9115:9115"

node_exporter:
image: prom/node-exporter:v0.18.1
Expand Down
219 changes: 219 additions & 0 deletions monitoring/alertmanager/alert.rules/alerts.rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
groups:
- name: Node Exporter Alerts
rules:
# Fires when the swap-in rate of the "prod" job stays above zero for 15m.
# NOTE(review): assumes a scrape job named "prod" exists — confirm in
# prometheus.yml, otherwise this selector matches nothing.
- alert: NodeHasSwapIn
  expr: 'irate(node_vmstat_pswpin{job="prod"}[5m]) > 0'
  for: 15m
  labels:
    severity: critical
  annotations:
    identifier: '{{ $labels.instance }}'
    summary: 'Node has swap in (instance {{ $labels.instance }})'
    description: '<!channel> VM SWAP (in) LABELS: {{ $labels.job }} : {{ $labels.instance }}'

# Fires when the swap-out rate of the "prod" job stays above zero for 15m.
# The expression must read node_vmstat_pswpout; the previous version queried
# node_vmstat_pswpin and therefore duplicated NodeHasSwapIn.
# NOTE(review): assumes a scrape job named "prod" exists — confirm in
# prometheus.yml, otherwise this selector matches nothing.
- alert: NodeHasSwapOut
  expr: 'irate(node_vmstat_pswpout{job="prod"}[5m]) > 0'
  for: 15m
  labels:
    severity: critical
  annotations:
    identifier: "{{ $labels.instance }}"
    summary: "Node has swap out (instance {{ $labels.instance }})"
    description: "<!channel> VM SWAP (out) LABELS: {{ $labels.job }} : {{ $labels.instance }}"

# Fires when fewer than 10% of inodes remain free on the "/rootfs"
# mountpoint for 15m.
# NOTE(review): OutOfDiskSpace in this file selects mountpoint "/" while this
# rule selects "/rootfs" — confirm which one node_exporter actually reports.
- alert: OutOfInodes
  expr: 'node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint ="/rootfs"} * 100 < 10'
  for: 15m
  labels:
    severity: critical
  annotations:
    identifier: '{{ $labels.instance }}'
    summary: 'Out of inodes (instance {{ $labels.instance }})'
    description: '<!channel> Disk almost out of inodes (< 10% remaining) VALUE = {{ humanize $value }} LABELS: {{ $labels.job }} : {{ $labels.instance }}'


# Fires when free + cached + buffer memory is below 10% of total memory
# for 15m.
- alert: OutOfMemory
  expr: >-
    (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)
    / node_memory_MemTotal_bytes * 100 < 10
  for: 15m
  labels:
    severity: critical
  annotations:
    identifier: '{{ $labels.instance }}'
    summary: 'Out of memory (instance {{ $labels.instance }})'
    description: '<!channel> Host almost out of RAM (< 10% remaining) VALUE = {{ $value }} LABELS: {{ $labels.job }} : {{ $labels.instance }}'


# Fires when the summed receive rate of an instance exceeds 100 MiB/s for 15m.
# `job` is kept in the aggregation because the description template reads
# {{ $labels.job }}; with `sum by (instance)` alone that label is dropped and
# the template renders an empty string.
- alert: UnusualNetworkThroughputIn
  expr: 'sum by (instance, job) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'
  for: 15m
  labels:
    severity: critical
  annotations:
    identifier: "{{ $labels.instance }}"
    summary: "Unusual network throughput in (instance {{ $labels.instance }})"
    description: "<!channel> Network interfaces receiving unusually high amounts of data (> 100 MB/s) VALUE = {{ $value }} LABELS: {{ $labels.job }} : {{ $labels.instance }}"

# Fires when the summed transmit rate of an instance exceeds 100 MiB/s for 15m.
# `job` is kept in the aggregation because the description template reads
# {{ $labels.job }}; with `sum by (instance)` alone that label is dropped and
# the template renders an empty string.
- alert: UnusualNetworkThroughputOut
  expr: 'sum by (instance, job) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'
  for: 15m
  labels:
    severity: critical
  annotations:
    identifier: "{{ $labels.instance }}"
    summary: "Unusual network throughput out (instance {{ $labels.instance }})"
    description: "<!channel> Network interfaces sending unusually high amounts of data (> 100 MB/s) VALUE = {{ $value }} LABELS: {{ $labels.job }} : {{ $labels.instance }}"

# Fires when the summed disk read rate of an instance exceeds 50 MiB/s for 15m.
# `job` is kept in the aggregation because the description template reads
# {{ $labels.job }}; with `sum by (instance)` alone that label is dropped and
# the template renders an empty string.
- alert: UnusualDiskReadRate
  expr: 'sum by (instance, job) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'
  for: 15m
  labels:
    severity: critical
  annotations:
    identifier: "{{ $labels.instance }}"
    summary: "Unusual disk read rate (instance {{ $labels.instance }})"
    description: "<!channel> Disk reads unusually high amounts of data (> 50 MB/s) VALUE = {{ $value }} LABELS: {{ $labels.job }} : {{ $labels.instance }}"

# Fires when the summed disk write rate of an instance exceeds 50 MiB/s for 15m.
# `job` is kept in the aggregation because the description template reads
# {{ $labels.job }}; with `sum by (instance)` alone that label is dropped and
# the template renders an empty string.
- alert: UnusualDiskWriteRate
  expr: 'sum by (instance, job) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50'
  for: 15m
  labels:
    severity: critical
  annotations:
    identifier: "{{ $labels.instance }}"
    summary: "Unusual disk write rate (instance {{ $labels.instance }})"
    description: "<!channel> Disk writes unusually high amounts of data (> 50 MB/s) VALUE = {{ $value }} LABELS: {{ $labels.job }} : {{ $labels.instance }}"

# Fires when less than 20% of the "/" filesystem is free for 15m.
# NOTE(review): OutOfInodes in this file selects mountpoint "/rootfs" while
# this rule selects "/" — confirm which one node_exporter actually reports.
- alert: OutOfDiskSpace
  expr: 'node_filesystem_free_bytes{mountpoint ="/"} / node_filesystem_size_bytes{mountpoint ="/"} * 100 < 20'
  for: 15m
  labels:
    severity: critical
  annotations:
    identifier: '{{ $labels.instance }}'
    summary: 'Out of disk space (instance {{ $labels.instance }})'
    description: '<!channel> Disk almost out of space (< 20% remaining) VALUE = {{ $value }} LABELS: {{ $labels.job }} : {{ $labels.instance }}'

# Fires when the average time per completed read exceeds 100 ms for 15m.
# The ratio is seconds-per-read (read_time_seconds / reads_completed), so the
# 100 ms limit announced in the description is 0.1; the previous threshold of
# 100 meant 100 seconds per read and could practically never fire.
- alert: UnusualDiskReadLatency
  expr: >-
    rate(node_disk_read_time_seconds_total[1m])
    / rate(node_disk_reads_completed_total[1m]) > 0.1
  for: 15m
  labels:
    severity: critical
  annotations:
    identifier: "{{ $labels.instance }}"
    summary: "Unusual disk read latency (instance {{ $labels.instance }})"
    description: "<!channel> Disk has unusually high latency (read operations > 100ms) VALUE = {{ $value }} LABELS: {{ $labels.job }} : {{ $labels.instance }}"

# Fires when the 15-minute load average divided by the number of
# mode="system" CPU series (one per CPU) stays above 2 for 15m.
- alert: CpuLoad
  expr: 'node_load15 / (count without (cpu, mode) (node_cpu_seconds_total{mode="system"})) > 2'
  for: 15m
  labels:
    severity: critical
  annotations:
    identifier: '{{ $labels.instance }}'
    summary: 'CPU load (instance {{ $labels.instance }})'
    description: '<!channel> CPU load (15m) is high VALUE = {{ $value }} LABELS: {{ $labels.job }} : {{ $labels.instance }}'

# Fires when the context-switch rate exceeds 1500 per second per
# mode="system" CPU series (one per CPU) for 15m.
- alert: ContextSwitching
  expr: >-
    rate(node_context_switches_total[5m])
    > ((count without (cpu, mode) (node_cpu_seconds_total{mode="system"})) * 1500)
  for: 15m
  labels:
    severity: critical
  annotations:
    identifier: '{{ $labels.instance }}'
    summary: 'Context switching (instance {{ $labels.instance }})'
    description: '<!channel> Host context switching unusually high VALUE = {{ $value }} LABELS: {{ $labels.job }} : {{ $labels.instance }}'

# Fires when a probe reports failure (probe_success == 0) for 10 seconds.
# NOTE(review): the description reads a label named `addresse`; nothing in
# this file defines it — confirm the scrape config attaches it, otherwise it
# renders empty.
- alert: 'Health endpoint down'
  expr: 'probe_success == 0'
  for: 10s
  labels:
    severity: critical
  annotations:
    identifier: '{{ $labels.instance }}'
    summary: 'Health endpoint is down'
    description: '<!channel> URL does not respond : {{ $labels.addresse }}'

# Fires when a certificate probed by the "blackbox" job expires in less than
# 30 days (86400 s * 30) for 10m. The annotation text now states that window;
# it previously announced "3 months", contradicting the expression.
# The alert name is left untouched because routing may match on it.
# NOTE(review): the description reads a label named `addresse`; nothing in
# this file defines it — confirm the scrape config attaches it, otherwise it
# renders empty.
- alert: Certificate expired
  expr: 'probe_ssl_earliest_cert_expiry{job="blackbox"} - time() < 86400 * 30'
  for: 10m
  labels:
    severity: critical
  annotations:
    identifier: "{{ $labels.instance }}"
    summary: "SSL certificate expires soon"
    description: "<!channel> SSL certificate expires in less than 30 days {{ $labels.addresse }}"

- name: GoogleCadvisor Alerts

rules:
# Fires when a container was last seen more than 60 seconds ago, for 2m.
- alert: ContainerKilled
  expr: time() - container_last_seen > 60
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Container killed (instance {{ $labels.instance }})'
    description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Fires when no container_last_seen series exists at all for 5m.
- alert: ContainerAbsent
  expr: absent(container_last_seen)
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Container absent (instance {{ $labels.instance }})'
    description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Fires when a container uses more than 80% of its CPU quota for 2m.
# Series are selected and grouped by `name`/`instance`, matching the memory
# rules in this group. The previous selector used the Kubernetes-only labels
# `container`/`pod`, which the standalone --docker_only cAdvisor in this
# stack does not emit, so the rule could never match; grouping by `instance`
# also lets the summary template resolve {{ $labels.instance }}.
# NOTE(review): container_spec_cpu_quota presumably only exists for
# containers with a CPU limit configured — confirm limits are set.
- alert: ContainerHighCpuUtilization
  expr: >-
    (
    sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m]))
    /
    sum by (instance, name) (container_spec_cpu_quota{name!=""} / container_spec_cpu_period{name!=""})
    * 100
    ) > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Container High CPU utilization (instance {{ $labels.instance }})"
    description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Fires when a named container's working set exceeds 80% of its memory
# limit (only limits > 0 are considered) for 2m.
- alert: ContainerHighMemoryUsage
  expr: >-
    (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name)
    / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100)
    > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Container High Memory usage (instance {{ $labels.instance }})'
    description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Fires when more than 80% of filesystem inodes are used per instance for 2m.
# NOTE(review): the numerator excludes name="cadvisor" but the denominator
# does not — confirm this asymmetry is intended.
- alert: ContainerVolumeUsage
  expr: >-
    (1 - (sum(container_fs_inodes_free{name!="cadvisor"}) BY (instance)
    / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Container Volume usage (instance {{ $labels.instance }})'
    description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Fires when more than 25% of a container's CFS periods were throttled over
# the last 5m, sustained for 5m.
# Series are selected and grouped by `name`/`instance`, matching the memory
# rules in this group. The previous version used the Kubernetes-only labels
# `container`/`pod`/`namespace`, which the standalone --docker_only cAdvisor
# in this stack does not emit, so the rule could never match.
- alert: ContainerHighThrottleRate
  expr: >-
    sum by (instance, name) (increase(container_cpu_cfs_throttled_periods_total{name!=""}[5m]))
    /
    sum by (instance, name) (increase(container_cpu_cfs_periods_total{name!=""}[5m]))
    > ( 25 / 100 )
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Container high throttle rate (instance {{ $labels.instance }})"
    description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Fires (severity info) when a container uses less than 20% of its CPU quota
# continuously for 7 days.
# Series are selected and grouped by `name`/`instance`, matching the memory
# rules in this group. The previous selector used the Kubernetes-only labels
# `container`/`pod`, which the standalone --docker_only cAdvisor in this
# stack does not emit, so the rule could never match.
# NOTE(review): container_spec_cpu_quota presumably only exists for
# containers with a CPU limit configured — confirm limits are set.
- alert: ContainerLowCpuUtilization
  expr: >-
    (
    sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m]))
    /
    sum by (instance, name) (container_spec_cpu_quota{name!=""} / container_spec_cpu_period{name!=""})
    * 100
    ) < 20
  for: 7d
  labels:
    severity: info
  annotations:
    summary: "Container Low CPU utilization (instance {{ $labels.instance }})"
    description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Fires (severity info) when a named container's working set stays below 20%
# of its memory limit (only limits > 0 are considered) for 7 days.
- alert: ContainerLowMemoryUsage
  expr: >-
    (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name)
    / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100)
    < 20
  for: 7d
  labels:
    severity: info
  annotations:
    summary: 'Container Low Memory usage (instance {{ $labels.instance }})'
    description: "Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
Loading

0 comments on commit 03ae17b

Please sign in to comment.