From ce01bcd4511e35f215e15158b8af06efee3350c4 Mon Sep 17 00:00:00 2001
From: Eunsu Kang <56429615+ssupecial@users.noreply.github.com>
Date: Thu, 23 May 2024 18:37:12 +0900
Subject: [PATCH] feat(infra): set prometheus alertmanager and add container
 for checking number of instances (#50)

* feat(infra): add alertmanager container

* feat(infra): define prometheus cpu alert rules

* feat(infra): add alertmanager config file

* feat(infra): add github secrets of ms teams webhook url variable to github actions

* feat(infra): add docker container for checking the number of instances in each cluster
---
 .github/workflows/update-stage.yml    |   2 +
 check-instance/Dockerfile             |   7 ++
 check-instance/check_instance.py      | 126 ++++++++++++++++++++++++++
 config/alertmanager/config.yml        |  13 +++
 config/prometheus/rules/cpu_rules.yml |  59 ++++++++++++
 docker-compose.yml                    |  40 ++++++++
 6 files changed, 247 insertions(+)
 create mode 100644 check-instance/Dockerfile
 create mode 100644 check-instance/check_instance.py
 create mode 100644 config/alertmanager/config.yml
 create mode 100644 config/prometheus/rules/cpu_rules.yml

diff --git a/.github/workflows/update-stage.yml b/.github/workflows/update-stage.yml
index 80effa9..b7cfc52 100644
--- a/.github/workflows/update-stage.yml
+++ b/.github/workflows/update-stage.yml
@@ -21,6 +21,8 @@ jobs:
           GF_SMTP_FROM_ADDRESS = ${{ secrets.FROM_ADDRESS }}
           MINIO_ROOT_USER = ${{ secrets.MINIO_ROOT_USER }}
           MINIO_ROOT_PASSWORD = ${{ secrets.MINIO_ROOT_PASSWORD }}
+          MS_WEBHOOK_URL = ${{ secrets.MS_WEBHOOK_URL }}
+
           EOF
 
       - name: Check if initial containers are running
diff --git a/check-instance/Dockerfile b/check-instance/Dockerfile
new file mode 100644
index 0000000..a43322b
--- /dev/null
+++ b/check-instance/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:3.11
+ENV PYTHONUNBUFFERED=1
+WORKDIR /app
+
+# Install dependencies before copying the script so this layer stays cached when only the script changes.
+RUN pip install --no-cache-dir requests
+COPY . .
diff --git a/check-instance/check_instance.py b/check-instance/check_instance.py
new file mode 100644
index 0000000..e2178e9
--- /dev/null
+++ b/check-instance/check_instance.py
@@ -0,0 +1,126 @@
+import requests
+import json
+import datetime
+import time
+import os
+
+# from message_card_template import message_card
+
+# Prometheus 서버 주소
+PROMETHEUS_URL = "http://prometheus:9090"
+
+# MS Teams Incoming Webhook URL
+WEBHOOK_URL = os.environ["WEBHOOK_URL"]
+
+
+def fetch_metrics(query):
+    response = requests.get(f"{PROMETHEUS_URL}/api/v1/query", params={"query": query})
+    if response.status_code == 200:
+        return response.json()
+    else:
+        return None
+
+
+if __name__ == "__main__":
+    print("Start check instance")
+
+    file_path = "instance.json"
+
+    if os.path.isfile(file_path):
+        print(f"{file_path} exists.")
+    else:  # 인스턴스 개수를 기록하는 파일이 없으면 생성 (기본값: 1)
+        print(f"{file_path} does not exist.")
+        with open("instance.json", "w") as json_file:
+            data = {"api-client": 1, "api-admin": 1, "iris": 1}
+            json.dump(data, json_file)
+
+    while True:
+        # 데이터 수집
+        try:
+            backend_client_metric_query = 'count by (instance) (group(system_cpu_utilization{job="backend-client-metric"}))'
+            result_api_client = fetch_metrics(backend_client_metric_query)
+            backend_admin_metric_query = 'count by (instance) (group(system_cpu_utilization{job="backend-admin-metric"}))'
+            result_api_admin = fetch_metrics(backend_admin_metric_query)
+            iris_metric_query = (
+                'count by (instance) (group(cpu_usage_percent{job="iris-metric"}))'
+            )
+            result_iris = fetch_metrics(iris_metric_query)
+
+            data = {}
+            alerts = []
+            print("현재 시간: ", datetime.datetime.now())
+
+            # Prometheus에서 데이터를 가져온 경우만
+            if result_api_client["data"]["result"]:
+                result = result_api_client["data"]["result"][0]["value"][1]
+                data["api-client"] = result
+                print("Client 인스턴스 개수: ", (result))
+
+            if result_api_admin["data"]["result"]:
+                result = result_api_admin["data"]["result"][0]["value"][1]
+                data["api-admin"] = result
+                print("Admin 인스턴스 개수: ", (result))
+
+            if result_iris["data"]["result"]:
+                result = result_iris["data"]["result"][0]["value"][1]
+                data["iris"] = result
+                print("Iris 인스턴스 개수: ", (result))
+
+            # 이전 데이터와 비교
+            with open("instance.json", "r") as json_file:
+                before_data = json.load(json_file)
+
+            # Prometheus에서 데이터를 가져온 경우 & 이전 데이터와 다른 경우
+            for key, value in data.items():
+                print(key)
+                if key == "api-client":
+                    if data["api-client"] != before_data["api-client"]:
+                        print("api-client 인스턴스 개수 변경")
+                        alerts.append(
+                            (
+                                "Client API",
+                                int(data["api-client"]),
+                                int(before_data["api-client"]),
+                            )
+                        )
+                        before_data["api-client"] = data["api-client"]
+                if key == "api-admin":
+                    if data["api-admin"] != before_data["api-admin"]:
+                        print("api-admin 인스턴스 개수 변경")
+                        alerts.append(
+                            (
+                                "Admin API",
+                                int(data["api-admin"]),
+                                int(before_data["api-admin"]),
+                            )
+                        )
+                        before_data["api-admin"] = data["api-admin"]
+                if key == "iris":
+                    if data["iris"] != before_data["iris"]:
+                        print("iris 인스턴스 개수 변경")
+                        alerts.append(
+                            ("Iris", int(data["iris"]), int(before_data["iris"]))
+                        )
+                        before_data["iris"] = data["iris"]
+
+            if alerts:
+                with open("instance.json", "w") as json_file:
+                    json.dump(before_data, json_file)  # 변경된 인스턴스 개수 저장
+
+                message = ""
+                for alert in alerts:
+                    if (alert[1] - alert[2]) > 0:
+                        message += f"{alert[0]} 인스턴스가 {alert[1]-alert[2]}개 증가하였습니다: {alert[2]}개 -> {alert[1]}개\n"
+                    else:
+                        message += f"{alert[0]} 인스턴스 {alert[2]-alert[1]}개 감소하였습니다: {alert[2]}개 -> {alert[1]}개\n"
+                payload = {
+                    "title": "인스턴스 개수 변경 알림",
+                    "text": message,
+                }
+                requests.post(WEBHOOK_URL, json=payload)
+
+        except Exception as e:
+            print(f"Error: {e}")
+
+        # 1분마다 데이터 수집
+        time.sleep(60)
diff --git a/config/alertmanager/config.yml b/config/alertmanager/config.yml
new file mode 100644
index 0000000..3e2d7a7
--- /dev/null
+++ b/config/alertmanager/config.yml
@@ -0,0 +1,13 @@
+global:  # Alertmanager settings: every alert is routed to the single MS Teams receiver below
+  resolve_timeout: 1m
+
+receivers:
+  - name: skkuding-msteams
+    webhook_configs:
+      - send_resolved: true
+        url: 'http://prometheus-msteams:2000/alertmanager'  # prometheus-msteams service defined in docker-compose.yml
+route:
+  group_interval: 5m
+  group_wait: 30s
+  repeat_interval: 30s  # NOTE(review): shorter than group_interval (5m) — confirm this is intended and not a testing leftover
+  receiver: skkuding-msteams
\ No newline at end of file
diff --git a/config/prometheus/rules/cpu_rules.yml b/config/prometheus/rules/cpu_rules.yml
new file mode 100644
index 0000000..be351ec
--- /dev/null
+++ b/config/prometheus/rules/cpu_rules.yml
@@ -0,0 +1,59 @@
+groups:
+  - name: cpu_alerts_per_container
+    rules:
+    - alert: HighCpuUsageClientAPIWarning  # warning threshold is 80, as stated in the description below
+      expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-client-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-client-metric"}[5m]))) > 80
+      for: 1m
+      labels:
+        severity: warning
+      annotations:
+        summary: "High CPU usage detected on {{ $labels.instance }} of Client API"
+        description: 'CPU usage is above 80% for 1 minute (currently {{ $value | printf "%.2f" }}%)'
+        value: '{{ $value | printf "%.2f" }}'
+
+    - alert: HighCpuUsageClientAPICritical
+      expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-client-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-client-metric"}[5m]))) > 90
+      for: 1m
+      labels:
+        severity: 'critical'
+      annotations:
+        summary: "High CPU usage detected on {{ $labels.instance }} of Client API"
+        description: 'CPU usage is above 90% for 1 minute (currently {{ $value | printf "%.2f" }}%)'
+        value: '{{ $value | printf "%.2f" }}'
+
+
+    - alert: HighCpuUsageAdminAPIWarning
+      expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-admin-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-admin-metric"}[5m]))) > 80
+      for: 1m
+      labels:
+        severity: warning
+      annotations:
+        summary: "High CPU usage detected on {{ $labels.instance }} of Admin API"
+        description: 'CPU usage is above 80% for 1 minute (currently {{ $value | printf "%.2f" }}%)'
+        value: '{{ $value | printf "%.2f" }}'
+
+    - alert: HighCpuUsageAdminAPICritical
+      expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-admin-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-admin-metric"}[5m]))) > 90
+      for: 1m
+      labels:
+        severity: 'critical'
+      annotations:
+        summary: "High CPU usage detected on {{ $labels.instance }} of Admin API"
+        description: 'CPU usage is above 90% for 1 minute (currently {{ $value | printf "%.2f" }}%)'
+        value: '{{ $value | printf "%.2f" }}'
+
+  # - name: cpu_alerts per instance 
+  #   rules:
+  #   - alert: HighCpuUsage
+  #     expr: sum by (instance) (rate(process_cpu_seconds_total{job="backend-client-metric"}[5m])) > 0.8
+  #     for: 1m
+  #     labels:
+  #       severity: warning
+  #     annotations:
+  #       summary: "High CPU usage detected on {{ $labels.instance }}"
+  #       description: "CPU usage is above 80% for 1 minute (currently {{ $value }}%)"
+
+  # - name: memory_alerts
+  #   rules:
+  #   - alert: HighMemoryUsage
+  #     expr: 
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index a58ecc1..8443152 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -85,11 +85,50 @@ services:
       - "9090:9090"
     volumes:
       - "$PWD/config/prometheus/prometheus-config.yml:/etc/prometheus/prometheus-config.yml"
+      - "$PWD/config/prometheus/rules:/etc/prometheus/rules"
       - prometheus_data_volume:/prometheus
     command: 
       - '--config.file=/etc/prometheus/prometheus-config.yml'
       - '--storage.tsdb.path=/prometheus'
       - '--web.enable-remote-write-receiver'
+      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
+      - '--web.console.templates=/usr/share/prometheus/consoles'
+    restart: always
+
+  alertmanager:
+    profiles: ["metric"]
+    image: prom/alertmanager
+    container_name: alertmanager
+    volumes:
+      - "$PWD/config/alertmanager/config.yml:/etc/alertmanager/config.yml"
+    command:
+      - '--config.file=/etc/alertmanager/config.yml'
+    ports:
+      - "9093:9093"
+    restart: always
+
+  prometheus-msteams:
+    profiles: ["metric"]
+    image: docker.io/bzon/prometheus-msteams:v1.1.4
+    container_name: prometheus-msteams
+    restart: always
+    environment:
+      - TEAMS_INCOMING_WEBHOOK_URL=${MS_WEBHOOK_URL}
+      - TEAMS_REQUEST_URI=alertmanager
+    expose:
+      - "2000"
+
+  check-instance:  # runs check-instance/check_instance.py, which polls Prometheus and posts instance-count changes to MS Teams
+    profiles: ["metric"]
+    build: ./check-instance
+    environment:
+      - WEBHOOK_URL=${MS_WEBHOOK_URL}
+    container_name: check-instance
+    command: ["python", "check_instance.py"]
+    volumes:
+      - check_instance_data_volume:/app  # NOTE(review): volume covers /app, where the image's script also lives — verify rebuilt images are not shadowed by stale volume contents
+    depends_on:
+      - prometheus
     restart: always
     
   tempo:
@@ -130,3 +169,4 @@ volumes:
   grafana_storage: {}
   minio_data_volume:
   prometheus_data_volume:
+  check_instance_data_volume: