From ce01bcd4511e35f215e15158b8af06efee3350c4 Mon Sep 17 00:00:00 2001 From: Eunsu Kang <56429615+ssupecial@users.noreply.github.com> Date: Thu, 23 May 2024 18:37:12 +0900 Subject: [PATCH] feat(infra): set prometheus alertmanager and add container for checking number of instances (#50) * feat(infra): add alertmanager container * feat(infra): define prometheus cpu alert rules * feat(infra): add alertmanager config file * feat(infra): add github secrets of ms teams webhook url variable to github actions * feat(infra): add docker container for checking number of instance, each cluster --- .github/workflows/update-stage.yml | 2 + check-instance/Dockerfile | 7 ++ check-instance/check_instance.py | 126 ++++++++++++++++++++++++++ config/alertmanager/config.yml | 13 +++ config/prometheus/rules/cpu_rules.yml | 59 ++++++++++++ docker-compose.yml | 40 ++++++++ 6 files changed, 247 insertions(+) create mode 100644 check-instance/Dockerfile create mode 100644 check-instance/check_instance.py create mode 100644 config/alertmanager/config.yml create mode 100644 config/prometheus/rules/cpu_rules.yml diff --git a/.github/workflows/update-stage.yml b/.github/workflows/update-stage.yml index 80effa9..b7cfc52 100644 --- a/.github/workflows/update-stage.yml +++ b/.github/workflows/update-stage.yml @@ -21,6 +21,8 @@ jobs: GF_SMTP_FROM_ADDRESS = ${{ secrets.FROM_ADDRESS }} MINIO_ROOT_USER = ${{ secrets.MINIO_ROOT_USER }} MINIO_ROOT_PASSWORD = ${{ secrets.MINIO_ROOT_PASSWORD }} + MS_WEBHOOK_URL = ${{ secrets.MS_WEBHOOK_URL }} + EOF - name: Check if initial containers are running diff --git a/check-instance/Dockerfile b/check-instance/Dockerfile new file mode 100644 index 0000000..a43322b --- /dev/null +++ b/check-instance/Dockerfile @@ -0,0 +1,7 @@ +FROM python:3.11 +ENV PYTHONUNBUFFERED 1 +WORKDIR /app + +# Python 스크립트 복사 +COPY . . 
+RUN pip install requests
diff --git a/check-instance/check_instance.py b/check-instance/check_instance.py
new file mode 100644
index 0000000..e2178e9
--- /dev/null
+++ b/check-instance/check_instance.py
@@ -0,0 +1,126 @@
+import requests
+import json
+import datetime
+import time
+import os
+
+# Address of the Prometheus server
+PROMETHEUS_URL = "http://prometheus:9090"
+
+# MS Teams Incoming Webhook URL
+WEBHOOK_URL = os.environ["WEBHOOK_URL"]
+
+# File holding the last recorded instance count of each service
+STATE_FILE = "instance.json"
+
+# Timeout (seconds) for every HTTP call so a hung peer cannot stall the loop
+REQUEST_TIMEOUT = 10
+
+# state key -> (name used in log lines, name used in the Teams message, PromQL query).
+# "group by (instance)" keeps one series per instance and count() then returns how
+# many instances report for the job (a bare group() always collapsed the count to 1).
+SERVICES = {
+    "api-client": (
+        "Client",
+        "Client API",
+        'count(group by (instance) (system_cpu_utilization{job="backend-client-metric"}))',
+    ),
+    "api-admin": (
+        "Admin",
+        "Admin API",
+        'count(group by (instance) (system_cpu_utilization{job="backend-admin-metric"}))',
+    ),
+    "iris": (
+        "Iris",
+        "Iris",
+        'count(group by (instance) (cpu_usage_percent{job="iris-metric"}))',
+    ),
+}
+
+
+def fetch_metrics(query):
+    """Run an instant query against the Prometheus HTTP API.
+
+    Returns the decoded JSON body when Prometheus answers with HTTP 200,
+    otherwise None.
+    """
+    response = requests.get(
+        f"{PROMETHEUS_URL}/api/v1/query",
+        params={"query": query},
+        timeout=REQUEST_TIMEOUT,
+    )
+    if response.status_code == 200:
+        return response.json()
+    return None
+
+
+def read_instance_count(query):
+    """Return the instance count produced by query, or None when there is no data."""
+    body = fetch_metrics(query)
+    if not body:
+        return None
+    samples = body.get("data", {}).get("result")
+    if not samples:
+        return None
+    # Prometheus returns sample values as strings; convert so the count
+    # compares equal to the integers kept in the state file.
+    return int(float(samples[0]["value"][1]))
+
+
+if __name__ == "__main__":
+    print("Start check instance")
+
+    if os.path.isfile(STATE_FILE):
+        print(f"{STATE_FILE} exists.")
+    else:
+        # Create the state file on first run (default: one instance per service)
+        print(f"{STATE_FILE} does not exist.")
+        with open(STATE_FILE, "w") as json_file:
+            json.dump({key: 1 for key in SERVICES}, json_file)
+
+    while True:
+        try:
+            print("현재 시간: ", datetime.datetime.now())
+
+            # Load the counts recorded at the last detected change
+            with open(STATE_FILE, "r") as json_file:
+                before_data = json.load(json_file)
+
+            alerts = []
+            for key, (log_name, alert_name, query) in SERVICES.items():
+                current = read_instance_count(query)
+                if current is None:
+                    # No data for this service; leave its recorded count untouched
+                    continue
+                print(f"{log_name} 인스턴스 개수: ", current)
+
+                # Older state files may hold the count as a string, so normalise
+                previous = int(before_data.get(key, current))
+                before_data[key] = current
+                if current != previous:
+                    print(f"{key} 인스턴스 개수 변경")
+                    alerts.append((alert_name, current, previous))
+
+            if alerts:
+                # Save the changed counts before notifying
+                with open(STATE_FILE, "w") as json_file:
+                    json.dump(before_data, json_file)
+
+                message = ""
+                for alert_name, current, previous in alerts:
+                    if current > previous:
+                        message += f"{alert_name} 인스턴스가 {current - previous}개 증가하였습니다: {previous}개 -> {current}개\n"
+                    else:
+                        message += f"{alert_name} 인스턴스 {previous - current}개 감소하였습니다: {previous}개 -> {current}개\n"
+                payload = {
+                    "title": "인스턴스 개수 변경 알림",
+                    "text": message,
+                }
+                requests.post(WEBHOOK_URL, json=payload, timeout=REQUEST_TIMEOUT)
+
+        except Exception as e:
+            # Top-level boundary: report the failure and retry on the next cycle
+            print(f"Error: {e}")
+
+        # Poll once a minute
+        time.sleep(60)
diff --git a/config/alertmanager/config.yml b/config/alertmanager/config.yml
new file mode 100644
index 0000000..3e2d7a7
--- /dev/null
+++ b/config/alertmanager/config.yml
@@ -0,0 +1,13 @@
+global:
+  resolve_timeout: 1m
+
+receivers:
+  - name: skkuding-msteams
+    webhook_configs:
+      - send_resolved: true
+        url: 'http://prometheus-msteams:2000/alertmanager'
+route:
+  group_interval: 5m
+  group_wait: 30s
+  repeat_interval: 4h  # keep this a multiple of group_interval (Alertmanager default: 4h)
+  receiver: skkuding-msteams
\ No newline at end of file
diff --git a/config/prometheus/rules/cpu_rules.yml b/config/prometheus/rules/cpu_rules.yml
new file mode 100644
index 0000000..be351ec
--- /dev/null
+++ b/config/prometheus/rules/cpu_rules.yml
@@ -0,0 +1,59 @@
+groups:
+  - name: cpu_alerts_per_container
+    rules:
+      - alert: HighCpuUsageClientAPIWarning
+        expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-client-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-client-metric"}[5m]))) > 80
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU usage detected on {{ $labels.instance }} of Client API"
+          description: 'CPU usage is above 80% for 1 minute (currently {{ $value | printf "%.2f" }}%)'
+          value: '{{ $value | printf "%.2f" }}'
+
+      - alert: HighCpuUsageClientAPICritical
+        expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-client-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-client-metric"}[5m]))) > 90
+        for: 1m
+        labels:
+          severity: 'critical'
+        annotations:
+          summary: "High CPU usage detected on {{ $labels.instance }} of Client API"
+          description: 'CPU usage is above 90% for 1 minute (currently {{ $value | printf "%.2f" }}%)'
+          value: '{{ $value | printf "%.2f" }}'
+
+
+      - alert: HighCpuUsageAdminAPIWarning
+        expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-admin-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-admin-metric"}[5m]))) > 80
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU usage detected on {{ $labels.instance }} of Admin API"
+          description: 'CPU usage is above 80% for 1 minute (currently {{ $value | printf "%.2f" }}%)'
+          value: '{{ $value | printf "%.2f" }}'
+
+      - alert: HighCpuUsageAdminAPICritical
+        expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-admin-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-admin-metric"}[5m]))) > 90
+        for: 1m
+        labels:
+          severity: 'critical'
+        annotations:
+          summary: "High CPU usage detected on {{ $labels.instance }} of Admin API"
+          description: 'CPU usage is above 90% for 1 minute (currently {{ $value | printf "%.2f" }}%)'
+          value: '{{ $value | printf "%.2f" }}'
+
+  # - name: cpu_alerts per instance
+  #   rules:
+  #     - alert: HighCpuUsage
+  #       expr: sum by (instance) (rate(process_cpu_seconds_total{job="backend-client-metric"}[5m])) > 0.8
+  #       for: 1m
+  #       labels:
+  #         severity: warning
+  #       annotations:
+  #         summary: "High CPU usage detected on {{ $labels.instance }}"
+  #         description: "CPU usage is above 80% for 1 minute (currently {{ $value }}%)"
+
+  # - name: memory_alerts
+  #   rules:
+  #     - alert: HighMemoryUsage
+  #       expr:
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index a58ecc1..8443152 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -85,11 +85,52 @@ services:
       - "9090:9090"
     volumes:
       - "$PWD/config/prometheus/prometheus-config.yml:/etc/prometheus/prometheus-config.yml"
+      - "$PWD/config/prometheus/rules:/etc/prometheus/rules"
       - prometheus_data_volume:/prometheus
     command:
       - '--config.file=/etc/prometheus/prometheus-config.yml'
       - '--storage.tsdb.path=/prometheus'
       - '--web.enable-remote-write-receiver'
+      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
+      - '--web.console.templates=/usr/share/prometheus/consoles'
+    restart: always
+
+  alertmanager:
+    profiles: ["metric"]
+    image: prom/alertmanager
+    container_name: alertmanager
+    volumes:
+      - "$PWD/config/alertmanager/config.yml:/etc/alertmanager/config.yml"
+    command:
+      - '--config.file=/etc/alertmanager/config.yml'
+    ports:
+      - "9093:9093"
+    restart: always
+
+  prometheus-msteams:
+    profiles: ["metric"]
+    image: docker.io/bzon/prometheus-msteams:v1.1.4
+    container_name: prometheus-msteams
+    restart: always
+    environment:
+      - TEAMS_INCOMING_WEBHOOK_URL=${MS_WEBHOOK_URL}
+      - TEAMS_REQUEST_URI=alertmanager
+    expose:
+      - "2000"
+
+  check-instance:
+    profiles: ["metric"]
+    build: ./check-instance
+    environment:
+      - WEBHOOK_URL=${MS_WEBHOOK_URL}
+    container_name: check-instance
+    command: ["python", "check_instance.py"]
+    volumes:
+      # NOTE(review): this named volume covers /app, where the image copies check_instance.py,
+      # so an already-populated volume hides newer image contents - confirm this is intended.
+      - check_instance_data_volume:/app
+    depends_on:
+      - prometheus
     restart: always
 
   tempo:
@@ -130,3 +171,4 @@ volumes:
   grafana_storage: {}
   minio_data_volume:
   prometheus_data_volume:
+  check_instance_data_volume: