feat(infra): set up Prometheus Alertmanager and add container for checking number of instances (#50)

* feat(infra): add alertmanager container

* feat(infra): define prometheus cpu alert rules

* feat(infra): add alertmanager config file

* feat(infra): add MS Teams webhook URL secret to GitHub Actions

* feat(infra): add Docker container for checking the number of instances in each cluster
ssupecial authored May 23, 2024
1 parent aec8ff8 commit ce01bcd
Showing 6 changed files with 247 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/update-stage.yml
@@ -21,6 +21,8 @@ jobs:
GF_SMTP_FROM_ADDRESS = ${{ secrets.FROM_ADDRESS }}
MINIO_ROOT_USER = ${{ secrets.MINIO_ROOT_USER }}
MINIO_ROOT_PASSWORD = ${{ secrets.MINIO_ROOT_PASSWORD }}
MS_WEBHOOK_URL = ${{ secrets.MS_WEBHOOK_URL }}
EOF
- name: Check if initial containers are running
7 changes: 7 additions & 0 deletions check-instance/Dockerfile
@@ -0,0 +1,7 @@
FROM python:3.11
ENV PYTHONUNBUFFERED=1
WORKDIR /app

# Copy the Python scripts
COPY . .
RUN pip install requests
126 changes: 126 additions & 0 deletions check-instance/check_instance.py
@@ -0,0 +1,126 @@
import requests
import json
import datetime
import time
import os

# from message_card_template import message_card

# Prometheus server address
PROMETHEUS_URL = "http://prometheus:9090"

# MS Teams Incoming Webhook URL
WEBHOOK_URL = os.environ["WEBHOOK_URL"]


def fetch_metrics(query):
    response = requests.get(f"{PROMETHEUS_URL}/api/v1/query", params={"query": query})
    if response.status_code == 200:
        return response.json()
    else:
        return None
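
# Illustrative (assumed) shape of a successful /api/v1/query response,
# per the Prometheus HTTP API instant-vector format:
#
#   {
#     "status": "success",
#     "data": {
#       "resultType": "vector",
#       "result": [
#         {"metric": {"instance": "..."}, "value": [<unix timestamp>, "<count>"]}
#       ]
#     }
#   }
#
# so result["data"]["result"][0]["value"][1] below is the sample value,
# which Prometheus returns as a string.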


if __name__ == "__main__":
    print("Start check instance")

    file_path = "instance.json"

    if os.path.isfile(file_path):
        print(f"{file_path} exists.")
    else:  # Create the file recording instance counts if it does not exist (default: 1)
        print(f"{file_path} does not exist.")
        with open("instance.json", "w") as json_file:
            data = {"api-client": 1, "api-admin": 1, "iris": 1}
            json.dump(data, json_file)

    while True:
        # Collect data
        try:
            backend_client_metric_query = 'count by (instance) (group(system_cpu_utilization{job="backend-client-metric"}))'
            result_api_client = fetch_metrics(backend_client_metric_query)
            backend_admin_metric_query = 'count by (instance) (group(system_cpu_utilization{job="backend-admin-metric"}))'
            result_api_admin = fetch_metrics(backend_admin_metric_query)
            iris_metric_query = (
                'count by (instance) (group(cpu_usage_percent{job="iris-metric"}))'
            )
            result_iris = fetch_metrics(iris_metric_query)

            data = {}
            alerts = []
            print("Current time: ", datetime.datetime.now())

            # Only when Prometheus returned data; cast the sample value to int
            # so comparisons against the counts stored in instance.json are
            # type-consistent (Prometheus returns it as a string)
            if result_api_client["data"]["result"]:
                result = int(result_api_client["data"]["result"][0]["value"][1])
                data["api-client"] = result
                print("Number of Client instances: ", result)

            if result_api_admin["data"]["result"]:
                result = int(result_api_admin["data"]["result"][0]["value"][1])
                data["api-admin"] = result
                print("Number of Admin instances: ", result)

            if result_iris["data"]["result"]:
                result = int(result_iris["data"]["result"][0]["value"][1])
                data["iris"] = result
                print("Number of Iris instances: ", result)

            # Compare with the previous data
            with open("instance.json", "r") as json_file:
                before_data = json.load(json_file)

            # Only when Prometheus returned data and it differs from the previous data
            for key, value in data.items():
                print(key)
                if key == "api-client":
                    if data["api-client"] != before_data["api-client"]:
                        print("api-client instance count changed")
                        alerts.append(
                            (
                                "Client API",
                                int(data["api-client"]),
                                int(before_data["api-client"]),
                            )
                        )
                        before_data["api-client"] = data["api-client"]
                if key == "api-admin":
                    if data["api-admin"] != before_data["api-admin"]:
                        print("api-admin instance count changed")
                        alerts.append(
                            (
                                "Admin API",
                                int(data["api-admin"]),
                                int(before_data["api-admin"]),
                            )
                        )
                        before_data["api-admin"] = data["api-admin"]
                if key == "iris":
                    if data["iris"] != before_data["iris"]:
                        print("iris instance count changed")
                        alerts.append(
                            ("Iris", int(data["iris"]), int(before_data["iris"]))
                        )
                        before_data["iris"] = data["iris"]

            if alerts:
                with open("instance.json", "w") as json_file:
                    json.dump(before_data, json_file)  # Save the updated instance counts

                message = ""
                for alert in alerts:
                    if (alert[1] - alert[2]) > 0:
                        message += f"{alert[0]} instances increased by {alert[1] - alert[2]}: {alert[2]} -> {alert[1]}\n"
                    else:
                        message += f"{alert[0]} instances decreased by {alert[2] - alert[1]}: {alert[2]} -> {alert[1]}\n"
                payload = {
                    "title": "Instance count change notification",
                    "text": message,
                }
                requests.post(WEBHOOK_URL, json=payload)

        except Exception as e:
            print(f"Error: {e}")

        # Collect data every minute
        time.sleep(60)
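
For a quick local smoke test of the script, assuming the hardcoded PROMETHEUS_URL (http://prometheus:9090) resolves in your environment, and with a hypothetical webhook URL:

    WEBHOOK_URL="https://example.webhook.office.com/xxx" python check_instance.py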
13 changes: 13 additions & 0 deletions config/alertmanager/config.yml
@@ -0,0 +1,13 @@
global:
  resolve_timeout: 1m

receivers:
  - name: skkuding-msteams
    webhook_configs:
      - send_resolved: true
        url: 'http://prometheus-msteams:2000/alertmanager'

route:
  group_interval: 5m
  group_wait: 30s
  repeat_interval: 30s
  receiver: skkuding-msteams
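
This receiver and route only take effect once Prometheus is pointed at Alertmanager and at the rule files. prometheus-config.yml is not part of this diff, so as an assumed sketch it would need the standard stanzas:

    rule_files:
      - /etc/prometheus/rules/*.yml

    alerting:
      alertmanagers:
        - static_configs:
            - targets: ['alertmanager:9093']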
59 changes: 59 additions & 0 deletions config/prometheus/rules/cpu_rules.yml
@@ -0,0 +1,59 @@
groups:
  - name: cpu_alerts_per_container
    rules:
      - alert: HighCpuUsageClientAPIWarning
        expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-client-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-client-metric"}[5m]))) > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }} of Client API"
          description: 'CPU usage is above 80% for 1 minute (currently {{ $value | printf "%.2f" }}%)'
          value: '{{ $value | printf "%.2f" }}'

      - alert: HighCpuUsageClientAPICritical
        expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-client-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-client-metric"}[5m]))) > 90
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }} of Client API"
          description: 'CPU usage is above 90% for 1 minute (currently {{ $value | printf "%.2f" }}%)'
          value: '{{ $value | printf "%.2f" }}'

      - alert: HighCpuUsageAdminAPIWarning
        expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-admin-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-admin-metric"}[5m]))) > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }} of Admin API"
          description: 'CPU usage is above 80% for 1 minute (currently {{ $value | printf "%.2f" }}%)'
          value: '{{ $value | printf "%.2f" }}'

      - alert: HighCpuUsageAdminAPICritical
        expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-admin-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-admin-metric"}[5m]))) > 90
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }} of Admin API"
          description: 'CPU usage is above 90% for 1 minute (currently {{ $value | printf "%.2f" }}%)'
          value: '{{ $value | printf "%.2f" }}'

  # - name: cpu_alerts_per_instance
  #   rules:
  #     - alert: HighCpuUsage
  #       expr: sum by (instance) (rate(process_cpu_seconds_total{job="backend-client-metric"}[5m])) > 0.8
  #       for: 1m
  #       labels:
  #         severity: warning
  #       annotations:
  #         summary: "High CPU usage detected on {{ $labels.instance }}"
  #         description: "CPU usage is above 80% for 1 minute (currently {{ $value }}%)"

  # - name: memory_alerts
  #   rules:
  #     - alert: HighMemoryUsage
  #       expr:
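
Assuming promtool (which ships with Prometheus) is available, these rules can be validated before deploying:

    promtool check rules config/prometheus/rules/cpu_rules.yml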
40 changes: 40 additions & 0 deletions docker-compose.yml
@@ -85,11 +85,50 @@ services:
- "9090:9090"
volumes:
- "$PWD/config/prometheus/prometheus-config.yml:/etc/prometheus/prometheus-config.yml"
- "$PWD/config/prometheus/rules:/etc/prometheus/rules"
- prometheus_data_volume:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus-config.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.enable-remote-write-receiver'
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
- '--web.console.templates=/usr/share/prometheus/consoles'
restart: always

alertmanager:
profiles: ["metric"]
image: prom/alertmanager
container_name: alertmanager
volumes:
- "$PWD/config/alertmanager/config.yml:/etc/alertmanager/config.yml"
command:
- '--config.file=/etc/alertmanager/config.yml'
ports:
- "9093:9093"
restart: always

prometheus-msteams:
profiles: ["metric"]
image: docker.io/bzon/prometheus-msteams:v1.1.4
container_name: prometheus-msteams
restart: always
environment:
- TEAMS_INCOMING_WEBHOOK_URL=${MS_WEBHOOK_URL}
- TEAMS_REQUEST_URI=alertmanager
expose:
- "2000"

check-instance:
profiles: ["metric"]
build: ./check-instance
environment:
- WEBHOOK_URL=${MS_WEBHOOK_URL}
container_name: check-instance
command: ["python", "check_instance.py"]
volumes:
- check_instance_data_volume:/app
depends_on:
- prometheus
restart: always

  tempo:
@@ -130,3 +169,4 @@ volumes:
  grafana_storage: {}
  minio_data_volume:
  prometheus_data_volume:
  check_instance_data_volume:
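All four new services are gated behind the metric profile, so a sketch of bringing them up locally (with a hypothetical webhook URL) would be:

    MS_WEBHOOK_URL="https://example.webhook.office.com/xxx" docker compose --profile metric up -d --build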
