feat(infra): set up Prometheus Alertmanager and add container for checking number of instances #50

Merged · 5 commits · May 23, 2024
2 changes: 2 additions & 0 deletions .github/workflows/update-stage.yml
@@ -21,6 +21,8 @@ jobs:
          GF_SMTP_FROM_ADDRESS = ${{ secrets.FROM_ADDRESS }}
          MINIO_ROOT_USER = ${{ secrets.MINIO_ROOT_USER }}
          MINIO_ROOT_PASSWORD = ${{ secrets.MINIO_ROOT_PASSWORD }}
          MS_WEBHOOK_URL = ${{ secrets.MS_WEBHOOK_URL }}
          EOF

      - name: Check if initial containers are running
7 changes: 7 additions & 0 deletions check-instance/Dockerfile
@@ -0,0 +1,7 @@
FROM python:3.11
ENV PYTHONUNBUFFERED=1
WORKDIR /app

# Copy the Python script into the image
COPY . .
RUN pip install requests
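
For local testing outside Compose, the image can be built and run directly. A minimal sketch, with hypothetical placeholder values; note that the script expects a reachable `prometheus` host, which normally resolves only on the Compose network:

docker build -t check-instance ./check-instance
docker run --rm --network <compose-network> -e WEBHOOK_URL="<teams-webhook-url>" check-instance python check_instance.py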
126 changes: 126 additions & 0 deletions check-instance/check_instance.py
@@ -0,0 +1,126 @@
import requests
import json
import datetime
import time
import os

# from message_card_template import message_card

# Prometheus server address
PROMETHEUS_URL = "http://prometheus:9090"

# MS Teams Incoming Webhook URL
WEBHOOK_URL = os.environ["WEBHOOK_URL"]


def fetch_metrics(query):
    # Query Prometheus' instant-query HTTP API; returns the parsed JSON
    # body on HTTP 200, or None on any other status.
    response = requests.get(f"{PROMETHEUS_URL}/api/v1/query", params={"query": query})
    if response.status_code == 200:
        return response.json()
    else:
        return None


if __name__ == "__main__":
    print("Start check instance")

    file_path = "instance.json"

    if os.path.isfile(file_path):
        print(f"{file_path} exists.")
    else:  # create the file that records instance counts if it does not exist (default: 1)
        print(f"{file_path} does not exist.")
        with open("instance.json", "w") as json_file:
            data = {"api-client": 1, "api-admin": 1, "iris": 1}
            json.dump(data, json_file)

    while True:
        # Collect data
        try:
            backend_client_metric_query = 'count by (instance) (group(system_cpu_utilization{job="backend-client-metric"}))'
            result_api_client = fetch_metrics(backend_client_metric_query)
            backend_admin_metric_query = 'count by (instance) (group(system_cpu_utilization{job="backend-admin-metric"}))'
            result_api_admin = fetch_metrics(backend_admin_metric_query)
            iris_metric_query = (
                'count by (instance) (group(cpu_usage_percent{job="iris-metric"}))'
            )
            result_iris = fetch_metrics(iris_metric_query)

            data = {}
            alerts = []
            print("Current time: ", datetime.datetime.now())

            # Only when Prometheus actually returned data
            # (sample values come back as strings, so cast to int for comparison)
            if result_api_client["data"]["result"]:
                result = int(result_api_client["data"]["result"][0]["value"][1])
                data["api-client"] = result
                print("Number of Client instances: ", result)

            if result_api_admin["data"]["result"]:
                result = int(result_api_admin["data"]["result"][0]["value"][1])
                data["api-admin"] = result
                print("Number of Admin instances: ", result)

            if result_iris["data"]["result"]:
                result = int(result_iris["data"]["result"][0]["value"][1])
                data["iris"] = result
                print("Number of Iris instances: ", result)

            # Compare with the previous data
            with open("instance.json", "r") as json_file:
                before_data = json.load(json_file)

            # For each metric fetched from Prometheus, alert when the count
            # differs from the previously recorded one
            for key, value in data.items():
                print(key)
                if key == "api-client":
                    if data["api-client"] != before_data["api-client"]:
                        print("api-client instance count changed")
                        alerts.append(
                            (
                                "Client API",
                                int(data["api-client"]),
                                int(before_data["api-client"]),
                            )
                        )
                        before_data["api-client"] = data["api-client"]
                if key == "api-admin":
                    if data["api-admin"] != before_data["api-admin"]:
                        print("api-admin instance count changed")
                        alerts.append(
                            (
                                "Admin API",
                                int(data["api-admin"]),
                                int(before_data["api-admin"]),
                            )
                        )
                        before_data["api-admin"] = data["api-admin"]
                if key == "iris":
                    if data["iris"] != before_data["iris"]:
                        print("iris instance count changed")
                        alerts.append(
                            ("Iris", int(data["iris"]), int(before_data["iris"]))
                        )
                        before_data["iris"] = data["iris"]

            if alerts:
                with open("instance.json", "w") as json_file:
                    json.dump(before_data, json_file)  # save the updated instance counts

                message = ""
                for alert in alerts:
                    if (alert[1] - alert[2]) > 0:
                        message += f"{alert[0]} instances increased by {alert[1]-alert[2]}: {alert[2]} -> {alert[1]}\n"
                    else:
                        message += f"{alert[0]} instances decreased by {alert[2]-alert[1]}: {alert[2]} -> {alert[1]}\n"
                payload = {
                    "title": "Instance count change notification",
                    "text": message,
                }
                requests.post(WEBHOOK_URL, json=payload)

        except Exception as e:
            print(f"Error: {e}")

        # Collect data every minute
        time.sleep(60)
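
For reference, this is the rough shape of the instant-query response that the script indexes into with ["data"]["result"][0]["value"][1]; a sketch with made-up values, not output from a real server:

# Illustrative /api/v1/query response (values are invented):
sample = {
    "status": "success",
    "data": {
        "resultType": "vector",
        "result": [
            {
                "metric": {"instance": "api-client"},
                "value": [1716444000.0, "2"],  # [unix timestamp, value as a string]
            }
        ],
    },
}
# sample["data"]["result"][0]["value"][1] == "2"; Prometheus encodes sample
# values as strings, hence the int() casts in the script.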
13 changes: 13 additions & 0 deletions config/alertmanager/config.yml
@@ -0,0 +1,13 @@
global:
  resolve_timeout: 1m

receivers:
  - name: skkuding-msteams
    webhook_configs:
      - send_resolved: true
        url: 'http://prometheus-msteams:2000/alertmanager'

route:
  group_interval: 5m
  group_wait: 30s
  repeat_interval: 30s
  receiver: skkuding-msteams
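
For these alerts to reach this receiver, prometheus-config.yml (not shown in this diff) must load the rule files and point at Alertmanager. A minimal sketch, assuming the service names and mount paths used in docker-compose.yml:

rule_files:
  - /etc/prometheus/rules/*.yml

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']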
59 changes: 59 additions & 0 deletions config/prometheus/rules/cpu_rules.yml
@@ -0,0 +1,59 @@
groups:
  - name: cpu_alerts_per_container
    rules:
      - alert: HighCpuUsageClientAPIWarning
        expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-client-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-client-metric"}[5m]))) > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }} of Client API"
          description: 'CPU usage is above 80% for 1 minute (currently {{ $value | printf "%.2f" }}%)'
          value: '{{ $value | printf "%.2f" }}'

      - alert: HighCpuUsageClientAPICritical
        expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-client-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-client-metric"}[5m]))) > 90
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }} of Client API"
          description: 'CPU usage is above 90% for 1 minute (currently {{ $value | printf "%.2f" }}%)'
          value: '{{ $value | printf "%.2f" }}'

      - alert: HighCpuUsageAdminAPIWarning
        expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-admin-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-admin-metric"}[5m]))) > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }} of Admin API"
          description: 'CPU usage is above 80% for 1 minute (currently {{ $value | printf "%.2f" }}%)'
          value: '{{ $value | printf "%.2f" }}'

      - alert: HighCpuUsageAdminAPICritical
        expr: (sum by (instance) (avg_over_time(system_cpu_utilization{job="backend-admin-metric", system_cpu_state!="idle"}[5m])) + sum by (instance) (avg_over_time(process_cpu_utilization{job="backend-admin-metric"}[5m]))) > 90
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }} of Admin API"
          description: 'CPU usage is above 90% for 1 minute (currently {{ $value | printf "%.2f" }}%)'
          value: '{{ $value | printf "%.2f" }}'

  # - name: cpu_alerts_per_instance
  #   rules:
  #     - alert: HighCpuUsage
  #       expr: sum by (instance) (rate(process_cpu_seconds_total{job="backend-client-metric"}[5m])) > 0.8
  #       for: 1m
  #       labels:
  #         severity: warning
  #       annotations:
  #         summary: "High CPU usage detected on {{ $labels.instance }}"
  #         description: "CPU usage is above 80% for 1 minute (currently {{ $value }}%)"

  # - name: memory_alerts
  #   rules:
  #     - alert: HighMemoryUsage
  #       expr:
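
The rule file can be sanity-checked before deployment with promtool, which ships with Prometheus:

promtool check rules config/prometheus/rules/cpu_rules.yml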
40 changes: 40 additions & 0 deletions docker-compose.yml
@@ -85,11 +85,50 @@ services:
      - "9090:9090"
    volumes:
      - "$PWD/config/prometheus/prometheus-config.yml:/etc/prometheus/prometheus-config.yml"
      - "$PWD/config/prometheus/rules:/etc/prometheus/rules"
      - prometheus_data_volume:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus-config.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.enable-remote-write-receiver'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
    restart: always

  alertmanager:
    profiles: ["metric"]
    image: prom/alertmanager
    container_name: alertmanager
    volumes:
      - "$PWD/config/alertmanager/config.yml:/etc/alertmanager/config.yml"
    command:
      - '--config.file=/etc/alertmanager/config.yml'
    ports:
      - "9093:9093"
    restart: always

  prometheus-msteams:
    profiles: ["metric"]
    image: docker.io/bzon/prometheus-msteams:v1.1.4
    container_name: prometheus-msteams
    restart: always
    environment:
      - TEAMS_INCOMING_WEBHOOK_URL=${MS_WEBHOOK_URL}
      - TEAMS_REQUEST_URI=alertmanager
    expose:
      - "2000"

  check-instance:
    profiles: ["metric"]
    build: ./check-instance
    environment:
      - WEBHOOK_URL=${MS_WEBHOOK_URL}
    container_name: check-instance
    command: ["python", "check_instance.py"]
    volumes:
      - check_instance_data_volume:/app
    depends_on:
      - prometheus
    restart: always

  tempo:
@@ -130,3 +169,4 @@ volumes:
  grafana_storage: {}
  minio_data_volume:
  prometheus_data_volume:
  check_instance_data_volume:
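
Since all of the new services sit behind the metric profile, none of them start by default; they come up with, for example:

docker compose --profile metric up -d

One caveat worth noting: the named check_instance_data_volume mounted over /app persists instance.json across restarts, but a named volume is only seeded from the image when it is first created, so after rebuilding the image it can shadow an updated check_instance.py until the volume is removed.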