Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat/webhook #740

Merged
merged 5 commits into from
Nov 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
swankit==0.1.1b3
swanboard==0.1.4b2
swanboard==0.1.6
cos-python-sdk-v5
urllib3>=1.26.0
requests>=2.25.0
Expand Down
24 changes: 14 additions & 10 deletions swanlab/data/callback_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,20 @@
@Description:
基本回调函数注册表,此时不考虑云端情况
"""
from swankit.core import SwanLabSharedSettings
from swanlab.log import swanlog
from swanlab.data.run.main import get_run, SwanLabRunState
from swanlab.data.run.callback import SwanLabRunCallback
from swankit.callback import RuntimeInfo, MetricInfo
from swankit.log import FONT
from swanlab.env import SwanLabEnv
from datetime import datetime
import traceback
import json
import os
import sys
import traceback
from datetime import datetime

from swankit.callback import RuntimeInfo, MetricInfo
from swankit.core import SwanLabSharedSettings
from swankit.log import FONT

from swanlab.data.run.callback import SwanLabRunCallback
from swanlab.data.run.main import get_run, SwanLabRunState
from swanlab.env import SwanLabEnv
from swanlab.log import swanlog


class LocalRunCallback(SwanLabRunCallback):
Expand Down Expand Up @@ -55,6 +57,7 @@ def _init_logdir(logdir: str = None) -> str:
根据传入的logdir,初始化日志文件夹
FIXME shit code
"""
env_key = SwanLabEnv.SWANLOG_FOLDER.value
# 如果传入了logdir,则将logdir设置为环境变量,代表日志文件存放的路径
if logdir is not None:
try:
Expand All @@ -73,7 +76,6 @@ def _init_logdir(logdir: str = None) -> str:
raise ValueError("logdir must be a str.")
except IOError:
raise IOError("logdir must be a path and have Write permission.")
os.environ[SwanLabEnv.SWANLOG_FOLDER.value] = logdir
# 如果没有传入logdir,则使用默认的logdir, 即当前工作目录下的swanlog文件夹,但是需要保证目录存在
else:
logdir = os.environ.get(SwanLabEnv.SWANLOG_FOLDER.value) or os.path.join(os.getcwd(), "swanlog")
Expand All @@ -84,6 +86,8 @@ def _init_logdir(logdir: str = None) -> str:
raise IOError
except IOError:
raise IOError("logdir must have Write permission.")
# 同步环境变量
os.environ[env_key] = logdir
# 如果logdir是空的,创建.gitignore文件,写入*
if not os.listdir(logdir):
with open(os.path.join(logdir, ".gitignore"), "w", encoding="utf-8") as f:
Expand Down
18 changes: 6 additions & 12 deletions swanlab/data/run/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
回调函数操作员,批量处理回调函数的调用
"""
from typing import List, Union, Dict, Any, Tuple
from swankit.callback import SwanKitCallback, MetricInfo, ColumnInfo, OperateErrorInfo, RuntimeInfo

from swankit.callback import SwanKitCallback, MetricInfo, ColumnInfo, RuntimeInfo
from swankit.core import SwanLabSharedSettings
import swanlab.error as E
from swankit.log import FONT

from swanlab.data.run.webhook import try_send_webhook

OperatorReturnType = Dict[str, Any]

Expand Down Expand Up @@ -83,15 +84,8 @@ def before_init_experiment(
return self.__run_all("before_init_experiment", run_id, exp_name, description, num, colors)

def on_run(self):
try:
return self.__run_all("on_run")
except E.ApiError as e:
FONT.brush("", 50)
if e.resp.status_code == 409:
error = OperateErrorInfo("The experiment name already exists, please change the experiment name")
return self.__run_all("on_run_error_from_operator", error)
else:
raise e
self.__run_all("on_run")
try_send_webhook()

def on_runtime_info_update(self, r: RuntimeInfo):
return self.__run_all("on_runtime_info_update", r)
Expand Down
4 changes: 2 additions & 2 deletions swanlab/data/run/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
from .config import SwanLabConfig
from .exp import SwanLabExp
from .helper import SwanLabRunOperator, RuntimeInfo
from .metadata import get_requirements, get_metadata
from .public import SwanLabPublicConfig
from .system import get_system_info, get_requirements
from ..formater import check_key_format, check_exp_name_format, check_desc_format

MAX_LIST_LENGTH = 108
Expand Down Expand Up @@ -134,7 +134,7 @@ def _(state: SwanLabRunState):
self.__operator.on_runtime_info_update(
RuntimeInfo(
requirements=get_requirements(),
metadata=get_system_info(get_package_version(), self.__settings.log_dir),
metadata=get_metadata(self.__settings.log_dir),
)
)

Expand Down
30 changes: 30 additions & 0 deletions swanlab/data/run/metadata/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""
@author: cunyue
@file: __init__.py
@time: 2024/11/18 15:02
@description: 实验元信息采集
"""

from swanlab.data.run.metadata.cooperation import get_cooperation_info
from swanlab.data.run.metadata.hardware import get_hardware_info
from swanlab.data.run.metadata.requirements import get_requirements
from swanlab.data.run.metadata.runtime import get_runtime_info


def get_metadata(logdir: str):
    """
    Collect all metadata of the experiment into one dict.

    Cooperation info is gathered first; hardware and runtime info are then merged
    at the top level (runtime keys win on collision), and a ``swanlab`` entry is
    written last.

    :param logdir: log directory, stored as ``swanlab.logdir``
    :return: dict holding hardware info, runtime info and the ``swanlab`` entry
    """
    cooperation = get_cooperation_info()
    metadata = {}
    metadata.update(get_hardware_info())
    metadata.update(get_runtime_info())
    metadata["swanlab"] = {
        "version": cooperation["swanlab"]["version"],
        "logdir": logdir,
        "_coop": cooperation,
    }
    return metadata


__all__ = ["get_metadata", "get_requirements", "get_cooperation_info"]
36 changes: 36 additions & 0 deletions swanlab/data/run/metadata/coop/qing_cloud.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
@author: cunyue
@file: qing_cloud.py
@time: 2024/11/18 15:14
@description: 青云(https://www.qingcloud.com/)元信息采集
"""

import os


# Environment variable names whose values get_qing_cloud_info places at the top level of its result.
BASE_KEYS = ['AICP_PLATFORM', 'AICP_TYPE', 'AICP_NAME', 'AICP_USER_NAME']
# Environment variable names whose values get_qing_cloud_info groups under the "resources" key.
RESOURCES_KEYS = [
    'AICP_SPEC_COUNT',
    'AICP_SPEC_GPU',
    'AICP_SPEC_CPU',
    'AICP_SPEC_MEMORY',
    'AICP_SPEC_GPU_NAME',
    'AICP_SPEC_GPU_TYPE',
    'AICP_SPEC_GPU_MEMORY',
    'AICP_HOSTNAME',
    'AICP_HOST_MACHINE',
]


def get_qing_cloud_info():
    """
    Collect QingCloud metadata from environment variables.

    :return: None when ``AICP_PLATFORM`` is unset or empty; otherwise a dict with
        the BASE_KEYS values at the top level and the RESOURCES_KEYS values under
        ``"resources"``
    """
    if not os.getenv("AICP_PLATFORM"):
        return None
    collected = dict(get_envs_by_keys(BASE_KEYS))
    collected["resources"] = get_envs_by_keys(RESOURCES_KEYS)
    return collected


def get_envs_by_keys(keys: list):
    """
    Look up environment variables by name.

    :param keys: environment variable names
    :return: dict mapping each lower-cased name to the variable's value
        (None when the variable is not set)
    """
    values = {}
    for name in keys:
        values[name.lower()] = os.getenv(name)
    return values
36 changes: 36 additions & 0 deletions swanlab/data/run/metadata/cooperation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
@author: cunyue
@file: cooperation.py
@time: 2024/11/18 15:13
@description: swanlab官方合作信息
"""

import os

from swanlab.api import get_http
from swanlab.data.run.metadata.coop.qing_cloud import get_qing_cloud_info
from swanlab.env import SwanLabEnv
from swanlab.package import get_experiment_url
from swanlab.package import get_package_version


def get_cooperation_info():
    """
    Gather cooperation info.

    :return: dict that always has a ``"swanlab"`` entry and additionally a
        ``"qing_cloud"`` entry when get_qing_cloud_info returns a truthy value
    """
    qing_cloud_info = get_qing_cloud_info()
    result = {"swanlab": get_swanlab_info()}
    if not qing_cloud_info:
        return result
    result["qing_cloud"] = qing_cloud_info
    return result


def get_swanlab_info():
    """
    Build the swanlab part of the cooperation info.

    :return: dict with ``version``, ``mode`` and ``swanlog_dir``; ``exp_url`` is
        added only when it can be resolved without a ValueError being raised
    """
    info = dict(
        version=get_package_version(),
        mode=os.getenv(SwanLabEnv.MODE.value),
        swanlog_dir=os.getenv(SwanLabEnv.SWANLOG_FOLDER.value),
    )
    try:
        client = get_http()
        info["exp_url"] = get_experiment_url(client.username, client.projname, client.exp_id)
    except ValueError:
        # exp_url is simply left out; presumably no http client exists in this mode - TODO confirm
        pass
    return info
173 changes: 173 additions & 0 deletions swanlab/data/run/metadata/hardware.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
"""
@author: cunyue
@file: hardware.py
@time: 2024/11/18 15:12
@description: 硬件信息采集
"""

import json
import multiprocessing
import platform
import subprocess

import psutil
import pynvml


def get_hardware_info():
    """
    Collect hardware information: memory, CPU, NVIDIA GPU and Apple chip.

    :return: dict with the keys ``memory``, ``cpu``, ``gpu`` (holding ``nvidia``)
        and ``soc`` (holding ``apple``)
    """
    memory = get_memory_size()
    cpu = get_cpu_info()
    nvidia = get_nvidia_gpu_info()
    apple = get_apple_chip_info()
    return {
        "memory": memory,
        "cpu": cpu,
        "gpu": {"nvidia": nvidia},
        "soc": {"apple": apple},
    }


# ---------------------------------- cpu信息 ----------------------------------


def get_cpu_info():
    """
    Get CPU information.

    :return: dict with ``brand`` and ``cores`` on Windows and Linux; None on any
        other system (Apple chips are handled separately)
    """
    system_name = platform.system()
    # Pick the brand lookup that matches the operating system
    if system_name == "Windows":
        brand = get_cpu_brand_windows()
    elif system_name == "Linux":
        brand = get_cpu_brand_linux()
    else:
        # Other systems are not supported for now
        return None

    # Number of CPU cores; stays None when it cannot be determined
    try:
        cores = multiprocessing.cpu_count()
    except Exception:  # noqa
        cores = None

    return {"brand": brand, "cores": cores}


def get_cpu_brand_windows():
    """
    Get the CPU brand on Windows through the ``wmic cpu get name`` command.

    :return: the last non-empty value line of the command output, or None when
        the command cannot be run or prints no value (only the header or nothing)
    """
    try:
        result = subprocess.run(["wmic", "cpu", "get", "name"], capture_output=True, text=True)
        # Keep only non-blank lines so trailing blank lines never become the result
        lines = [line.strip() for line in result.stdout.splitlines() if line.strip()]
        # The first line is the column header; it is not a brand
        if lines and lines[0].lower() == "name":
            lines = lines[1:]
        if not lines:
            return None
        return lines[-1]
    except Exception:  # noqa
        # Best effort: a missing or failing wmic must not break metadata collection
        return None


def get_cpu_brand_linux():
    """
    Get the CPU brand on Linux through the ``lscpu`` command.

    :return: the value of the ``Model name`` field, or None when the command
        cannot be run, the field is missing, or its value is empty
    """
    import os

    try:
        # Force the C locale for the child process: lscpu translates its field
        # labels, so "Model name" would not be found under other locales.
        env = dict(os.environ, LC_ALL="C")
        result = subprocess.run(["lscpu"], capture_output=True, text=True, env=env)
        for line in result.stdout.splitlines():
            # Split on the first colon only so a colon inside the value is preserved
            label, sep, value = line.partition(":")
            # Exact label match, so that e.g. "BIOS Model name" is not picked up
            if sep and label.strip() == "Model name":
                return value.strip() or None
        return None
    except Exception:  # noqa
        # Best effort: a missing or failing lscpu must not break metadata collection
        return None


# ---------------------------------- 内存信息 ----------------------------------


def get_memory_size():
    """
    Get the total system memory.

    :return: total memory rounded to whole GB (1024**3 bytes), or None when it
        cannot be read
    """
    bytes_per_gb = 1024**3
    try:
        return round(psutil.virtual_memory().total / bytes_per_gb)
    except Exception:  # noqa
        return None


# ---------------------------------- gpu信息 ----------------------------------


def get_nvidia_gpu_info():
    """
    Get NVIDIA GPU information through pynvml.

    :return: None when NVML cannot be initialised; otherwise a dict with
        ``driver``, ``cores`` (number of GPUs), ``type`` (GPU names), ``memory``
        (total memory per GPU in GB) and ``cuda``. When an NVMLError occurs midway,
        the fields collected so far are returned.
    """

    def read_cuda_release():
        """Read the CUDA version from the ``release`` line of ``nvcc --version``."""
        try:
            nvcc_text = subprocess.check_output(["nvcc", "--version"]).decode("utf-8")
            for row in nvcc_text.split("\n"):
                if "release" not in row:
                    continue
                # Take the token right after "release" and drop its last character
                return row.split("release")[-1].strip().split(" ")[0][:-1]
        except Exception:  # noqa
            return None

    try:
        pynvml.nvmlInit()
    except Exception:  # noqa
        return None

    gpu_info = {"driver": None, "cores": None, "type": [], "memory": [], "cuda": None}
    bytes_per_gb = 1024**3
    try:
        # NVIDIA driver version; may be bytes or str
        driver_version = pynvml.nvmlSystemGetDriverVersion()
        if isinstance(driver_version, bytes):
            driver_version = driver_version.decode("utf-8")
        gpu_info["driver"] = driver_version

        # CUDA version
        gpu_info["cuda"] = read_cuda_release()

        # Number of NVIDIA GPUs
        device_count = pynvml.nvmlDeviceGetCount()
        gpu_info["cores"] = device_count

        # Walk over every GPU and record its name and total memory
        for index in range(device_count):
            device = pynvml.nvmlDeviceGetHandleByIndex(index)
            name = pynvml.nvmlDeviceGetName(device)  # types: bytes | str
            if isinstance(name, bytes):  # Fix for early pynvml versions, related issue: #605
                name = name.decode("utf-8")
            gpu_info["type"].append(name)
            # Total GPU memory, in GB
            total_bytes = pynvml.nvmlDeviceGetMemoryInfo(device).total
            gpu_info["memory"].append(round(total_bytes / bytes_per_gb))
    except pynvml.NVMLError:
        pass
    finally:
        # Shut NVML down
        pynvml.nvmlShutdown()
    return gpu_info


# ---------------------------------- apple信息 ----------------------------------


def get_apple_chip_info():
    """
    Get Apple chip information through ``system_profiler SPHardwareDataType -json``.

    :return: None when the platform string does not contain "mac" or the hardware
        data cannot be read (command failure, unexpected JSON, missing
        ``chip_type``/``physical_memory``); otherwise a dict with ``cpu`` (CPU
        count), ``gpu`` (always None for now), ``memory`` (physical memory as a
        string with the "gb" unit removed) and ``type`` (chip name)
    """
    if "mac" not in platform.platform().lower():
        return None
    info = {"cpu": None, "gpu": None, "memory": None, "type": None}

    # Query the hardware data in JSON format
    try:
        result = subprocess.run(["system_profiler", "SPHardwareDataType", "-json"], capture_output=True, text=True)
        # Parse the output once and reuse the first hardware entry
        hardware = json.loads(result.stdout)["SPHardwareDataType"][0]
        chip_type = hardware["chip_type"]
        memory = hardware["physical_memory"]
        # Drop the unit and the whitespace left around the number
        memory = str(memory).lower().replace("gb", "").strip()
        # TODO: get GPU information
        info["type"] = chip_type
        info["memory"] = memory
    except Exception:  # noqa
        return None
    try:
        info["cpu"] = multiprocessing.cpu_count()
    except Exception:  # noqa
        pass
    return info
Loading