SwanHubX · SAKURA-CAT · Nov 24, 2024 · Nov 18, 2024 · Nov 18, 2024 · Nov 18, 2024
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 swankit==0.1.1b3
-swanboard==0.1.4b2
+swanboard==0.1.6
 cos-python-sdk-v5
 urllib3>=1.26.0
 requests>=2.25.0

diff --git a/swanlab/data/callback_local.py b/swanlab/data/callback_local.py
@@ -7,18 +7,20 @@
 @Description:
     基本回调函数注册表，此时不考虑云端情况
 """
-from swankit.core import SwanLabSharedSettings
-from swanlab.log import swanlog
-from swanlab.data.run.main import get_run, SwanLabRunState
-from swanlab.data.run.callback import SwanLabRunCallback
-from swankit.callback import RuntimeInfo, MetricInfo
-from swankit.log import FONT
-from swanlab.env import SwanLabEnv
-from datetime import datetime
-import traceback
 import json
 import os
 import sys
+import traceback
+from datetime import datetime
+
+from swankit.callback import RuntimeInfo, MetricInfo
+from swankit.core import SwanLabSharedSettings
+from swankit.log import FONT
+
+from swanlab.data.run.callback import SwanLabRunCallback
+from swanlab.data.run.main import get_run, SwanLabRunState
+from swanlab.env import SwanLabEnv
+from swanlab.log import swanlog
 
 
 class LocalRunCallback(SwanLabRunCallback):
@@ -55,6 +57,7 @@ def _init_logdir(logdir: str = None) -> str:
         根据传入的logdir，初始化日志文件夹
         FIXME shit code
         """
+        env_key = SwanLabEnv.SWANLOG_FOLDER.value
         # 如果传入了logdir，则将logdir设置为环境变量，代表日志文件存放的路径
         if logdir is not None:
             try:
@@ -73,7 +76,6 @@ def _init_logdir(logdir: str = None) -> str:
                 raise ValueError("logdir must be a str.")
             except IOError:
                 raise IOError("logdir must be a path and have Write permission.")
-            os.environ[SwanLabEnv.SWANLOG_FOLDER.value] = logdir
         # 如果没有传入logdir，则使用默认的logdir, 即当前工作目录下的swanlog文件夹，但是需要保证目录存在
         else:
             logdir = os.environ.get(SwanLabEnv.SWANLOG_FOLDER.value) or os.path.join(os.getcwd(), "swanlog")
@@ -84,6 +86,8 @@ def _init_logdir(logdir: str = None) -> str:
                     raise IOError
             except IOError:
                 raise IOError("logdir must have Write permission.")
+        # 同步环境变量
+        os.environ[env_key] = logdir
         # 如果logdir是空的，创建.gitignore文件，写入*
         if not os.listdir(logdir):
             with open(os.path.join(logdir, ".gitignore"), "w", encoding="utf-8") as f:

diff --git a/swanlab/data/run/helper.py b/swanlab/data/run/helper.py
@@ -8,10 +8,11 @@
     回调函数操作员，批量处理回调函数的调用
 """
 from typing import List, Union, Dict, Any, Tuple
-from swankit.callback import SwanKitCallback, MetricInfo, ColumnInfo, OperateErrorInfo, RuntimeInfo
+
+from swankit.callback import SwanKitCallback, MetricInfo, ColumnInfo, RuntimeInfo
 from swankit.core import SwanLabSharedSettings
-import swanlab.error as E
-from swankit.log import FONT
+
+from swanlab.data.run.webhook import try_send_webhook
 
 OperatorReturnType = Dict[str, Any]
 
@@ -83,15 +84,8 @@ def before_init_experiment(
         return self.__run_all("before_init_experiment", run_id, exp_name, description, num, colors)
 
     def on_run(self):
-        try:
-            return self.__run_all("on_run")
-        except E.ApiError as e:
-            FONT.brush("", 50)
-            if e.resp.status_code == 409:
-                error = OperateErrorInfo("The experiment name already exists, please change the experiment name")
-                return self.__run_all("on_run_error_from_operator", error)
-            else:
-                raise e
+        self.__run_all("on_run")
+        try_send_webhook()
 
     def on_runtime_info_update(self, r: RuntimeInfo):
         return self.__run_all("on_runtime_info_update", r)

diff --git a/swanlab/data/run/main.py b/swanlab/data/run/main.py
@@ -22,8 +22,8 @@
 from .config import SwanLabConfig
 from .exp import SwanLabExp
 from .helper import SwanLabRunOperator, RuntimeInfo
+from .metadata import get_requirements, get_metadata
 from .public import SwanLabPublicConfig
-from .system import get_system_info, get_requirements
 from ..formater import check_key_format, check_exp_name_format, check_desc_format
 
 MAX_LIST_LENGTH = 108
@@ -134,7 +134,7 @@ def _(state: SwanLabRunState):
         self.__operator.on_runtime_info_update(
             RuntimeInfo(
                 requirements=get_requirements(),
-                metadata=get_system_info(get_package_version(), self.__settings.log_dir),
+                metadata=get_metadata(self.__settings.log_dir),
             )
         )
 

diff --git a/swanlab/data/run/metadata/__init__.py b/swanlab/data/run/metadata/__init__.py
@@ -0,0 +1,30 @@
+"""
+@author: cunyue
+@file: __init__.py
+@time: 2024/11/18 15:02
+@description: 实验元信息采集
+"""
+
+from swanlab.data.run.metadata.cooperation import get_cooperation_info
+from swanlab.data.run.metadata.hardware import get_hardware_info
+from swanlab.data.run.metadata.requirements import get_requirements
+from swanlab.data.run.metadata.runtime import get_runtime_info
+
+
+def get_metadata(logdir: str):
+    """
+    Collect all metadata recorded for an experiment.
+
+    The hardware info and the runtime info are merged at the top level, and a
+    "swanlab" entry is added that holds the version taken from the cooperation
+    info, the given log directory and the full cooperation info.
+
+    :param logdir: log directory stored under the "swanlab" entry
+    :return: dict with the merged metadata
+    """
+    cooperation = get_cooperation_info()
+    metadata = {}
+    metadata.update(get_hardware_info())
+    metadata.update(get_runtime_info())
+    metadata["swanlab"] = {
+        "version": cooperation["swanlab"]["version"],
+        "logdir": logdir,
+        "_coop": cooperation,
+    }
+    return metadata
+
+
+# Public names exported by this package.
+__all__ = [
+    "get_metadata",
+    "get_requirements",
+    "get_cooperation_info",
+]
diff --git a/swanlab/data/run/metadata/coop/qing_cloud.py b/swanlab/data/run/metadata/coop/qing_cloud.py
@@ -0,0 +1,36 @@
+"""
+@author: cunyue
+@file: qing_cloud.py
+@time: 2024/11/18 15:14
+@description: 青云(https://www.qingcloud.com/)元信息采集
+"""
+
+import os
+
+
+# Environment variables copied to the top level of the QingCloud info dict.
+BASE_KEYS = [
+    "AICP_PLATFORM",
+    "AICP_TYPE",
+    "AICP_NAME",
+    "AICP_USER_NAME",
+]
+# Environment variables collected under the "resources" entry of that dict.
+RESOURCES_KEYS = [
+    "AICP_SPEC_COUNT",
+    "AICP_SPEC_GPU",
+    "AICP_SPEC_CPU",
+    "AICP_SPEC_MEMORY",
+    "AICP_SPEC_GPU_NAME",
+    "AICP_SPEC_GPU_TYPE",
+    "AICP_SPEC_GPU_MEMORY",
+    "AICP_HOSTNAME",
+    "AICP_HOST_MACHINE",
+]
+
+
+def get_qing_cloud_info():
+    """
+    Collect QingCloud (https://www.qingcloud.com/) metadata from the environment.
+
+    :return: None when the AICP_PLATFORM environment variable is unset or
+        empty; otherwise a dict with the BASE_KEYS values at the top level and
+        the RESOURCES_KEYS values under "resources"
+    """
+    if not os.getenv("AICP_PLATFORM"):
+        return None
+    info = get_envs_by_keys(BASE_KEYS)
+    info["resources"] = get_envs_by_keys(RESOURCES_KEYS)
+    return info
+
+
+def get_envs_by_keys(keys: list):
+    """
+    Read the environment variables named in keys.
+
+    :param keys: names of the environment variables to read
+    :return: dict mapping each name in lower case to the variable's value, or
+        to None when the variable is not set
+    """
+    envs = {}
+    for name in keys:
+        envs[name.lower()] = os.getenv(name)
+    return envs
diff --git a/swanlab/data/run/metadata/cooperation.py b/swanlab/data/run/metadata/cooperation.py
@@ -0,0 +1,36 @@
+"""
+@author: cunyue
+@file: cooperation.py
+@time: 2024/11/18 15:13
+@description: swanlab官方合作信息
+"""
+
+import os
+
+from swanlab.api import get_http
+from swanlab.data.run.metadata.coop.qing_cloud import get_qing_cloud_info
+from swanlab.env import SwanLabEnv
+from swanlab.package import get_experiment_url
+from swanlab.package import get_package_version
+
+
+def get_cooperation_info():
+    """
+    Gather the cooperation info.
+
+    :return: dict that always contains a "swanlab" entry and additionally a
+        "qing_cloud" entry when QingCloud info is available
+    """
+    qing_cloud_info = get_qing_cloud_info()
+    result = {"swanlab": get_swanlab_info()}
+    if qing_cloud_info:
+        result["qing_cloud"] = qing_cloud_info
+    return result
+
+
+def get_swanlab_info():
+    """
+    Collect swanlab's own info: package version, mode and log folder.
+
+    The experiment URL is added under "exp_url" when it can be obtained. A
+    ValueError raised while obtaining it is ignored and the key is left out.
+
+    :return: dict with "version", "mode", "swanlog_dir" and optionally "exp_url"
+    """
+    info = dict(
+        version=get_package_version(),
+        mode=os.getenv(SwanLabEnv.MODE.value),
+        swanlog_dir=os.getenv(SwanLabEnv.SWANLOG_FOLDER.value),
+    )
+    try:
+        client = get_http()
+        exp_url = get_experiment_url(client.username, client.projname, client.exp_id)
+    except ValueError:
+        return info
+    info["exp_url"] = exp_url
+    return info
diff --git a/swanlab/data/run/metadata/hardware.py b/swanlab/data/run/metadata/hardware.py
@@ -0,0 +1,173 @@
+"""
+@author: cunyue
+@file: hardware.py
+@time: 2024/11/18 15:12
+@description: 硬件信息采集
+"""
+
+import json
+import multiprocessing
+import platform
+import subprocess
+
+import psutil
+import pynvml
+
+
+def get_hardware_info():
+    """
+    Collect hardware info: memory, CPU, NVIDIA GPU and Apple chip.
+
+    :return: dict with "memory", "cpu", "gpu" (holding "nvidia") and "soc"
+        (holding "apple")
+    """
+    memory = get_memory_size()
+    cpu = get_cpu_info()
+    nvidia = get_nvidia_gpu_info()
+    apple = get_apple_chip_info()
+    return {
+        "memory": memory,
+        "cpu": cpu,
+        "gpu": {"nvidia": nvidia},
+        "soc": {"apple": apple},
+    }
+
+
+# ---------------------------------- cpu信息 ----------------------------------
+
+
+def get_cpu_info():
+    """
+    Get CPU info.
+
+    :return: dict with "brand" and "cores" on Windows and Linux, None on any
+        other system
+    """
+    system = platform.system()
+    if system == "Windows":
+        brand = get_cpu_brand_windows()
+    elif system == "Linux":
+        brand = get_cpu_brand_linux()
+    else:
+        # Other systems are not supported for now; Apple chips are handled separately
+        return None
+
+    cores = None
+    try:
+        cores = multiprocessing.cpu_count()
+    except Exception:  # noqa
+        # best effort: leave the core count as None
+        pass
+    return {"brand": brand, "cores": cores}
+
+
+def get_cpu_brand_windows():
+    """
+    Get the CPU brand on Windows through the `wmic cpu get name` command.
+
+    :return: the last non-empty output line other than the "Name" column
+        header, or None when the command cannot be run or prints no CPU name
+    """
+    try:
+        result = subprocess.run(["wmic", "cpu", "get", "name"], capture_output=True, text=True)
+    except Exception:  # noqa
+        # best effort: the command may be missing or not runnable
+        return None
+    lines = [line.strip() for line in result.stdout.splitlines()]
+    # drop blank lines and the column header so they are never reported as the brand
+    names = [line for line in lines if line and line.lower() != "name"]
+    return names[-1] if names else None
+
+
+def get_cpu_brand_linux():
+    """
+    Get the CPU brand on Linux from the "Model name:" line of `lscpu`.
+
+    :return: the model name, or None when lscpu cannot be run or prints no
+        such line
+    """
+    try:
+        result = subprocess.run(["lscpu"], capture_output=True, text=True)
+    except Exception:  # noqa
+        # best effort: lscpu may be missing or not runnable
+        return None
+    prefix = "Model name:"
+    for line in result.stdout.splitlines():
+        line = line.strip()
+        # startswith keeps lines such as "BIOS Model name:" from matching
+        if line.startswith(prefix):
+            # slice off the prefix so a colon inside the model name is kept
+            return line[len(prefix):].strip() or None
+    return None
+
+
+# ---------------------------------- 内存信息 ----------------------------------
+
+
+def get_memory_size():
+    """
+    Get the total system memory.
+
+    :return: total memory in GB (1024**3 bytes) rounded to an integer, or None
+        when it cannot be read
+    """
+    bytes_per_gb = 1024**3
+    try:
+        return round(psutil.virtual_memory().total / bytes_per_gb)
+    except Exception:  # noqa
+        return None
+
+
+# ---------------------------------- gpu信息 ----------------------------------
+
+
+def get_nvidia_gpu_info():
+    """
+    Get NVIDIA GPU info through NVML.
+
+    Collection is best effort: when a query fails, the fields gathered so far
+    are returned.
+
+    :return: None when NVML cannot be initialised; otherwise a dict with
+        "driver" (driver version), "cuda" (version parsed from `nvcc --version`),
+        "cores" (number of GPUs), "type" (name of each GPU) and "memory"
+        (total memory of each GPU in GB, rounded)
+    """
+
+    def get_cuda_version():
+        """Get the CUDA version from the "release" line of `nvcc --version`, or None."""
+        try:
+            output = subprocess.check_output(["nvcc", "--version"]).decode("utf-8")
+        except Exception:  # noqa
+            return None
+        for line in output.split("\n"):
+            if "release" in line:
+                # take the token right after "release" and drop its last
+                # character (presumably a trailing comma)
+                return line.split("release")[-1].strip().split(" ")[0][:-1]
+        return None
+
+    info = {"driver": None, "cores": None, "type": [], "memory": [], "cuda": None}
+    try:
+        pynvml.nvmlInit()
+    except Exception:  # noqa
+        return None
+
+    try:
+        # NVIDIA driver version
+        nv_driver = pynvml.nvmlSystemGetDriverVersion()
+        if isinstance(nv_driver, bytes):
+            nv_driver = nv_driver.decode("utf-8")
+        info["driver"] = nv_driver
+
+        # CUDA version
+        info["cuda"] = get_cuda_version()
+
+        # number of NVIDIA GPUs
+        info["cores"] = pynvml.nvmlDeviceGetCount()
+        # query every GPU
+        for i in range(info["cores"]):
+            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+            # GPU model name
+            gpu_name = pynvml.nvmlDeviceGetName(handle)  # types: bytes | str
+            if isinstance(gpu_name, bytes):  # fix for early pynvml versions, related issue: #605
+                gpu_name = gpu_name.decode("utf-8")
+            info["type"].append(gpu_name)
+            # total GPU memory in GB
+            info["memory"].append(round(pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**3)))
+    except Exception:  # noqa
+        # best effort: keep whatever was collected before the failure.
+        # Unlike a `return` inside `finally`, this lets KeyboardInterrupt and
+        # SystemExit propagate.
+        pass
+    finally:
+        # shut NVML down; a failing shutdown must not break metadata collection
+        try:
+            pynvml.nvmlShutdown()
+        except Exception:  # noqa
+            pass
+    return info
+
+
+# ---------------------------------- apple信息 ----------------------------------
+
+
+def get_apple_chip_info():
+    """
+    Get Apple chip info from `system_profiler SPHardwareDataType -json`.
+
+    :return: None when the platform string does not contain "mac" or when the
+        profiler output cannot be read; otherwise a dict with "type" (the
+        chip_type field), "memory" (the physical_memory field, lower-cased,
+        with "gb" removed and surrounding whitespace stripped), "cpu" (cpu
+        count, None if unavailable) and "gpu" (not collected yet, always None)
+    """
+    if "mac" not in platform.platform().lower():
+        return None
+    info = {"cpu": None, "gpu": None, "memory": None, "type": None}
+
+    # query the hardware overview in JSON format
+    try:
+        result = subprocess.run(["system_profiler", "SPHardwareDataType", "-json"], capture_output=True, text=True)
+        # parse the output once and reuse it for every field
+        hardware = json.loads(result.stdout)["SPHardwareDataType"][0]
+        chip_type = hardware["chip_type"]
+        # strip the whitespace left behind once the unit is removed,
+        # so that a value like "16 gb" becomes "16" instead of "16 "
+        memory = str(hardware["physical_memory"]).lower().replace("gb", "").strip()
+        # TODO: collect GPU info
+        info["type"] = chip_type
+        info["memory"] = memory
+    except Exception:  # noqa
+        return None
+    try:
+        info["cpu"] = multiprocessing.cpu_count()
+    except Exception:  # noqa
+        pass
+    return info