Add getAverageMemoryPowerUsage to the GPU device interface.

Summary of the hunks below:
  * common.py: declares getAverageMemoryPowerUsage as an abstract method
    (documented unit: mW).
  * amd.py: the AMD implementation always raises
    gpu_common.ZeusGPUNotSupportedError.
  * nvidia.py: the NVIDIA implementation queries NVML_FI_DEV_POWER_AVERAGE
    with NVML_POWER_SCOPE_MEMORY through nvmlDeviceGetFieldValues, raises
    pynvml.NVMLError on a non-success return code, and emits a warning when
    the reported value is 0. getInstantPowerUsage now returns
    metric.value.uiVal instead of metric.value.siVal. The `warnings` module
    is imported for the new warning.

NOTE(review): this patch was recovered from text whose line breaks had been
collapsed. Line boundaries were checked against the @@ hunk counts, but the
leading indentation inside hunk bodies is reconstructed -- confirm against
the target files before applying.

diff --git a/zeus/device/gpu/amd.py b/zeus/device/gpu/amd.py
index aa194bfc..34299e7d 100644
--- a/zeus/device/gpu/amd.py
+++ b/zeus/device/gpu/amd.py
@@ -233,6 +233,13 @@ def getInstantPowerUsage(self) -> int:
             amdsmi.amdsmi_get_power_info(self.handle)["average_socket_power"] * 1000
         )
 
+    @_handle_amdsmi_errors
+    def getAverageMemoryPowerUsage(self) -> int:
+        """Return the average power usage of the GPU's memory. Units: mW."""
+        raise gpu_common.ZeusGPUNotSupportedError(
+            "Average memory power usage is not supported on AMD GPUs."
+        )
+
     @_handle_amdsmi_errors
     def supportsGetTotalEnergyConsumption(self) -> bool:
         """Check if the GPU supports retrieving total energy consumption."""
diff --git a/zeus/device/gpu/common.py b/zeus/device/gpu/common.py
index 8571fd24..a84c05b7 100644
--- a/zeus/device/gpu/common.py
+++ b/zeus/device/gpu/common.py
@@ -101,6 +101,11 @@ def getInstantPowerUsage(self) -> int:
         """Return the current power draw of the GPU. Units: mW."""
         pass
 
+    @abc.abstractmethod
+    def getAverageMemoryPowerUsage(self) -> int:
+        """Return the average power usage of the GPU's memory. Units: mW."""
+        pass
+
     @abc.abstractmethod
     def supportsGetTotalEnergyConsumption(self) -> bool:
         """Check if the GPU supports retrieving total energy consumption."""
diff --git a/zeus/device/gpu/nvidia.py b/zeus/device/gpu/nvidia.py
index c5872f3f..134696de 100644
--- a/zeus/device/gpu/nvidia.py
+++ b/zeus/device/gpu/nvidia.py
@@ -2,8 +2,9 @@
 
 from __future__ import annotations
 
-import functools
 import os
+import warnings
+import functools
 import contextlib
 from pathlib import Path
 from typing import Sequence
@@ -196,7 +197,31 @@ def getInstantPowerUsage(self) -> int:
         )[0]
         if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
             raise pynvml.NVMLError(ret)
-        return metric.value.siVal
+        return metric.value.uiVal
+
+    @_handle_nvml_errors
+    def getAverageMemoryPowerUsage(self) -> int:
+        """Return the average power draw of the GPU's memory. Units: mW.
+
+        !!! Warning
+            This isn't exactly documented in NVML at the time of writing, but `nvidia-smi`
+            makes use of this API.
+
+            Confirmed working on H100 80GB HBM3. Confirmed not working on A40.
+        """
+        metric = pynvml.nvmlDeviceGetFieldValues(
+            self.handle,
+            [(pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_MEMORY)],
+        )[0]
+        if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
+            raise pynvml.NVMLError(ret)
+        power = metric.value.uiVal
+        if power == 0:
+            warnings.warn(
+                "Average memory power returned 0. The current GPU may not be supported.",
+                stacklevel=1,
+            )
+        return power
 
     @_handle_nvml_errors
     def supportsGetTotalEnergyConsumption(self) -> bool: