Feat: getAverageMemoryPowerUsage (#122)

ml-energy · Sep 10, 2024 · fcd6abc · fcd6abc
1 parent 422c0d2
commit fcd6abc
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 2 deletions.
diff --git a/zeus/device/gpu/amd.py b/zeus/device/gpu/amd.py
@@ -233,6 +233,13 @@ def getInstantPowerUsage(self) -> int:
             amdsmi.amdsmi_get_power_info(self.handle)["average_socket_power"] * 1000
         )
 
+    @_handle_amdsmi_errors
+    def getAverageMemoryPowerUsage(self) -> int:
+        """Return the average power usage of the GPU's memory. Units: mW.
+
+        This backend does not provide the metric; the method raises
+        unconditionally and never returns a value.
+
+        Raises:
+            gpu_common.ZeusGPUNotSupportedError: Always.
+        """
+        message = "Average memory power usage is not supported on AMD GPUs."
+        raise gpu_common.ZeusGPUNotSupportedError(message)
+
     @_handle_amdsmi_errors
     def supportsGetTotalEnergyConsumption(self) -> bool:
         """Check if the GPU supports retrieving total energy consumption."""

diff --git a/zeus/device/gpu/common.py b/zeus/device/gpu/common.py
@@ -101,6 +101,11 @@ def getInstantPowerUsage(self) -> int:
         """Return the current power draw of the GPU. Units: mW."""
         pass
 
+    @abc.abstractmethod
+    def getAverageMemoryPowerUsage(self) -> int:
+        """Return the average power usage of the GPU's memory. Units: mW.
+
+        Abstract: each vendor backend supplies its own implementation.
+        NOTE(review): support varies by backend and device -- callers should
+        presumably be prepared for a "not supported" error; confirm against
+        the concrete implementations.
+        """
+        pass
+
     @abc.abstractmethod
     def supportsGetTotalEnergyConsumption(self) -> bool:
         """Check if the GPU supports retrieving total energy consumption."""

diff --git a/zeus/device/gpu/nvidia.py b/zeus/device/gpu/nvidia.py
@@ -2,8 +2,9 @@
 
 from __future__ import annotations
 
-import functools
 import os
+import warnings
+import functools
 import contextlib
 from pathlib import Path
 from typing import Sequence
@@ -196,7 +197,31 @@ def getInstantPowerUsage(self) -> int:
         )[0]
         if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
             raise pynvml.NVMLError(ret)
-        return metric.value.siVal
+        return metric.value.uiVal
+
+    @_handle_nvml_errors
+    def getAverageMemoryPowerUsage(self) -> int:
+        """Return the average power usage of the GPU's memory. Units: mW.
+
+        Queries the `NVML_FI_DEV_POWER_AVERAGE` field with the memory power
+        scope through `nvmlDeviceGetFieldValues` and returns its unsigned
+        integer value.
+
+        !!! Warning
+            This isn't exactly documented in NVML at the time of writing, but `nvidia-smi`
+            makes use of this API.
+
+            Confirmed working on H100 80GB HBM3. Confirmed not working on A40.
+
+        Returns:
+            The reported average memory power. A value of 0 is returned as-is,
+            after emitting a warning that the GPU may not be supported.
+
+        Raises:
+            pynvml.NVMLError: If the field query reports a status other than
+                `NVML_SUCCESS`. (NOTE(review): `_handle_nvml_errors` may
+                translate this into another exception type -- confirm.)
+        """
+        readings = pynvml.nvmlDeviceGetFieldValues(
+            self.handle,
+            [(pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_MEMORY)],
+        )
+        reading = readings[0]
+
+        # Each field value carries its own status code; surface failures.
+        status = reading.nvmlReturn
+        if status != pynvml.NVML_SUCCESS:
+            raise pynvml.NVMLError(status)
+
+        memory_power = reading.value.uiVal
+        if memory_power != 0:
+            return memory_power
+
+        # A zero reading is still returned, but flagged: it may mean the
+        # device does not actually report this metric.
+        warnings.warn(
+            "Average memory power returned 0. The current GPU may not be supported.",
+            stacklevel=1,
+        )
+        return memory_power
 
     @_handle_nvml_errors
     def supportsGetTotalEnergyConsumption(self) -> bool: