From 78106db9596161ddd0a1277aff7ce3c42da26380 Mon Sep 17 00:00:00 2001
From: Wonbin Jin <116508975+wbjin@users.noreply.github.com>
Date: Tue, 10 Sep 2024 06:37:19 +0900
Subject: [PATCH] Docs: CPU energy measurement with RAPL  (#118)

Co-authored-by: Jae-Won Chung <jwnchung@umich.edu>
---
 docs/getting_started/index.md | 82 +++++++++++++++++++++++++++--------
 docs/measure/index.md         | 38 ++++++++++++++++
 zeus/monitor/energy.py        |  4 +-
 3 files changed, 105 insertions(+), 19 deletions(-)

diff --git a/docs/getting_started/index.md b/docs/getting_started/index.md
index a4e84bbc..45da1a10 100644
--- a/docs/getting_started/index.md
+++ b/docs/getting_started/index.md
@@ -47,20 +47,21 @@ The default command would be:
 
 ``` { .sh .annotate }
 docker run -it \
-    --gpus all \                 # (1)!
-    --cap-add SYS_ADMIN \       # (2)!
-    --ipc host \               # (3)!
+    --gpus all \              # (1)!
+    --cap-add SYS_ADMIN \   # (2)!
+    --ipc host \          # (3)!
+    -v /sys/class/powercap/intel-rapl:/zeus_sys/class/powercap/intel-rapl \ # (4)!
     mlenergy/zeus:latest \
     bash
 ```
 
-1. Mounts all GPUs into the Docker container.
-2. `SYS_ADMIN` capability is needed to change the GPU's power limit or frequency. See [here](#system-privileges).
+1. Mounts all GPUs into the Docker container. See [Docker docs](https://docs.docker.com/engine/containers/resource_constraints/#expose-gpus-for-use) for more about the `--gpus` argument.
+2. The `SYS_ADMIN` Linux security capability is needed to change the GPU's power limit or frequency. See [here](#system-privileges) for details and alternatives.
 3. PyTorch DataLoader workers need enough shared memory for IPC. Without this, they may run out of shared memory and die.
+4. Zeus reads Intel RAPL metrics for CPU/DRAM energy measurement through the `sysfs` interface. Docker disables this by default, so we need to mount it into the container separately (under `/zeus_sys`).
 
-!!! Tip "Overriding Zeus installation"
-    Inside the container, `zeus`'s installation is editable (`pip install -e`).
-    So, you can mount your locally modified Zeus repository into the right path in the container (`-v /path/to/zeus:/workspace/zeus`), and your modifications will automatically be applied without you having to run `pip install` again.
+In particular, `--cap-add SYS_ADMIN` is needed to change the GPU's power limit or frequency, and `-v /sys/class/powercap/intel-rapl:/zeus_sys/class/powercap/intel-rapl` is needed to measure CPU/DRAM energy via Intel RAPL.
+See [System privileges](#system-privileges) for details.
 
 ### Pulling from Docker Hub
 
@@ -81,20 +82,64 @@ cd zeus
 docker build -t mlenergy/zeus:master --build-arg TARGETARCH=amd64 -f docker/Dockerfile .
 ```
 
-## System privileges
+## Verifying installation
+
+After installing the Zeus package, you can run the following to see whether packages and hardware are properly detected by Zeus.
+
+```console
+$ python -m zeus.show_env
+================================================================================
+
+Python version: 3.9.19
+
+================================================================================
+
+[2024-09-09 16:40:14,495] [zeus.utils.framework](framework.py:25) PyTorch with CUDA support is available.
+[2024-09-09 16:40:14,496] [zeus.utils.framework](framework.py:45) JAX is not available
+
+Package availability and versions:
+  Zeus: 0.10.0
+  PyTorch: 2.4.1+cu121
+  JAX: not available
+
+================================================================================
+
+[2024-09-09 16:40:14,512] [zeus.device.gpu.nvidia](nvidia.py:46) pynvml is available and initialized.
 
-!!! Important "Nevermind if you're just measuring GPU energy"
-    No special system-level privileges are needed if you are just measuring GPU time and energy.
-    However, when you're looking into optimizing energy and if that method requires changing the GPU's power limit or SM frequency, special system-level privileges are required.
+GPU availability:
+  GPU 0: NVIDIA A40
+
+================================================================================
+
+[2024-09-09 16:40:14,519] [zeus.device.cpu.rapl](rapl.py:136) RAPL is available.
+[2024-09-09 16:40:14,519] [RaplWraparoundTracker](rapl.py:82) Monitoring wrap around of /sys/class/powercap/intel-rapl/intel-rapl:0/energy_uj
+[2024-09-09 16:40:14,528] [RaplWraparoundTracker](rapl.py:82) Monitoring wrap around of /sys/class/powercap/intel-rapl/intel-rapl:0/intel-rapl:0:0/energy_uj
+[2024-09-09 16:40:14,533] [RaplWraparoundTracker](rapl.py:82) Monitoring wrap around of /sys/class/powercap/intel-rapl/intel-rapl:1/energy_uj
+[2024-09-09 16:40:14,535] [RaplWraparoundTracker](rapl.py:82) Monitoring wrap around of /sys/class/powercap/intel-rapl/intel-rapl:1/intel-rapl:1:0/energy_uj
+
+CPU availability:
+  CPU 0:
+    CPU measurements available (/sys/class/powercap/intel-rapl/intel-rapl:0)
+    DRAM measurements available (/sys/class/powercap/intel-rapl/intel-rapl:0/intel-rapl:0:0)
+  CPU 1:
+    CPU measurements available (/sys/class/powercap/intel-rapl/intel-rapl:1)
+    DRAM measurements available (/sys/class/powercap/intel-rapl/intel-rapl:1/intel-rapl:1:0)
+
+================================================================================
+```
+
+## System privileges
 
 ### When are extra system privileges needed?
 
-The Linux capability `SYS_ADMIN` is required in order to change the GPU's power limit or frequency.
-Specifically, this is needed by the [`GlobalPowerLimitOptimizer`][zeus.optimizer.power_limit.GlobalPowerLimitOptimizer] and the [`PipelineFrequencyOptimizer`][zeus.optimizer.pipeline_frequency.PipelineFrequencyOptimizer].
+1. **CPU energy measurement**: `root` privileges are needed when measuring CPU energy through the Intel RAPL interface. This is due to a [security issue](https://www.cve.org/CVERecord?id=CVE-2020-8694). Specifically, this is needed if you want to measure CPU energy via [`ZeusMonitor`][zeus.monitor.energy.ZeusMonitor] with `cpu_indices`.
+2. **GPU energy optimization**: The Linux security capability `SYS_ADMIN` (`root` is fine as well as it's stronger) is required in order to change the GPU's power limit or frequency. Specifically, this is needed by the [`GlobalPowerLimitOptimizer`][zeus.optimizer.power_limit.GlobalPowerLimitOptimizer] and the [`PipelineFrequencyOptimizer`][zeus.optimizer.pipeline_frequency.PipelineFrequencyOptimizer].
 
 ### Option 1: Running applications in a Docker container
 
-Using Docker, you can pass `--cap-add SYS_ADMIN` to `docker run`.
+For CPU energy measurement, you are `root` inside a Docker container. You will just need to mount the RAPL sysfs directory into the Docker container. See [here](#using-docker) for instructions.
+
+For GPU energy optimization, you can pass `--cap-add SYS_ADMIN` to `docker run`.
 Since this significantly simplifies running Zeus, we recommend users to consider this option first.
 This is also possible for Kubernetes Pods with `securityContext.capabilities.add` in container specs ([docs](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-capabilities-for-a-container){.external}).
 
@@ -112,17 +157,20 @@ cargo install zeusd
 
 # Run zeusd with admin privileges
 sudo zeusd \
-    --socket-path /var/run/zeusd.sock \  # (1)!
+    --socket-path /var/run/zeusd.sock \   # (1)!
     --socket-permissions 666            # (2)!
 ```
 
 1. Unix domain socket path that `zeusd` listens to.
 2. Applications need *write* access to the socket to be able to talk to `zeusd`. This string is interpreted as [UNIX file permissions](https://en.wikipedia.org/wiki/File-system_permissions#Numeric_notation).
 
+We're currently working on adding Intel RAPL support to the Zeus daemon ([tracking issue](https://github.com/ml-energy/zeus/issues/110)).
+We plan to land this feature at the end of 2024.
+
 ### Option 3: Running applications with `sudo`
 
 This is probably the worst option.
-However, if none of the options above work, you can run your application with `sudo`, which automatically has `SYS_ADMIN`.
+However, if none of the options above work, you can run your application with `sudo`, which is essentially `root` and automatically has `SYS_ADMIN`.
 
 ## Next Steps
 
diff --git a/docs/measure/index.md b/docs/measure/index.md
index 8f9252d7..10f02cfe 100644
--- a/docs/measure/index.md
+++ b/docs/measure/index.md
@@ -76,6 +76,44 @@ Depending on the Deep Learning framework you're using (currently PyTorch and JAX
     This is usually what you want, except when using more advanced device partitioning (e.g., using `--xla_force_host_platform_device_count` in JAX to partition CPUs into more pieces).
     In such cases, you probably want to opt out from using this function and handle synchronization manually at the appropriate granularity.
 
+## CPU measurements using Intel RAPL
+
+[`ZeusMonitor`][zeus.monitor.ZeusMonitor] supports CPU/DRAM energy measurement as well!
+
+The RAPL interface for CPU energy measurement is available for the majority of Intel and AMD CPUs.
+DRAM energy measurements are available on some CPUs as well.
+To check support, refer to [Verifying installation](../getting_started/index.md#verifying-installation).
+
+To only measure the energy consumption of the CPU used by the current Python process, you can use the [`get_current_cpu_index`][zeus.device.cpu.get_current_cpu_index] function, which retrieves the CPU index where the specified process ID is running.
+
+You can pass in `cpu_indices=[]` or `gpu_indices=[]` to [`ZeusMonitor`][zeus.monitor.ZeusMonitor] to disable either CPU or GPU measurements.
+
+```python hl_lines="2 5-7"
+from zeus.monitor import ZeusMonitor
+from zeus.device.cpu import get_current_cpu_index
+
+if __name__ == "__main__":
+    # Get the CPU index of the current process
+    current_cpu_index = get_current_cpu_index()
+    monitor = ZeusMonitor(cpu_indices=[current_cpu_index], gpu_indices=[])
+
+    for epoch in range(100):
+        monitor.begin_window("epoch")
+
+        steps = []
+        for x, y in train_loader:
+            monitor.begin_window("step")
+            train_one_step(x, y)
+            result = monitor.end_window("step")
+            steps.append(result)
+
+        mes = monitor.end_window("epoch")
+        print(f"Epoch {epoch} consumed {mes.time} s and {mes.total_energy} J.")
+
+        avg_time = sum(map(lambda m: m.time, steps)) / len(steps)
+        avg_energy = sum(map(lambda m: m.total_energy, steps)) / len(steps)
+        print(f"One step takes {avg_time} s and {avg_energy} J for the CPU.")
+```
 
 ## CLI power and energy monitor
 
diff --git a/zeus/monitor/energy.py b/zeus/monitor/energy.py
index 846d26b3..d4037f73 100644
--- a/zeus/monitor/energy.py
+++ b/zeus/monitor/energy.py
@@ -271,7 +271,7 @@ def begin_window(self, key: str, sync_execution: bool = True) -> None:
             raise ValueError(f"Measurement window '{key}' already exists")
 
         # Synchronize execution (e.g., cudaSynchronize) to freeze at the right time.
-        if sync_execution:
+        if sync_execution and self.gpu_indices:
             sync_execution_fn(self.gpu_indices, sync_with=self.sync_with)
 
         # Freeze the start time of the profiling window.
@@ -337,7 +337,7 @@ def end_window(
         )
 
         # Synchronize execution (e.g., cudaSynchronize) to freeze at the right time.
-        if sync_execution:
+        if sync_execution and self.gpu_indices:
             sync_execution_fn(self.gpu_indices, sync_with=self.sync_with)
 
         # If the measurement window is cancelled, return an empty Measurement object.