Skip to content

Commit

Permalink
Fix formatting issues detected by black
Browse files: browse the repository at this point in the history
  • Loading branch information
sharonsyh committed Nov 30, 2024
1 parent 7e47fbb commit 52f2fa7
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 91 deletions.
162 changes: 84 additions & 78 deletions tests/test_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,63 +87,65 @@ def test_energy_histogram(mock_get_cpus, mock_zeus_monitor, mock_histogram):
prometheus_url=prometheus_url,
job="test_energy_histogram",
)
if histogram_metric.gpu_histograms:
for _gpu_index, gpu_histogram in histogram_metric.gpu_histograms.items():
gpu_histogram.labels = MagicMock(return_value=gpu_histogram)
gpu_histogram.observe = MagicMock()

for _gpu_index, gpu_histogram in histogram_metric.gpu_histograms.items():
gpu_histogram.labels = MagicMock(return_value=gpu_histogram)
gpu_histogram.observe = MagicMock()
if histogram_metric.cpu_histograms:
for _cpu_index, cpu_histogram in histogram_metric.cpu_histograms.items():
cpu_histogram.labels = MagicMock(return_value=cpu_histogram)
cpu_histogram.observe = MagicMock()

for _cpu_index, cpu_histogram in histogram_metric.cpu_histograms.items():
cpu_histogram.labels = MagicMock(return_value=cpu_histogram)
cpu_histogram.observe = MagicMock()

for _dram_index, dram_histogram in histogram_metric.dram_histograms.items():
dram_histogram.labels = MagicMock(return_value=dram_histogram)
dram_histogram.observe = MagicMock()
if histogram_metric.dram_histogram:
for _dram_index, dram_histogram in histogram_metric.dram_histograms.items():
dram_histogram.labels = MagicMock(return_value=dram_histogram)
dram_histogram.observe = MagicMock()

histogram_metric.begin_window("test_window")
histogram_metric.end_window("test_window")

# Assert GPU histograms were observed
for (
gpu_index,
energy,
) in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items():
calls = [
call[0][0]
for call in histogram_metric.gpu_histograms[
gpu_index
].observe.call_args_list
]
print(f"Observed calls for GPU {gpu_index}: {calls}")
assert energy in calls, f"Expected {energy} in {calls}"
if mock_zeus_monitor.return_value.end_window.return_value.gpu_energy:
for (
gpu_index,
energy,
) in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items():
calls = [
call[0][0]
for call in histogram_metric.gpu_histograms[
gpu_index
].observe.call_args_list
]
assert energy in calls, f"Expected {energy} in {calls}"

# Assert CPU histograms were observed
for (
cpu_index,
energy,
) in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items():
calls = [
call[0][0]
for call in histogram_metric.cpu_histograms[
cpu_index
].observe.call_args_list
]
print(f"Observed CPU calls for CPU {cpu_index}: {calls}")
assert energy in calls, f"Expected CPU energy {energy} in {calls}"
if mock_zeus_monitor.return_value.end_window.return_value.cpu_energy:
for (
cpu_index,
energy,
) in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items():
calls = [
call[0][0]
for call in histogram_metric.cpu_histograms[
cpu_index
].observe.call_args_list
]
assert energy in calls, f"Expected CPU energy {energy} in {calls}"

# Assert DRAM histograms were observed
for (
dram_index,
energy,
) in mock_zeus_monitor.return_value.end_window.return_value.dram_energy.items():
calls = [
call[0][0]
for call in histogram_metric.dram_histograms[
dram_index
].observe.call_args_list
]
print(f"Observed DRAM calls for CPU {dram_index}: {calls}")
assert energy in calls, f"Expected DRAM energy {energy} in {calls}"
if mock_zeus_monitor.return_value.end_window.return_value.dram_energy:
for (
dram_index,
energy,
) in mock_zeus_monitor.return_value.end_window.return_value.dram_energy.items():
calls = [
call[0][0]
for call in histogram_metric.dram_histograms[
dram_index
].observe.call_args_list
]
assert energy in calls, f"Expected DRAM energy {energy} in {calls}"


def test_energy_cumulative_counter(mock_get_cpus, mock_zeus_monitor):
Expand Down Expand Up @@ -177,24 +179,26 @@ def test_energy_cumulative_counter(mock_get_cpus, mock_zeus_monitor):
cumulative_counter.end_window("test_counter")

# Assert GPU counters
for (
gpu_index,
energy,
) in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items():
assert (
gpu_index in cumulative_counter.gpu_counters
), f"GPU counter for index {gpu_index} not initialized"
cumulative_counter.gpu_counters[gpu_index].inc.assert_called_with(energy)
if mock_zeus_monitor.return_value.end_window.return_value.gpu_energy:
for (
gpu_index,
energy,
) in mock_zeus_monitor.return_value.end_window.return_value.gpu_energy.items():
assert (
gpu_index in cumulative_counter.gpu_counters
), f"GPU counter for index {gpu_index} not initialized"
cumulative_counter.gpu_counters[gpu_index].inc.assert_called_with(energy)

# Assert CPU counters
for (
cpu_index,
energy,
) in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items():
assert (
cpu_index in cumulative_counter.cpu_counters
), f"CPU counter for index {cpu_index} not initialized"
cumulative_counter.cpu_counters[cpu_index].inc.assert_called_with(energy)
if mock_zeus_monitor.return_value.end_window.return_value.cpu_energy:
for (
cpu_index,
energy,
) in mock_zeus_monitor.return_value.end_window.return_value.cpu_energy.items():
assert (
cpu_index in cumulative_counter.cpu_counters
), f"CPU counter for index {cpu_index} not initialized"
cumulative_counter.cpu_counters[cpu_index].inc.assert_called_with(energy)


@patch("zeus.device.gpu.get_gpus")
Expand Down Expand Up @@ -225,24 +229,26 @@ def test_power_gauge(
prometheus_url=prometheus_url,
job="test_power_gauge",
)
for _gpu_index, gauge in power_gauge.gpu_gauges.items():
gauge.labels = MagicMock(return_value=gauge)
gauge.set = MagicMock()
if power_gauge.gpu_gauges:
for _gpu_index, gauge in power_gauge.gpu_gauges.items():
gauge.labels = MagicMock(return_value=gauge)
gauge.set = MagicMock()

power_gauge.begin_window("test_power_window")
power_gauge.end_window("test_power_window")

# Assert that the gauges were set with the correct power values
for (
gpu_index,
power_value,
) in mock_power_monitor.return_value.get_power.return_value.items():
try:
# Check if `labels` was called with the correct arguments
power_gauge.gpu_gauges[gpu_index].labels.assert_called_once_with(
gpu_index=gpu_index, window="test_power_window"
)
power_gauge.gpu_gauges[gpu_index].set.assert_called_once_with(power_value)
except AssertionError as e:
print(f"AssertionError for GPU {gpu_index}:")
raise e
if mock_power_monitor.return_value.get_power.return_value:
for (
gpu_index,
power_value,
) in mock_power_monitor.return_value.get_power.return_value.items():
try:
# Check if `labels` was called with the correct arguments
power_gauge.gpu_gauges[gpu_index].labels.assert_called_once_with(
gpu_index=gpu_index, window="test_power_window"
)
power_gauge.gpu_gauges[gpu_index].set.assert_called_once_with(power_value)
except AssertionError as e:
print(f"AssertionError for GPU {gpu_index}:")
raise e
30 changes: 17 additions & 13 deletions zeus/metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ def __init__(
gpu_indices: list,
prometheus_url: str,
job: str,
gpu_bucket_range: list[float] = None,
cpu_bucket_range: list[float] = None,
dram_bucket_range: list[float] = None,
gpu_bucket_range: list[float] | None,
cpu_bucket_range: list[float] | None,
dram_bucket_range: list[float] | None,
) -> None:
"""Initialize the EnergyHistogram class.
Expand Down Expand Up @@ -305,9 +305,10 @@ def end_window(self, name: str) -> None:
"Make sure 'begin_window' is called before 'end_window'."
)
self.queue.put("stop")
self.proc.join(timeout=20)
if self.proc.is_alive():
self.proc.terminate()
if self.proc is not None:
self.proc.join(timeout=20)
if self.proc.is_alive():
self.proc.terminate()


def energy_monitoring_loop(
Expand Down Expand Up @@ -470,9 +471,11 @@ def end_window(self, name: str) -> None:
name (str): The unique name of the measurement window. Must match between calls to 'begin_window' and 'end_window'.
"""
self.queue.put("stop")
self.proc.join(timeout=20)
if self.proc.is_alive():
self.proc.terminate()
if self.proc is not None:
self.proc.join(timeout=20)
if self.proc.is_alive():
warnings.warn(f"Forcefully terminating monitoring process for {name}.", stacklevel=2)
self.proc.terminate()


def power_monitoring_loop(
Expand Down Expand Up @@ -512,10 +515,11 @@ def power_monitoring_loop(
power_measurement = power_monitor.get_power()

try:
for gpu_index, power_value in power_measurement.items():
gpu_gauges[gpu_index].labels(gpu_index=f"{name}_gpu{gpu_index}").set(
power_value
)
if power_measurement:
for gpu_index, power_value in power_measurement.items():
gpu_gauges[gpu_index].labels(gpu_index=f"{name}_gpu{gpu_index}").set(
power_value
)
except Exception as e:
print(f"Error during processing power measurement: {e}")

Expand Down

0 comments on commit 52f2fa7

Please sign in to comment.