Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Forward merge v0.10 => v0.11 #191

Merged
merged 4 commits into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/ops-bot.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# This file controls which features from the `ops-bot` repository linked below are enabled.
# - https://github.com/rapidsai/ops-bot

forward_merger: true
2 changes: 1 addition & 1 deletion ci/check_style.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ rapids-dependency-file-generator \
--file_key checks \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml

rapids-mamba-retry env create --force -f env.yaml -n checks
rapids-mamba-retry env create --yes -f env.yaml -n checks
conda activate checks

rapids-logger "Run pre-commit checks - Python backend"
Expand Down
4 changes: 2 additions & 2 deletions ci/test_python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ rapids-dependency-file-generator \
--file_key test_python \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml

rapids-mamba-retry env create --force -f env.yaml -n test
rapids-mamba-retry env create --yes -f env.yaml -n test

# Temporarily allow unbound variables for conda activation.
set +u
conda activate test
set -u

# rapids-logger "Downloading artifacts from previous jobs"
rapids-logger "Downloading artifacts from previous jobs"
PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)

rapids-print-env
Expand Down
2 changes: 2 additions & 0 deletions conda/environments/all_arch-any.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ dependencies:
- psutil
- pynvml
- pytest
- pytest-asyncio
- pytest-jupyter[server]>=0.6.0
- python>=3.8
- websockets
name: all_arch-any
2 changes: 2 additions & 0 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,5 @@ dependencies:
packages:
- pytest
- pytest-jupyter[server]>=0.6.0
- pytest-asyncio
- websockets
11 changes: 4 additions & 7 deletions jupyterlab_nvdashboard/apps/cpu.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import json
import psutil
import time
import tornado
from jupyter_server.base.handlers import APIHandler
from jupyterlab_nvdashboard.apps.utils import CustomWebSocketHandler


class CPUResourceHandler(APIHandler):
@tornado.web.authenticated
def get(self):
class CPUResourceWebSocketHandler(CustomWebSocketHandler):
def send_data(self):
now = time.time()
stats = {
"time": now * 1000,
Expand All @@ -18,5 +16,4 @@ def get(self):
"network_read": psutil.net_io_counters().bytes_recv,
"network_write": psutil.net_io_counters().bytes_sent,
}
self.set_header("Content-Type", "application/json")
self.write(json.dumps(stats))
self.write_message(json.dumps(stats))
41 changes: 16 additions & 25 deletions jupyterlab_nvdashboard/apps/gpu.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import json
from jupyterlab_nvdashboard.apps.utils import CustomWebSocketHandler
import pynvml
import time
import tornado
from jupyter_server.base.handlers import APIHandler

try:
pynvml.nvmlInit()
Expand Down Expand Up @@ -41,19 +40,17 @@
pci_gen = None


class GPUUtilizationHandler(APIHandler):
@tornado.web.authenticated
def get(self):
class GPUUtilizationWebSocketHandler(CustomWebSocketHandler):
def send_data(self):
gpu_utilization = [
pynvml.nvmlDeviceGetUtilizationRates(gpu_handles[i]).gpu
for i in range(ngpus)
]
self.finish(json.dumps({"gpu_utilization": gpu_utilization}))
self.write_message(json.dumps({"gpu_utilization": gpu_utilization}))


class GPUUsageHandler(APIHandler):
@tornado.web.authenticated
def get(self):
class GPUUsageWebSocketHandler(CustomWebSocketHandler):
def send_data(self):
memory_usage = [
pynvml.nvmlDeviceGetMemoryInfo(handle).used
for handle in gpu_handles
Expand All @@ -64,16 +61,15 @@ def get(self):
for handle in gpu_handles
]

self.finish(
self.write_message(
json.dumps(
{"memory_usage": memory_usage, "total_memory": total_memory}
)
)


class GPUResourceHandler(APIHandler):
@tornado.web.authenticated
def get(self):
class GPUResourceWebSocketHandler(CustomWebSocketHandler):
def send_data(self):
now = time.time()
stats = {
"time": now * 1000,
Expand Down Expand Up @@ -118,15 +114,13 @@ def get(self):
stats["gpu_memory_total"] = round(
(stats["gpu_memory_total"] / gpu_mem_sum) * 100, 2
)
self.set_header("Content-Type", "application/json")
self.write(json.dumps(stats))
self.write_message(json.dumps(stats))


class NVLinkThroughputHandler(APIHandler):
class NVLinkThroughputWebSocketHandler(CustomWebSocketHandler):
prev_throughput = None

@tornado.web.authenticated
def get(self):
def send_data(self):
throughput = [
pynvml.nvmlDeviceGetFieldValues(
handle,
Expand Down Expand Up @@ -162,9 +156,8 @@ def get(self):
# Store the current throughput for the next request
self.prev_throughput = throughput

self.set_header("Content-Type", "application/json")
# Send the change in throughput as part of the response
self.write(
self.write_message(
json.dumps(
{
"nvlink_rx": [
Expand All @@ -191,9 +184,8 @@ def get(self):
)


class PCIStatsHandler(APIHandler):
@tornado.web.authenticated
def get(self):
class PCIStatsWebSocketHandler(CustomWebSocketHandler):
def send_data(self):
# Use device-0 to get "upper bound"
pci_width = pynvml.nvmlDeviceGetMaxPcieLinkWidth(gpu_handles[0])
pci_bw = {
Expand Down Expand Up @@ -231,5 +223,4 @@ def get(self):
"max_rxtx_tp": max_rxtx_tp,
}

self.set_header("Content-Type", "application/json")
self.write(json.dumps(stats))
self.write_message(json.dumps(stats))
31 changes: 31 additions & 0 deletions jupyterlab_nvdashboard/apps/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from tornado.websocket import WebSocketHandler
import tornado
import json


class CustomWebSocketHandler(WebSocketHandler):
    """Base WebSocket handler that streams JSON stats to the client on a timer.

    Subclasses implement ``send_data`` to push one JSON payload per tick via
    ``write_message``. The client can tune or pause the stream by sending a
    JSON control message with ``updateFrequency`` (ms) and/or ``isPaused``.
    """

    def open(self):
        """Handle a new connection: acknowledge it and start streaming."""
        self.write_message(json.dumps({"status": "connected"}))
        self.set_nodelay(True)
        # Start a periodic callback to send data every 1000 ms (the client
        # may adjust this later via an "updateFrequency" control message).
        self.callback = tornado.ioloop.PeriodicCallback(self.send_data, 1000)
        self.callback.start()

    def on_message(self, message):
        """Handle a control message: adjust frequency and pause/resume.

        Missing keys fall back to the current state instead of raising
        KeyError (which would tear down the connection on a malformed
        message).
        """
        message_data = json.loads(message)
        new_frequency = message_data.get("updateFrequency")
        if new_frequency is not None and hasattr(self, "callback"):
            # Rebuild the periodic callback with the requested interval.
            self.callback.stop()
            self.callback = tornado.ioloop.PeriodicCallback(
                self.send_data, new_frequency
            )
        if hasattr(self, "callback"):
            paused = message_data.get("isPaused", False)
            # Only toggle when the state actually changes; starting an
            # already-running PeriodicCallback would double-schedule it.
            if paused and self.callback.is_running():
                self.callback.stop()
            elif not paused and not self.callback.is_running():
                self.callback.start()

    def on_close(self):
        """Stop the periodic callback when the client disconnects."""
        if hasattr(self, "callback") and self.callback.is_running():
            self.callback.stop()

    def send_data(self):
        # Overridden by subclasses to emit one JSON payload per tick.
        pass
12 changes: 6 additions & 6 deletions jupyterlab_nvdashboard/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ def setup_handlers(web_app):
base_url, URL_PATH, "nvlink_throughput"
)
handlers += [
(route_pattern_gpu_util, apps.gpu.GPUUtilizationHandler),
(route_pattern_gpu_usage, apps.gpu.GPUUsageHandler),
(route_pattern_gpu_resource, apps.gpu.GPUResourceHandler),
(route_pattern_pci_stats, apps.gpu.PCIStatsHandler),
(route_pattern_gpu_util, apps.gpu.GPUUtilizationWebSocketHandler),
(route_pattern_gpu_usage, apps.gpu.GPUUsageWebSocketHandler),
(route_pattern_gpu_resource, apps.gpu.GPUResourceWebSocketHandler),
(route_pattern_pci_stats, apps.gpu.PCIStatsWebSocketHandler),
(
route_pattern_nvlink_throughput,
apps.gpu.NVLinkThroughputHandler,
apps.gpu.NVLinkThroughputWebSocketHandler,
),
]

Expand All @@ -41,7 +41,7 @@ def setup_handlers(web_app):
)

handlers += [
(route_pattern_cpu_resource, apps.cpu.CPUResourceHandler),
(route_pattern_cpu_resource, apps.cpu.CPUResourceWebSocketHandler),
]

web_app.add_handlers(host_pattern, handlers)
2 changes: 2 additions & 0 deletions jupyterlab_nvdashboard/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
def pytest_configure(config):
    """Register the custom ``asyncio`` marker so pytest does not warn on it."""
    marker_definition = "asyncio: mark test as asyncio"
    config.addinivalue_line("markers", marker_definition)
37 changes: 37 additions & 0 deletions jupyterlab_nvdashboard/tests/test_cpu_handlers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import json
import pytest
from unittest.mock import MagicMock, patch

from jupyterlab_nvdashboard.apps.cpu import CPUResourceWebSocketHandler


@pytest.fixture
def mock_handler(monkeypatch):
    """Patch ``write_message`` on the CPU handler base class with a mock."""
    write_message_mock = MagicMock()
    target = (
        "jupyterlab_nvdashboard.apps.cpu.CustomWebSocketHandler.write_message"
    )
    monkeypatch.setattr(target, write_message_mock)
    return write_message_mock


@pytest.fixture
def handler_args():
    """Yield mocked tornado Application/HTTPServerRequest constructor args."""
    app_patcher = patch("tornado.web.Application")
    request_patcher = patch("tornado.httputil.HTTPServerRequest")
    with app_patcher as mock_application, request_patcher as mock_request:
        yield mock_application, mock_request


def test_cpu_resource_handler(mock_handler, handler_args):
    """``send_data`` must emit JSON carrying every CPU/memory/IO field."""
    handler = CPUResourceWebSocketHandler(*handler_args)
    handler.send_data()
    payload = json.loads(mock_handler.call_args[0][0])
    expected_fields = (
        "time",
        "cpu_utilization",
        "memory_usage",
        "disk_read",
        "disk_write",
        "network_read",
        "network_write",
    )
    for field in expected_fields:
        assert field in payload
80 changes: 80 additions & 0 deletions jupyterlab_nvdashboard/tests/test_gpu_handlers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import json
import pytest
from unittest.mock import MagicMock, patch

from jupyterlab_nvdashboard.apps.gpu import (
GPUUtilizationWebSocketHandler,
GPUUsageWebSocketHandler,
GPUResourceWebSocketHandler,
NVLinkThroughputWebSocketHandler,
PCIStatsWebSocketHandler,
)


@pytest.fixture
def mock_handler(monkeypatch):
    """Patch ``write_message`` on the GPU handler base class with a mock."""
    write_message_mock = MagicMock()
    target = (
        "jupyterlab_nvdashboard.apps.gpu.CustomWebSocketHandler.write_message"
    )
    monkeypatch.setattr(target, write_message_mock)
    return write_message_mock


@pytest.fixture
def handler_args():
    """Yield mocked tornado Application/HTTPServerRequest constructor args."""
    app_patcher = patch("tornado.web.Application")
    request_patcher = patch("tornado.httputil.HTTPServerRequest")
    with app_patcher as mock_application, request_patcher as mock_request:
        yield mock_application, mock_request


def test_gpu_utilization_handler(mock_handler, handler_args):
    """``send_data`` must emit JSON containing per-GPU utilization."""
    handler = GPUUtilizationWebSocketHandler(*handler_args)
    handler.send_data()
    payload = json.loads(mock_handler.call_args[0][0])
    assert "gpu_utilization" in payload


def test_gpu_usage_handler(mock_handler, handler_args):
    """``send_data`` must report used and total GPU memory."""
    handler = GPUUsageWebSocketHandler(*handler_args)
    handler.send_data()
    payload = json.loads(mock_handler.call_args[0][0])
    for field in ("memory_usage", "total_memory"):
        assert field in payload


def test_gpu_resource_handler(mock_handler, handler_args):
    """``send_data`` must emit the aggregate and per-GPU resource fields."""
    handler = GPUResourceWebSocketHandler(*handler_args)
    handler.send_data()
    payload = json.loads(mock_handler.call_args[0][0])
    expected_fields = (
        "time",
        "gpu_utilization_total",
        "gpu_memory_total",
        "rx_total",
        "tx_total",
        "gpu_memory_individual",
        "gpu_utilization_individual",
    )
    for field in expected_fields:
        assert field in payload


def test_nvlink_throughput_handler(mock_handler, handler_args):
    """``send_data`` must report NVLink rx/tx deltas and max bandwidth."""
    handler = NVLinkThroughputWebSocketHandler(*handler_args)
    handler.send_data()
    payload = json.loads(mock_handler.call_args[0][0])
    for field in ("nvlink_rx", "nvlink_tx", "max_rxtx_bw"):
        assert field in payload


def test_pci_stats_handler(mock_handler, handler_args):
    """``send_data`` must report PCIe tx/rx throughput and the max bound."""
    handler = PCIStatsWebSocketHandler(*handler_args)
    handler.send_data()
    payload = json.loads(mock_handler.call_args[0][0])
    for field in ("pci_tx", "pci_rx", "max_rxtx_tp"):
        assert field in payload
Loading
Loading