Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test: TC for Metric P0 nv_load_time per model #7697

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
9 changes: 9 additions & 0 deletions docs/user_guide/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,15 @@ There are some places where a request would not be considered pending:
generally brief, it will not be considered pending from Triton's
perspective until Triton core has received the request from the frontend.

#### Load Time Per-Model
The *Model Load Duration* metric reports the time taken, in seconds, to load a model from storage into GPU/CPU memory.
```
# HELP nv_model_load_duration_secs Model load time in seconds
# TYPE nv_model_load_duration_secs gauge
nv_model_load_duration_secs{model="input_all_optional",version="2"} 1.532738387
nv_model_load_duration_secs{model="input_all_optional",version="1"} 11.68753265
```

### Latencies

Starting in 23.04, Triton exposes the ability to choose the types of metrics
Expand Down
168 changes: 168 additions & 0 deletions qa/L0_metrics/general_metrics_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
#!/usr/bin/env python3
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import re
import time
import unittest

import requests

_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
MODEL_LOAD_TIME = "nv_model_load_duration_secs{model="


def get_model_load_times():
r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics")
r.raise_for_status()
pattern = re.compile(rf'{MODEL_LOAD_TIME}"(.*?)".*?\ (\d+\.\d+)')

Check notice

Code scanning / CodeQL

Unused local variable Note

Variable pattern is not used.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this variable used?

# Initialize an empty dictionary to store the data
model_data = {}
lines = r.text.strip().split("\n")
for line in lines:
# Use regex to extract model name, version, and load time
match = re.match(
r"nv_model_load_duration_secs\{model=\"(.*?)\",version=\"(.*?)\"\} (.*)",
line,
)
if match:
model_name = match.group(1)
model_version = match.group(2)
load_time = float(match.group(3))
# Store in dictionary
if model_name not in model_data:
model_data[model_name] = {}
model_data[model_name][model_version] = load_time
return model_data


def load_model_explicit(model_name, server_url="http://localhost:8000"):
    """Request an explicit load of ``model_name`` via the repository API.

    Logs the outcome; does not raise on failure.
    """
    url = f"{server_url}/v2/repository/models/{model_name}/load"
    result = requests.post(url)

    if result.status_code != 200:
        print(
            f"Failed to load model '{model_name}'. Status code: {result.status_code}"
        )
        print("Response:", result.text)
    else:
        print(f"Model '{model_name}' loaded successfully.")


def unload_model_explicit(model_name, server_url="http://localhost:8000"):
    """Request an explicit unload of ``model_name`` via the repository API.

    Logs the outcome; does not raise on failure.
    """
    endpoint = f"{server_url}/v2/repository/models/{model_name}/unload"
    response = requests.post(endpoint)

    if response.status_code == 200:
        print(f"Model '{model_name}' unloaded successfully.")
    else:
        # Fixed: the failure message previously said "load" for a failed unload.
        print(
            f"Failed to unload model '{model_name}'. Status code: {response.status_code}"
        )
        print("Response:", response.text)


class TestGeneralMetrics(unittest.TestCase):
    """Validate the per-model load-time metric (nv_model_load_duration_secs).

    Each test expects test.sh to have started tritonserver with the matching
    model repository and model-control-mode before the test runs.
    """

    def setUp(self):
        self.model_name = "libtorch_float32_float32_float32"
        self.model_name_multiple_versions = "input_all_optional"

    def _assert_single_load_time_entry(self):
        # Shared check for tests 1 and 2: exactly one model has a load-time
        # entry, and it is version 1 of self.model_name.
        model_load_times = get_model_load_times()
        load_time = model_load_times.get(self.model_name, {}).get("1")

        self.assertIsNotNone(load_time, "Model Load time not found")
        self.assertEqual(
            len(model_load_times), 1, "Too many model_load_time entries found"
        )

    def test_metrics_load_time(self):
        # Default model control mode: the model is loaded at server startup.
        self._assert_single_load_time_entry()

    def test_metrics_load_time_explicit_load(self):
        # Explicit mode: test.sh issues the load request before running this.
        self._assert_single_load_time_entry()

    def test_metrics_load_time_explicit_unload(self):
        # After an explicit unload the metric entry must be removed entirely.
        model_load_times = get_model_load_times()
        load_time = model_load_times.get(self.model_name, {}).get("1")
        self.assertIsNone(load_time, "Model Load time found even after unload")

    def test_metrics_load_time_multiple_version_reload(self):
        # Part 1: load two versions of the same model. Version 1 sleeps 10s in
        # initialize(), so its load time must dominate version 2's.
        load_model_explicit(self.model_name_multiple_versions)
        versions = get_model_load_times().get(self.model_name_multiple_versions, {})
        load_time_slow = versions.get("1")
        load_time_fast = versions.get("2")
        self.assertGreaterEqual(
            load_time_slow,
            load_time_fast,
            "Slow load time should be greater than or equal to fast load time",
        )
        # Fixed message: this asserts the 10 second artificial delay built into
        # version 1, it is not another slow-vs-fast comparison.
        self.assertGreaterEqual(
            load_time_slow,
            10,
            "Slow load time should be at least the model's 10 second startup delay",
        )

        # Part 2: loading again without changing the repository is a no-op for
        # Triton, so the recorded load times must be unchanged.
        load_model_explicit(self.model_name_multiple_versions)
        versions_new = get_model_load_times().get(
            self.model_name_multiple_versions, {}
        )
        self.assertEqual(versions_new.get("2"), load_time_fast)
        self.assertEqual(versions_new.get("1"), load_time_slow)

        # Part 3: unload the model and expect both metric entries to disappear.
        unload_model_explicit(self.model_name_multiple_versions)
        time.sleep(1)  # give the server a moment to refresh its metrics
        versions_gone = get_model_load_times().get(
            self.model_name_multiple_versions, {}
        )
        self.assertIsNone(
            versions_gone.get("1"), "Model Load time found even after unload"
        )
        self.assertIsNone(
            versions_gone.get("2"), "Model Load time found even after unload"
        )


# Allow running the tests directly; test.sh normally drives them via pytest.
if __name__ == "__main__":
    unittest.main()
43 changes: 42 additions & 1 deletion qa/L0_metrics/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ SERVER=${TRITON_DIR}/bin/tritonserver
BASE_SERVER_ARGS="--model-repository=${MODELDIR}"
SERVER_ARGS="${BASE_SERVER_ARGS}"
SERVER_LOG="./inference_server.log"
PYTHON_TEST="metrics_config_test.py"
source ../common/util.sh

CLIENT_LOG="client.log"
Expand Down Expand Up @@ -132,12 +131,54 @@ fi
kill_server
set -e

### General metrics tests

set +e
CLIENT_PY="./general_metrics_test.py"
CLIENT_LOG="general_metrics_test_client.log"
SERVER_LOG="general_metrics_test_server.log"
SERVER_ARGS="$BASE_SERVER_ARGS --log-verbose=1"
# NOTE(review): PYTHON_TEST is assigned but the pytest invocation below uses
# $CLIENT_PY — confirm this assignment is still needed.
PYTHON_TEST="general_metrics_test.py"
run_and_check_server
# Test 1 for default model control mode (all models loaded at startup)
# NOTE(review): the pytest exit status is not checked; failures only appear in
# the junit XML / $CLIENT_LOG.
python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time >> $CLIENT_LOG 2>&1
kill_server

set +e
CLIENT_PY="./general_metrics_test.py"
CLIENT_LOG="general_metrics_test_client.log"
SERVER_LOG="general_metrics_test_server.log"
SERVER_ARGS="$BASE_SERVER_ARGS --model-control-mode=explicit --log-verbose=1"
run_and_check_server
MODEL_NAME='libtorch_float32_float32_float32'
# NOTE(review): $code captures the HTTP status of the load request but is
# never checked — consider asserting it is 200 before running the test.
code=`curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/load`
# Test 2 for explicit mode LOAD
python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_explicit_load.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_explicit_load >> $CLIENT_LOG 2>&1

# NOTE(review): unload status is likewise unchecked.
code=`curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/unload`
# Test 3 for explicit mode UNLOAD
python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_explicit_unload.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_explicit_unload >> $CLIENT_LOG 2>&1
kill_server
Comment on lines +147 to +161
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For test 2 and 3, I think it would be more helpful by testing the following model load/unload sequence against metrics:

  1. Start the server without loading any model.
  2. Check the metrics is empty.
  3. Load a few models (and have one model with two versions that loads at a different speed).
  4. Check the metrics is correct.
  5. Call the load API again (without changing the model repository).
  6. Check the metrics is unchanged.
  7. Load a new model.
  8. Check the metrics is updated correctly.
  9. Unload models.
  10. Check the metrics is unchanged.


# Test 4 for explicit mode LOAD and UNLOAD with multiple versions
set +e
CLIENT_PY="./general_metrics_test.py"
CLIENT_LOG="general_metrics_test_client.log"
SERVER_LOG="general_metrics_test_server.log"
VERSION_DIR="${PWD}/version_models"
# NOTE(review): BASE_SERVER_ARGS already contains --model-repository=${MODELDIR};
# appending a second --model-repository here relies on the later flag taking
# precedence — TODO confirm, or build SERVER_ARGS without the base repo.
SERVER_ARGS="$BASE_SERVER_ARGS --model-repository=${VERSION_DIR} --model-control-mode=explicit --log-verbose=1"
run_and_check_server
python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_multiple_version_reload.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_multiple_version_reload >> $CLIENT_LOG 2>&1

kill_server

### Pinned memory metrics tests
set +e
CLIENT_PY="./pinned_memory_metrics_test.py"
CLIENT_LOG="pinned_memory_metrics_test_client.log"
SERVER_LOG="pinned_memory_metrics_test_server.log"
# --metrics-interval-ms=1 makes the metrics endpoint refresh as fast as
# possible so the test observes up-to-date pinned-memory values.
SERVER_ARGS="$BASE_SERVER_ARGS --metrics-interval-ms=1 --model-control-mode=explicit --log-verbose=1"
PYTHON_TEST="metrics_config_test.py"
run_and_check_server
python3 ${PYTHON_TEST} MetricsConfigTest.test_pinned_memory_metrics_exist -v 2>&1 | tee ${CLIENT_LOG}
check_unit_test
Expand Down
49 changes: 49 additions & 0 deletions qa/L0_metrics/version_models/input_all_optional/1/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json
import time

import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    """Version 1 of the L0_metrics multi-version test model.

    Deliberately slow to load: initialize() sleeps 10 seconds so this
    version's nv_model_load_duration_secs value is measurably larger than
    version 2's.
    """

    def initialize(self, args):
        # Artificial delay so the load-time metric for this version is at
        # least 10 seconds (asserted by general_metrics_test.py).
        time.sleep(10)
        self.model_config = json.loads(args["model_config"])

    def execute(self, requests):
        """Return a constant single-element float32 OUTPUT0 per request."""
        # Fixed comment: previous note about "nan, inf, and -inf" was
        # copy-pasted from another model and did not describe this output.
        responses = []
        for _ in requests:
            out_0 = np.array([1], dtype=np.float32)
            out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0)
            responses.append(pb_utils.InferenceResponse([out_tensor_0]))

        return responses
47 changes: 47 additions & 0 deletions qa/L0_metrics/version_models/input_all_optional/2/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json

import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    """Version 2 of the L0_metrics multi-version test model.

    Loads immediately (no artificial delay), giving a fast
    nv_model_load_duration_secs value to compare against version 1's.
    """

    def initialize(self, args):
        self.model_config = json.loads(args["model_config"])

    def execute(self, requests):
        """Return a constant single-element float32 OUTPUT0 per request."""
        # Fixed comment: previous note about "nan, inf, and -inf" was
        # copy-pasted from another model and did not describe this output.
        responses = []
        for _ in requests:
            out_0 = np.array([1], dtype=np.float32)
            out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0)
            responses.append(pb_utils.InferenceResponse([out_tensor_0]))

        return responses
Loading
Loading