fix: Fix L0_backend_vllm* jobs #62

Merged · 3 commits · Sep 4, 2024

Changes from all commits
175 changes: 47 additions & 128 deletions ci/L0_backend_vllm/metrics_test/test.sh
@@ -49,36 +49,42 @@ function copy_model_repository {
 sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/vllm_opt/1/model.json
 }
 
-RET=0
-
-# Test disabling vLLM metrics reporting without parameter "REPORT_CUSTOM_METRICS" in config.pbtxt
-copy_model_repository
-run_server
-if [ "$SERVER_PID" == "0" ]; then
-    cat $SERVER_LOG
-    echo -e "\n***\n*** Failed to start $SERVER\n***"
-    exit 1
-fi
-
-set +e
-python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled -v > $CLIENT_LOG 2>&1
-
-if [ $? -ne 0 ]; then
-    cat $CLIENT_LOG
-    echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled FAILED. \n***"
-    RET=1
-else
-    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
-    if [ $? -ne 0 ]; then
-        cat $CLIENT_LOG
-        echo -e "\n***\n*** Test Result Verification FAILED.\n***"
-        RET=1
-    fi
-fi
-set -e
-
-kill $SERVER_PID
-wait $SERVER_PID
+run_test() {
+    local TEST_CASE=$1
+
+    run_server
+    if [ "$SERVER_PID" == "0" ]; then
+        cat $SERVER_LOG
+        echo -e "\n***\n*** Failed to start $SERVER\n***"
+        exit 1
+    fi
+
+    set +e
+    python3 $CLIENT_PY $TEST_CASE -v > $CLIENT_LOG 2>&1
+
+    if [ $? -ne 0 ]; then
+        cat $CLIENT_LOG
+        echo -e "\n***\n*** Running $CLIENT_PY $TEST_CASE FAILED. \n***"
+        RET=1
+    else
+        check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
+        if [ $? -ne 0 ]; then
+            cat $CLIENT_LOG
+            echo -e "\n***\n*** Test Result Verification FAILED.\n***"
+            RET=1
+        fi
+    fi
+    set -e
+
+    kill $SERVER_PID
+    wait $SERVER_PID
+}
+
+RET=0
+
+# Test disabling vLLM metrics reporting without parameter "REPORT_CUSTOM_METRICS" in config.pbtxt
+copy_model_repository
+run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled
 
 # Test disabling vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "no" in config.pbtxt
 copy_model_repository
@@ -90,33 +96,7 @@ parameters: {
 }
 }
 " >> models/vllm_opt/config.pbtxt
-
-run_server
-if [ "$SERVER_PID" == "0" ]; then
-    cat $SERVER_LOG
-    echo -e "\n***\n*** Failed to start $SERVER\n***"
-    exit 1
-fi
-
-set +e
-python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled -v > $CLIENT_LOG 2>&1
-
-if [ $? -ne 0 ]; then
-    cat $CLIENT_LOG
-    echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled FAILED. \n***"
-    RET=1
-else
-    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
-    if [ $? -ne 0 ]; then
-        cat $CLIENT_LOG
-        echo -e "\n***\n*** Test Result Verification FAILED.\n***"
-        RET=1
-    fi
-fi
-set -e
-
-kill $SERVER_PID
-wait $SERVER_PID
+run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled
 
 # Test vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "yes" in config.pbtxt
 copy_model_repository
@@ -129,33 +109,22 @@ parameters: {
 }
 }
 " >> models/vllm_opt/config.pbtxt
+run_test VLLMTritonMetricsTest.test_vllm_metrics
 
-run_server
-if [ "$SERVER_PID" == "0" ]; then
-    cat $SERVER_LOG
-    echo -e "\n***\n*** Failed to start $SERVER\n***"
-    exit 1
-fi
-
-set +e
-python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics -v > $CLIENT_LOG 2>&1
-
-if [ $? -ne 0 ]; then
-    cat $CLIENT_LOG
-    echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics FAILED. \n***"
-    RET=1
-else
-    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
-    if [ $? -ne 0 ]; then
-        cat $CLIENT_LOG
-        echo -e "\n***\n*** Test Result Verification FAILED.\n***"
-        RET=1
-    fi
-fi
-set -e
-
-kill $SERVER_PID
-wait $SERVER_PID
+# Test vLLM metrics custom sampling parameters
+# Custom sampling parameters may result in different vLLM output depending
+# on the platform. Therefore, these metrics are tested separately.
+copy_model_repository
+cp ${SAMPLE_MODELS_REPO}/vllm_model/config.pbtxt models/vllm_opt
+echo -e "
+parameters: {
+key: \"REPORT_CUSTOM_METRICS\"
+value: {
+string_value:\"yes\"
+}
+}
+" >> models/vllm_opt/config.pbtxt
+run_test VLLMTritonMetricsTest.test_custom_sampling_params
 
 # Test enabling vLLM metrics reporting in config.pbtxt but disabling in model.json
 copy_model_repository
@@ -169,33 +138,7 @@ parameters: {
 }
 }
 " >> models/vllm_opt/config.pbtxt
-
-run_server
-if [ "$SERVER_PID" == "0" ]; then
-    cat $SERVER_LOG
-    echo -e "\n***\n*** Failed to start $SERVER\n***"
-    exit 1
-fi
-
-set +e
-python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled -v > $CLIENT_LOG 2>&1
-
-if [ $? -ne 0 ]; then
-    cat $CLIENT_LOG
-    echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled FAILED. \n***"
-    RET=1
-else
-    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
-    if [ $? -ne 0 ]; then
-        cat $CLIENT_LOG
-        echo -e "\n***\n*** Test Result Verification FAILED.\n***"
-        RET=1
-    fi
-fi
-set -e
-
-kill $SERVER_PID
-wait $SERVER_PID
+run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled
 
 # Test enabling vLLM metrics reporting in config.pbtxt while disabling in server option
 copy_model_repository
@@ -208,32 +151,8 @@ parameters: {
 }
 " >> models/vllm_opt/config.pbtxt
 SERVER_ARGS="${SERVER_ARGS} --allow-metrics=false"
-run_server
-if [ "$SERVER_PID" == "0" ]; then
-    cat $SERVER_LOG
-    echo -e "\n***\n*** Failed to start $SERVER\n***"
-    exit 1
-fi
-
-set +e
-python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_refused -v > $CLIENT_LOG 2>&1
-
-if [ $? -ne 0 ]; then
-    cat $CLIENT_LOG
-    echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_refused FAILED. \n***"
-    RET=1
-else
-    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
-    if [ $? -ne 0 ]; then
-        cat $CLIENT_LOG
-        echo -e "\n***\n*** Test Result Verification FAILED.\n***"
-        RET=1
-    fi
-fi
-set -e
+run_test VLLMTritonMetricsTest.test_vllm_metrics_refused
 
-kill $SERVER_PID
-wait $SERVER_PID
 rm -rf "./models" "temp.json"
 
 if [ $RET -eq 1 ]; then
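
Each scenario in test.sh above follows the same pattern: copy a fresh model repository, optionally append a REPORT_CUSTOM_METRICS parameter block to config.pbtxt, then call run_test with one test case. As background, a Triton Python-backend model sees such config.pbtxt parameters in the model-config JSON handed to initialize(); the sketch below only illustrates that general pattern — the helper name is hypothetical and this is not the vLLM backend's actual code.

    import json

    def report_custom_metrics_enabled(args):
        # "args" is shaped like the dict Triton passes to TritonPythonModel.initialize();
        # config.pbtxt parameters appear under model_config["parameters"] as
        # {"REPORT_CUSTOM_METRICS": {"string_value": "yes"}}.
        model_config = json.loads(args["model_config"])
        parameters = model_config.get("parameters", {})
        flag = parameters.get("REPORT_CUSTOM_METRICS", {}).get("string_value", "no")
        return flag.lower() == "yes"

    # Example mirroring the parameters block that test.sh appends:
    args = {
        "model_config": json.dumps(
            {"parameters": {"REPORT_CUSTOM_METRICS": {"string_value": "yes"}}}
        )
    }
    print(report_custom_metrics_enabled(args))  # True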
47 changes: 32 additions & 15 deletions ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
@@ -112,29 +112,25 @@ def vllm_infer(
         self.triton_client.stop_stream()
 
     def test_vllm_metrics(self):
-        # Adding sampling parameters for testing metrics.
-        # Definitions can be found here https://docs.vllm.ai/en/latest/dev/sampling_params.html
-        n, best_of = 2, 4
-        custom_sampling_parameters = self.sampling_parameters.copy()
-        # Changing "temperature" because "best_of" must be 1 when using greedy
-        # sampling, i.e. "temperature": "0".
-        custom_sampling_parameters.update(
-            {"n": str(n), "best_of": str(best_of), "temperature": "1"}
-        )
-
         # Test vLLM metrics
         self.vllm_infer(
             prompts=self.prompts,
-            sampling_parameters=custom_sampling_parameters,
+            sampling_parameters=self.sampling_parameters,
             model_name=self.vllm_model_name,
         )
         metrics_dict = self.parse_vllm_metrics()
         total_prompts = len(self.prompts)
 
         # vllm:prompt_tokens_total
+        # (2, 133, 144, 2702, 3477, 16)
+        # (2, 133, 812, 9, 1470, 16)
+        # (2, 133, 499, 9, 4687, 16)
         self.assertEqual(metrics_dict["vllm:prompt_tokens_total"], 18)
         # vllm:generation_tokens_total
-        self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 188)
+        # (5, 65, 14, 16, 144, 533, 7, 28, 848, 30, 10, 512, 4, 50118, 100, 437)
+        # (5, 812, 9, 5, 1515, 3497, 4, 50118, 50118, 133, 812, 9, 1470, 16, 5, 812)
+        # (11, 5, 1420, 9, 5, 82, 4, 50118, 50118, 133, 499, 9, 4687, 16, 11, 5)
+        self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48)
Review thread on the generation-token assertion:

Collaborator: Could you please update the comment above with the expected answer and explain why we check 48? I would like to avoid magic numbers in the code, to speed up debugging in the future.

Contributor (author): I didn't find a way to print generation tokens directly from the vLLM engine, and the calculation of generation_tokens_total isn't straightforward (see https://github.com/vllm-project/vllm/blob/da1f7cc12a12ea4a744d26122e9a13ea4b3f4c7b/vllm/engine/llm_engine.py#L1086-L1088).

Collaborator: How did you define 48 as the expected number of tokens?

Contributor (author): Assuming vLLM reports the correct number of tokens in the current version (0.5.3-post1), this test makes sure the number stays consistent.

Collaborator: Amazing, thanks for this update!

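To take the magic out of the two expected totals: the token-ID comments added in the diff above account for them exactly — each of the three prompts tokenizes to 6 prompt tokens, and each listed completion has 16 generated tokens (presumably the test's max_tokens cap). A minimal arithmetic check using only the IDs quoted in those comments:

    # Token IDs copied verbatim from the comments in vllm_metrics_test.py above.
    prompt_token_ids = [
        (2, 133, 144, 2702, 3477, 16),
        (2, 133, 812, 9, 1470, 16),
        (2, 133, 499, 9, 4687, 16),
    ]
    generation_token_ids = [
        (5, 65, 14, 16, 144, 533, 7, 28, 848, 30, 10, 512, 4, 50118, 100, 437),
        (5, 812, 9, 5, 1515, 3497, 4, 50118, 50118, 133, 812, 9, 1470, 16, 5, 812),
        (11, 5, 1420, 9, 5, 82, 4, 50118, 50118, 133, 499, 9, 4687, 16, 11, 5),
    ]

    # vllm:prompt_tokens_total = 6 tokens per prompt * 3 prompts = 18
    assert sum(len(ids) for ids in prompt_token_ids) == 18
    # vllm:generation_tokens_total = 16 tokens per completion * 3 prompts = 48
    assert sum(len(ids) for ids in generation_token_ids) == 48

If a future vLLM upgrade shifts these totals, recomputing the sums from freshly dumped token IDs is the quickest way to see which side changed.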
         # vllm:time_to_first_token_seconds
         self.assertEqual(
             metrics_dict["vllm:time_to_first_token_seconds_count"], total_prompts
@@ -166,13 +162,34 @@ def test_vllm_metrics(self):
         # vllm:request_generation_tokens
         self.assertEqual(
             metrics_dict["vllm:request_generation_tokens_count"],
-            best_of * total_prompts,
+            total_prompts,
         )
-        self.assertEqual(metrics_dict["vllm:request_generation_tokens_sum"], 188)
+        self.assertEqual(metrics_dict["vllm:request_generation_tokens_sum"], 48)
         self.assertEqual(
             metrics_dict["vllm:request_generation_tokens_bucket"],
-            best_of * total_prompts,
+            total_prompts,
         )
+
+    def test_custom_sampling_params(self):
+        # Adding sampling parameters for testing metrics.
+        # Definitions can be found here https://docs.vllm.ai/en/latest/dev/sampling_params.html
+        n, best_of = 2, 4
+        custom_sampling_parameters = self.sampling_parameters.copy()
+        # Changing "temperature" because "best_of" must be 1 when using greedy
+        # sampling, i.e. "temperature": "0".
+        custom_sampling_parameters.update(
+            {"n": str(n), "best_of": str(best_of), "temperature": "1"}
+        )
+
+        # Test vLLM metrics
+        self.vllm_infer(
+            prompts=self.prompts,
+            sampling_parameters=custom_sampling_parameters,
+            model_name=self.vllm_model_name,
+        )
+        metrics_dict = self.parse_vllm_metrics()
+        total_prompts = len(self.prompts)
+
         # vllm:request_params_best_of
         self.assertEqual(
             metrics_dict["vllm:request_params_best_of_count"], total_prompts
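
The assertions above key into a dict returned by parse_vllm_metrics(), a helper that is not part of this diff. As a rough illustration of what such a helper typically does (an assumption about its behavior, not the repository's implementation), it can scrape Triton's Prometheus metrics endpoint — port 8002 by default — and collect the vllm:* samples into a dict keyed by metric name:

    import urllib.request

    def parse_vllm_metrics(url="http://localhost:8002/metrics"):
        # Fetch the Prometheus text exposition and keep only vLLM samples.
        text = urllib.request.urlopen(url).read().decode("utf-8")
        metrics = {}
        for line in text.splitlines():
            # Skip blanks, HELP/TYPE comments, and non-vLLM metric families.
            if not line or line.startswith("#") or not line.startswith("vllm:"):
                continue
            name_with_labels, _, value = line.rpartition(" ")
            name = name_with_labels.split("{", 1)[0]  # drop {model=...,version=...} labels
            metrics[name] = float(value)
        return metrics

    # Example (server running with metrics enabled):
    # print(parse_vllm_metrics()["vllm:prompt_tokens_total"])

In this simplified sketch, later samples with the same name (for example, histogram buckets) overwrite earlier ones; a real helper would need to aggregate those explicitly.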