Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Fix L0_backend_vllm* jobs #62

Merged
merged 3 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 47 additions & 128 deletions ci/L0_backend_vllm/metrics_test/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,36 +49,42 @@ function copy_model_repository {
sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/vllm_opt/1/model.json
}

RET=0

# Test disabling vLLM metrics reporting without parameter "REPORT_CUSTOM_METRICS" in config.pbtxt
copy_model_repository
run_server
if [ "$SERVER_PID" == "0" ]; then
cat $SERVER_LOG
echo -e "\n***\n*** Failed to start $SERVER\n***"
exit 1
fi
run_test() {
local TEST_CASE=$1

run_server
if [ "$SERVER_PID" == "0" ]; then
cat $SERVER_LOG
echo -e "\n***\n*** Failed to start $SERVER\n***"
exit 1
fi

set +e
python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled -v > $CLIENT_LOG 2>&1
set +e
python3 $CLIENT_PY $TEST_CASE -v > $CLIENT_LOG 2>&1

if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled FAILED. \n***"
RET=1
else
check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test Result Verification FAILED.\n***"
echo -e "\n***\n*** Running $CLIENT_PY $TEST_CASE FAILED. \n***"
RET=1
else
check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test Result Verification FAILED.\n***"
RET=1
fi
fi
fi
set -e
set -e

kill $SERVER_PID
wait $SERVER_PID
kill $SERVER_PID
wait $SERVER_PID
}

RET=0

# Test disabling vLLM metrics reporting without parameter "REPORT_CUSTOM_METRICS" in config.pbtxt
copy_model_repository
run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled

# Test disabling vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "no" in config.pbtxt
copy_model_repository
Expand All @@ -90,33 +96,7 @@ parameters: {
}
}
" >> models/vllm_opt/config.pbtxt

run_server
if [ "$SERVER_PID" == "0" ]; then
cat $SERVER_LOG
echo -e "\n***\n*** Failed to start $SERVER\n***"
exit 1
fi

set +e
python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled -v > $CLIENT_LOG 2>&1

if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled FAILED. \n***"
RET=1
else
check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test Result Verification FAILED.\n***"
RET=1
fi
fi
set -e

kill $SERVER_PID
wait $SERVER_PID
run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled

# Test vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "yes" in config.pbtxt
copy_model_repository
Expand All @@ -129,33 +109,22 @@ parameters: {
}
}
" >> models/vllm_opt/config.pbtxt
run_test VLLMTritonMetricsTest.test_vllm_metrics

run_server
if [ "$SERVER_PID" == "0" ]; then
cat $SERVER_LOG
echo -e "\n***\n*** Failed to start $SERVER\n***"
exit 1
fi

set +e
python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics -v > $CLIENT_LOG 2>&1

if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics FAILED. \n***"
RET=1
else
check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test Result Verification FAILED.\n***"
RET=1
fi
fi
set -e

kill $SERVER_PID
wait $SERVER_PID
# Test vLLM metrics custom sampling parameters
# Custom sampling parameters may result in different vLLM output depending
# on the platform. Therefore, these metrics are tested separately.
copy_model_repository
cp ${SAMPLE_MODELS_REPO}/vllm_model/config.pbtxt models/vllm_opt
echo -e "
parameters: {
key: \"REPORT_CUSTOM_METRICS\"
value: {
string_value:\"yes\"
}
}
" >> models/vllm_opt/config.pbtxt
run_test VLLMTritonMetricsTest.test_custom_sampling_params

# Test enabling vLLM metrics reporting in config.pbtxt but disabling in model.json
copy_model_repository
Expand All @@ -169,33 +138,7 @@ parameters: {
}
}
" >> models/vllm_opt/config.pbtxt

run_server
if [ "$SERVER_PID" == "0" ]; then
cat $SERVER_LOG
echo -e "\n***\n*** Failed to start $SERVER\n***"
exit 1
fi

set +e
python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled -v > $CLIENT_LOG 2>&1

if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_disabled FAILED. \n***"
RET=1
else
check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test Result Verification FAILED.\n***"
RET=1
fi
fi
set -e

kill $SERVER_PID
wait $SERVER_PID
run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled

# Test enabling vLLM metrics reporting in config.pbtxt while disabling in server option
copy_model_repository
Expand All @@ -208,32 +151,8 @@ parameters: {
}
" >> models/vllm_opt/config.pbtxt
SERVER_ARGS="${SERVER_ARGS} --allow-metrics=false"
run_server
if [ "$SERVER_PID" == "0" ]; then
cat $SERVER_LOG
echo -e "\n***\n*** Failed to start $SERVER\n***"
exit 1
fi

set +e
python3 $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_refused -v > $CLIENT_LOG 2>&1

if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Running $CLIENT_PY VLLMTritonMetricsTest.test_vllm_metrics_refused FAILED. \n***"
RET=1
else
check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test Result Verification FAILED.\n***"
RET=1
fi
fi
set -e
run_test VLLMTritonMetricsTest.test_vllm_metrics_refused

kill $SERVER_PID
wait $SERVER_PID
rm -rf "./models" "temp.json"

if [ $RET -eq 1 ]; then
Expand Down
47 changes: 32 additions & 15 deletions ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,29 +112,25 @@ def vllm_infer(
self.triton_client.stop_stream()

def test_vllm_metrics(self):
# Adding sampling parameters for testing metrics.
# Definitions can be found here https://docs.vllm.ai/en/latest/dev/sampling_params.html
n, best_of = 2, 4
custom_sampling_parameters = self.sampling_parameters.copy()
# Changing "temperature" because "best_of" must be 1 when using greedy
# sampling, i.e. "temperature": "0".
custom_sampling_parameters.update(
{"n": str(n), "best_of": str(best_of), "temperature": "1"}
)

# Test vLLM metrics
self.vllm_infer(
prompts=self.prompts,
sampling_parameters=custom_sampling_parameters,
sampling_parameters=self.sampling_parameters,
model_name=self.vllm_model_name,
)
metrics_dict = self.parse_vllm_metrics()
total_prompts = len(self.prompts)

# vllm:prompt_tokens_total
# (2, 133, 144, 2702, 3477, 16)
# (2, 133, 812, 9, 1470, 16)
# (2, 133, 499, 9, 4687, 16)
self.assertEqual(metrics_dict["vllm:prompt_tokens_total"], 18)
# vllm:generation_tokens_total
self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 188)
# (5, 65, 14, 16, 144, 533, 7, 28, 848, 30, 10, 512, 4, 50118, 100, 437)
# (5, 812, 9, 5, 1515, 3497, 4, 50118, 50118, 133, 812, 9, 1470, 16, 5, 812)
# (11, 5, 1420, 9, 5, 82, 4, 50118, 50118, 133, 499, 9, 4687, 16, 11, 5)
self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please update the comment above with the expected answer and explain why we check 48? I would like to avoid magic numbers in the code, to speed up debugging in the future.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't find a way to print generation tokens directly from the vLLM engine. And the calculation of generation_tokens_total isn't straightforward (see https://github.com/vllm-project/vllm/blob/da1f7cc12a12ea4a744d26122e9a13ea4b3f4c7b/vllm/engine/llm_engine.py#L1086-L1088).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How did you determine 48 as the expected number of tokens?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assuming vLLM reports the correct number of tokens in the current version (0.5.3-post1), this test makes sure the number stays consistent.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Amazing, thanks for this update!

# vllm:time_to_first_token_seconds
self.assertEqual(
metrics_dict["vllm:time_to_first_token_seconds_count"], total_prompts
Expand Down Expand Up @@ -166,13 +162,34 @@ def test_vllm_metrics(self):
# vllm:request_generation_tokens
self.assertEqual(
metrics_dict["vllm:request_generation_tokens_count"],
best_of * total_prompts,
total_prompts,
)
self.assertEqual(metrics_dict["vllm:request_generation_tokens_sum"], 188)
self.assertEqual(metrics_dict["vllm:request_generation_tokens_sum"], 48)
self.assertEqual(
metrics_dict["vllm:request_generation_tokens_bucket"],
best_of * total_prompts,
total_prompts,
)

def test_custom_sampling_params(self):
# Adding sampling parameters for testing metrics.
# Definitions can be found here https://docs.vllm.ai/en/latest/dev/sampling_params.html
n, best_of = 2, 4
custom_sampling_parameters = self.sampling_parameters.copy()
# Changing "temperature" because "best_of" must be 1 when using greedy
# sampling, i.e. "temperature": "0".
custom_sampling_parameters.update(
{"n": str(n), "best_of": str(best_of), "temperature": "1"}
)

# Test vLLM metrics
self.vllm_infer(
prompts=self.prompts,
sampling_parameters=custom_sampling_parameters,
model_name=self.vllm_model_name,
)
metrics_dict = self.parse_vllm_metrics()
total_prompts = len(self.prompts)

# vllm:request_params_best_of
self.assertEqual(
metrics_dict["vllm:request_params_best_of_count"], total_prompts
Expand Down
Loading