From 7dfa689da9b1a8e3b9fb108791df59aa91e2fdbf Mon Sep 17 00:00:00 2001
From: KepingYan
Date: Thu, 6 Jun 2024 16:27:47 +0800
Subject: [PATCH 1/3] fix parameters mismatch

---
 benchmarks/run_benchmark.sh | 10 +++++-----
 docs/benchmark.md           |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/benchmarks/run_benchmark.sh b/benchmarks/run_benchmark.sh
index 188f5c415..efef16b3b 100644
--- a/benchmarks/run_benchmark.sh
+++ b/benchmarks/run_benchmark.sh
@@ -61,7 +61,7 @@ get_peak_throughpt(){
         echo "RUN llm-on-ray with vllm"
         echo "RUN bs ${vllm_bs}"
         # server:
-        $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $vllm_bs
+        $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --vllm_max_num_seqs $vllm_bs
         # client:
         $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_SHAREGPT_PATH --num-prompts $num_prompts --dataset-format ShareGPT --vllm-engine --simple --results-dir $bs_dir_vllm
     done
@@ -80,7 +80,7 @@ metric_bs(){
         echo "RUN llm-on-ray with vllm"
         echo "RUN bs ${vllm_bs}"
         # server:
-        $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $vllm_bs
+        $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --vllm_max_num_seqs $vllm_bs
         # client:
         $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_SHAREGPT_PATH --num-prompts $num_prompts --dataset-format ShareGPT --vllm-engine --simple --results-dir $bs_dir_vllm
     done
@@ -90,7 +90,7 @@
         echo "RUN bs ${wo_vllm_bs}"
         bs_dir_wo_vllm=$choice_dir_wo_vllm"/bs_"$wo_vllm_bs
         # server:
-        $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WO_VLLM_CONFIG_FILE --simple --max_concurrent_queries $wo_vllm_bs
+        $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WO_VLLM_CONFIG_FILE --simple --max_ongoing_requests $wo_vllm_bs
         # client:
         $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_SHAREGPT_PATH --num-prompts $num_prompts --dataset-format ShareGPT --simple --results-dir $bs_dir_wo_vllm
     done
@@ -107,7 +107,7 @@ latency_throughput(){
     tokens_dir=$choice_dir"/tokens_"$input_tokens_length"_"$output_tokens_length

     # server
-    $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $VALUE_INF
+    $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --vllm_max_num_seqs $VALUE_INF

     # client
     for i in $(seq 1 $num_iter)
@@ -133,7 +133,7 @@ get_best_latency(){
     choice_dir=${4}

     # server
-    $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $VALUE_INF
+    $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --vllm_max_num_seqs $VALUE_INF

     # client
     for i in $(seq 1 $num_iter)
diff --git a/docs/benchmark.md b/docs/benchmark.md
index 7c35a144c..829267dd7 100644
--- a/docs/benchmark.md
+++ b/docs/benchmark.md
@@ -113,7 +113,7 @@ cpus_per_worker: 24
 ```
 3. Deploy server
 ```cmd
-OMP_NUM_THREAD=24 numactl -N 0 -m 0 -C 0-47 python -u inference/serve.py --config_file inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml --keep_serve_terminal --max_concurrent_queries 1000 --vllm_max_num_seqs 256 --simple
+OMP_NUM_THREAD=24 numactl -N 0 -m 0 -C 0-47 python -u inference/serve.py --config_file inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml --keep_serve_terminal --max_ongoing_requests 1000 --max_num_seqs 256 --simple
 ```
 4. Send requests
 ```cmd

From edf99a12f56da51a12baa16dfe980b2503ee77e3 Mon Sep 17 00:00:00 2001
From: KepingYan
Date: Thu, 6 Jun 2024 16:30:46 +0800
Subject: [PATCH 2/3] fix parameters mismatch

---
 benchmarks/run_benchmark.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks/run_benchmark.sh b/benchmarks/run_benchmark.sh
index efef16b3b..7cd0d07fb 100644
--- a/benchmarks/run_benchmark.sh
+++ b/benchmarks/run_benchmark.sh
@@ -61,7 +61,7 @@ get_peak_throughpt(){
         echo "RUN llm-on-ray with vllm"
         echo "RUN bs ${vllm_bs}"
         # server:
-        $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --vllm_max_num_seqs $vllm_bs
+        $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --max_num_seqs $vllm_bs
         # client:
         $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_SHAREGPT_PATH --num-prompts $num_prompts --dataset-format ShareGPT --vllm-engine --simple --results-dir $bs_dir_vllm
     done
@@ -80,7 +80,7 @@ metric_bs(){
         echo "RUN llm-on-ray with vllm"
         echo "RUN bs ${vllm_bs}"
         # server:
-        $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --vllm_max_num_seqs $vllm_bs
+        $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --max_num_seqs $vllm_bs
         # client:
         $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_SHAREGPT_PATH --num-prompts $num_prompts --dataset-format ShareGPT --vllm-engine --simple --results-dir $bs_dir_vllm
     done
@@ -107,7 +107,7 @@ latency_throughput(){
     tokens_dir=$choice_dir"/tokens_"$input_tokens_length"_"$output_tokens_length

     # server
-    $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --vllm_max_num_seqs $VALUE_INF
+    $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --max_num_seqs $VALUE_INF

     # client
     for i in $(seq 1 $num_iter)
@@ -133,7 +133,7 @@ get_best_latency(){
     choice_dir=${4}

     # server
-    $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --vllm_max_num_seqs $VALUE_INF
+    $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --max_num_seqs $VALUE_INF

     # client
     for i in $(seq 1 $num_iter)

From f4e7318b0490e30f6124b7c42a786713fa707b60 Mon Sep 17 00:00:00 2001
From: KepingYan
Date: Fri, 7 Jun 2024 09:04:06 +0800
Subject: [PATCH 3/3] disable benchmark ci in pr

---
 .github/workflows/workflow_orders_on_pr.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/workflow_orders_on_pr.yml b/.github/workflows/workflow_orders_on_pr.yml
index 9c9ade519..acec72a6d 100644
--- a/.github/workflows/workflow_orders_on_pr.yml
+++ b/.github/workflows/workflow_orders_on_pr.yml
@@ -27,7 +27,3 @@ jobs:
   Finetune:
     needs: Lint
     uses: ./.github/workflows/workflow_finetune.yml
-
-  Benchmark:
-    needs: Lint
-    uses: ./.github/workflows/workflow_test_benchmark.yml