[Benchmark] Fix parameters mismatch #247

Merged
merged 3 commits on Jun 7, 2024
4 changes: 0 additions & 4 deletions .github/workflows/workflow_orders_on_pr.yml
@@ -27,7 +27,3 @@ jobs:
   Finetune:
     needs: Lint
     uses: ./.github/workflows/workflow_finetune.yml
-
-  Benchmark:
-    needs: Lint
-    uses: ./.github/workflows/workflow_test_benchmark.yml
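This hunk drops the Benchmark job from the on-PR workflow, so the benchmark suite no longer runs automatically for pull requests. If workflow_test_benchmark.yml also declares a manual workflow_dispatch trigger (an assumption, not shown in this diff), it could still be run on demand with the GitHub CLI, for example:

```bash
# Hypothetical manual run, assuming workflow_test_benchmark.yml exposes a
# workflow_dispatch trigger (not visible in this diff).
gh workflow run workflow_test_benchmark.yml --ref main
```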
10 changes: 5 additions & 5 deletions benchmarks/run_benchmark.sh
@@ -61,7 +61,7 @@ get_peak_throughpt(){
 echo "RUN llm-on-ray with vllm"
 echo "RUN bs ${vllm_bs}"
 # server:
-$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $vllm_bs
+$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --max_num_seqs $vllm_bs
 # client:
 $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_SHAREGPT_PATH --num-prompts $num_prompts --dataset-format ShareGPT --vllm-engine --simple --results-dir $bs_dir_vllm
 done
@@ -80,7 +80,7 @@ metric_bs(){
 echo "RUN llm-on-ray with vllm"
 echo "RUN bs ${vllm_bs}"
 # server:
-$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $vllm_bs
+$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --max_num_seqs $vllm_bs
 # client:
 $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_SHAREGPT_PATH --num-prompts $num_prompts --dataset-format ShareGPT --vllm-engine --simple --results-dir $bs_dir_vllm
 done
@@ -90,7 +90,7 @@ metric_bs(){
 echo "RUN bs ${wo_vllm_bs}"
 bs_dir_wo_vllm=$choice_dir_wo_vllm"/bs_"$wo_vllm_bs
 # server:
-$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WO_VLLM_CONFIG_FILE --simple --max_concurrent_queries $wo_vllm_bs
+$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WO_VLLM_CONFIG_FILE --simple --max_ongoing_requests $wo_vllm_bs
 # client:
 $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_SHAREGPT_PATH --num-prompts $num_prompts --dataset-format ShareGPT --simple --results-dir $bs_dir_wo_vllm
 done
@@ -107,7 +107,7 @@ latency_throughput(){
 tokens_dir=$choice_dir"/tokens_"$input_tokens_length"_"$output_tokens_length

 # server
-$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $VALUE_INF
+$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --max_num_seqs $VALUE_INF

 # client
 for i in $(seq 1 $num_iter)
@@ -133,7 +133,7 @@ get_best_latency(){
 choice_dir=${4}

 # server
-$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $VALUE_INF
+$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --max_num_seqs $VALUE_INF

 # client
 for i in $(seq 1 $num_iter)
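The flag rename in run_benchmark.sh is mechanical, so a quick repository-wide check helps confirm no stale occurrences of the old names remain; a minimal sketch, assuming it is run from the repository root after applying this PR:

```bash
# Sanity check (assumption: run from the repository root after this PR is applied).
# grep exits non-zero when nothing matches, i.e. no stale flag names remain.
if grep -rnE 'max_concurrent_queries|vllm_max_num_seqs' \
     --include='*.sh' --include='*.md' --include='*.yml' .; then
  echo "stale flag names found above" >&2
else
  echo "no stale flag names remain"
fi
```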
2 changes: 1 addition & 1 deletion docs/benchmark.md
@@ -113,7 +113,7 @@ cpus_per_worker: 24
 ```
 3. Deploy server
 ```cmd
-OMP_NUM_THREAD=24 numactl -N 0 -m 0 -C 0-47 python -u inference/serve.py --config_file inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml --keep_serve_terminal --max_concurrent_queries 1000 --vllm_max_num_seqs 256 --simple
+OMP_NUM_THREAD=24 numactl -N 0 -m 0 -C 0-47 python -u inference/serve.py --config_file inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml --keep_serve_terminal --max_ongoing_requests 1000 --max_num_seqs 256 --simple
 ```
 4. Send requests
 ```cmd
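Taken together, the renamed flags are used as follows; a minimal end-to-end sketch that reuses the server command from docs/benchmark.md and the client flags from benchmarks/run_benchmark.sh, with the shell variables and the results directory left as illustrative placeholders to be set as in run_benchmark.sh:

```bash
# Server (from docs/benchmark.md): deploy the simple endpoint with the renamed flags.
OMP_NUM_THREAD=24 numactl -N 0 -m 0 -C 0-47 python -u inference/serve.py \
  --config_file inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml \
  --keep_serve_terminal --max_ongoing_requests 1000 --max_num_seqs 256 --simple

# Client (flags as in run_benchmark.sh): $BENCHMARK_SCRIPT, $MODEL_ENDPOINT,
# $MODEL_NAME, $DATASET_SHAREGPT_PATH and $num_prompts are placeholders here,
# and ./results is an illustrative output directory.
python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT \
  --model-name $MODEL_NAME --dataset $DATASET_SHAREGPT_PATH \
  --num-prompts $num_prompts --dataset-format ShareGPT --vllm-engine --simple \
  --results-dir ./results
```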