[Benchmark] Fix parameters mismatch (#247)
* fix parameters mismatch

* fix parameters mismatch

* disable benchmark ci in pr
KepingYan authored Jun 7, 2024
1 parent 4fa68af commit 3e69237
Showing 3 changed files with 6 additions and 10 deletions.
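
The mismatch being fixed is a pair of renamed serve flags: `--max_concurrent_queries` is now `--max_ongoing_requests`, and `--vllm_max_num_seqs` is now `--max_num_seqs`. A minimal sketch of the rename follows; the config path and the values 1000/256 are taken from the docs/benchmark.md example in the diff below, while the benchmark script itself passes shell variables instead of literals:

```sh
# Old flag names (pre-rename), shown only for comparison:
#   llm_on_ray-serve --config_file inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml \
#     --simple --max_concurrent_queries 1000 --vllm_max_num_seqs 256

# Updated flag names used throughout this commit:
llm_on_ray-serve --config_file inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml \
  --simple --max_ongoing_requests 1000 --max_num_seqs 256
```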
4 changes: 0 additions & 4 deletions .github/workflows/workflow_orders_on_pr.yml
@@ -27,7 +27,3 @@ jobs:
   Finetune:
     needs: Lint
     uses: ./.github/workflows/workflow_finetune.yml
-
-  Benchmark:
-    needs: Lint
-    uses: ./.github/workflows/workflow_test_benchmark.yml
10 changes: 5 additions & 5 deletions benchmarks/run_benchmark.sh
@@ -61,7 +61,7 @@ get_peak_throughpt(){
 echo "RUN llm-on-ray with vllm"
 echo "RUN bs ${vllm_bs}"
 # server:
-$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $vllm_bs
+$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --max_num_seqs $vllm_bs
 # client:
 $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_SHAREGPT_PATH --num-prompts $num_prompts --dataset-format ShareGPT --vllm-engine --simple --results-dir $bs_dir_vllm
 done
@@ -80,7 +80,7 @@ metric_bs(){
 echo "RUN llm-on-ray with vllm"
 echo "RUN bs ${vllm_bs}"
 # server:
-$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $vllm_bs
+$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --max_num_seqs $vllm_bs
 # client:
 $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_SHAREGPT_PATH --num-prompts $num_prompts --dataset-format ShareGPT --vllm-engine --simple --results-dir $bs_dir_vllm
 done
@@ -90,7 +90,7 @@ metric_bs(){
 echo "RUN bs ${wo_vllm_bs}"
 bs_dir_wo_vllm=$choice_dir_wo_vllm"/bs_"$wo_vllm_bs
 # server:
-$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WO_VLLM_CONFIG_FILE --simple --max_concurrent_queries $wo_vllm_bs
+$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WO_VLLM_CONFIG_FILE --simple --max_ongoing_requests $wo_vllm_bs
 # client:
 $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_SHAREGPT_PATH --num-prompts $num_prompts --dataset-format ShareGPT --simple --results-dir $bs_dir_wo_vllm
 done
@@ -107,7 +107,7 @@ latency_throughput(){
 tokens_dir=$choice_dir"/tokens_"$input_tokens_length"_"$output_tokens_length
 
 # server
-$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $VALUE_INF
+$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --max_num_seqs $VALUE_INF
 
 # client
 for i in $(seq 1 $num_iter)
@@ -133,7 +133,7 @@ get_best_latency(){
 choice_dir=${4}
 
 # server
-$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $VALUE_INF
+$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --max_num_seqs $VALUE_INF
 
 # client
 for i in $(seq 1 $num_iter)
2 changes: 1 addition & 1 deletion docs/benchmark.md
@@ -113,7 +113,7 @@ cpus_per_worker: 24
 ```
 3. Deploy server
 ```cmd
-OMP_NUM_THREAD=24 numactl -N 0 -m 0 -C 0-47 python -u inference/serve.py --config_file inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml --keep_serve_terminal --max_concurrent_queries 1000 --vllm_max_num_seqs 256 --simple
+OMP_NUM_THREAD=24 numactl -N 0 -m 0 -C 0-47 python -u inference/serve.py --config_file inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml --keep_serve_terminal --max_ongoing_requests 1000 --max_num_seqs 256 --simple
 ```
 4. Send requests
 ```cmd
