Add benchmark_latency.py to docker serving image #12283

Merged
merged 1 commit from add-pr into main on Oct 28, 2024

Conversation

@gc-fu (Contributor) commented on Oct 28, 2024

Description

Add benchmark_latency.py to the vLLM serving image.

This script provides a --profile option that profiles the vLLM serving engine while it runs.

It changes the original script as follows:

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 97afd301..3a80fcbb 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -11,6 +11,7 @@ from tqdm import tqdm

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
+from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
 from vllm.inputs import PromptInputs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser
@@ -47,6 +48,9 @@ def main(args: argparse.Namespace):
         distributed_executor_backend=args.distributed_executor_backend,
         otlp_traces_endpoint=args.otlp_traces_endpoint,
         enable_prefix_caching=args.enable_prefix_caching,
+        load_in_low_bit=args.load_in_low_bit,
+        max_num_batched_tokens=args.max_num_batched_tokens,
+        max_num_seqs=args.max_num_seqs,
     )

     sampling_params = SamplingParams(
@@ -67,17 +71,28 @@ def main(args: argparse.Namespace):

     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
-            with torch.profiler.profile(
-                    activities=[
-                        torch.profiler.ProfilerActivity.CPU,
-                        torch.profiler.ProfilerActivity.CUDA,
-                    ],
-                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                        str(profile_dir))) as p:
-                llm.generate(dummy_inputs,
-                             sampling_params=sampling_params,
-                             use_tqdm=False)
-            print(p.key_averages())
+            if args.device == "xpu":
+                with torch.autograd.profiler_legacy.profile(enabled=True, use_xpu=True) as p:
+                    llm.generate(dummy_inputs,
+                                sampling_params=sampling_params,
+                                use_tqdm=False)
+                print("Sort by CPU time total...")
+                print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1))
+                print("Sort by XPU time total...")
+                print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=-1))
+            else:
+                with torch.profiler.profile(
+                        activities=[
+                            torch.profiler.ProfilerActivity.CPU,
+                            # torch.profiler.ProfilerActivity.XPU,
+                            torch.profiler.ProfilerActivity.CUDA,
+                        ],
+                        on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                            str(profile_dir))) as p:
+                    llm.generate(dummy_inputs,
+                                sampling_params=sampling_params,
+                                use_tqdm=False)
+                print(p.key_averages())
         else:
             start_time = time.perf_counter()
             llm.generate(dummy_inputs,
@@ -281,5 +296,20 @@ if __name__ == '__main__':
         type=str,
         default=None,
         help='Target URL to which OpenTelemetry traces will be sent.')
+    parser.add_argument(
+        "--load-in-low-bit",
+        type=str,
+        choices=["sym_int4", "fp8", "fp8_e4m3", "fp16", "fp6"],
+        default="sym_int4",
+        help="Low-bit format quantization with IPEX-LLM")
+    parser.add_argument('--max-num-batched-tokens',
+                        type=int,
+                        default=4096,
+                        help='maximum number of batched tokens per iteration')
+
+    parser.add_argument('--max-num-seqs',
+                        type=int,
+                        default=256,
+                        help='Maximum number of sequences per iteration.')
     args = parser.parse_args()
     main(args)
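
A minimal, self-contained sketch of the profiling branch added above, using a small matmul in place of llm.generate(). This is an illustration only: it assumes an XPU build of PyTorch with intel_extension_for_pytorch installed, which is what lets the legacy autograd profiler accept use_xpu=True and provide the "self_xpu_time_total" sort key.

import torch

def profile_once(device: str, profile_dir: str) -> None:
    x = torch.randn(1024, 1024, device=device)
    if device == "xpu":
        # XPU path: the legacy autograd profiler prints summary tables to stdout.
        with torch.autograd.profiler_legacy.profile(enabled=True, use_xpu=True) as p:
            torch.matmul(x, x)
        print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1))
        print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=-1))
    else:
        # CUDA path: torch.profiler writes a TensorBoard trace into profile_dir.
        with torch.profiler.profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA,
                ],
                on_trace_ready=torch.profiler.tensorboard_trace_handler(profile_dir)) as p:
            torch.matmul(x, x)
        print(p.key_averages())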

Result:

[Screenshot: profiler output from the benchmark run]

Example:

python /llm/benchmark_latency.py \
  --model /llm/models/Llama-2-7b-chat-hf \
  --tensor-parallel-size 1 \
  --input-len 32 \
  --output-len 128 \
  --batch-size 5 \
  --n 1 \
  --num-iters-warmup 3 \
  --num-iters 5 \
  --trust-remote-code \
  --dtype half \
  --enforce-eager \
  --device xpu \
  --block-size 8 \
  --use-v2-block-manager \
  --gpu-memory-utilization 0.90 \
  --load-in-low-bit fp8 \
  --max-num-batched-tokens 4096 \
  --profile \
  --profile-result-dir /llm/benchmark_results
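
For reference, the same configuration can be expressed directly in Python. This is a hedged sketch, not a documented API: the keyword set mirrors the engine arguments the modified script forwards (including load_in_low_bit from this PR), and the model path, prompt, and output handling are placeholders.

from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
from vllm import SamplingParams

# Engine options mirroring the CLI flags in the example command above.
llm = LLM(
    model="/llm/models/Llama-2-7b-chat-hf",   # placeholder path from the example
    tensor_parallel_size=1,
    dtype="half",
    enforce_eager=True,
    device="xpu",
    block_size=8,
    gpu_memory_utilization=0.90,
    trust_remote_code=True,
    load_in_low_bit="fp8",                    # IPEX-LLM low-bit format added in this PR
    max_num_batched_tokens=4096,
    max_num_seqs=256,
)

# Sampling setup similar to benchmark_latency.py: fixed output length, EOS ignored.
sampling_params = SamplingParams(n=1, temperature=1.0, top_p=1.0,
                                 ignore_eos=True, max_tokens=128)

outputs = llm.generate(["San Francisco is a"],
                       sampling_params=sampling_params,
                       use_tqdm=False)
print(outputs[0].outputs[0].text)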

@gc-fu gc-fu requested a review from xiangyuT October 28, 2024 08:17

@xiangyuT (Contributor) left a comment:
LGTM

@gc-fu gc-fu requested review from hzjane and xiangyuT October 28, 2024 08:18
@gc-fu gc-fu merged commit 67014cb into main Oct 28, 2024
@gc-fu gc-fu deleted the add-pr branch October 28, 2024 08:20