Add benchmark_latency.py to docker serving image #12283

Merged
merged 1 commit from add-pr into main on Oct 28, 2024

Conversation

@gc-fu (Contributor) commented on Oct 28, 2024

Description

Add benchmark_latency.py to the vLLM serving image.

This script provides a --profile option that profiles the vLLM serving engine while it runs.

It changes the original script as follows:

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 97afd301..3a80fcbb 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -11,6 +11,7 @@ from tqdm import tqdm

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
+from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
 from vllm.inputs import PromptInputs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser
@@ -47,6 +48,9 @@ def main(args: argparse.Namespace):
         distributed_executor_backend=args.distributed_executor_backend,
         otlp_traces_endpoint=args.otlp_traces_endpoint,
         enable_prefix_caching=args.enable_prefix_caching,
+        load_in_low_bit=args.load_in_low_bit,
+        max_num_batched_tokens=args.max_num_batched_tokens,
+        max_num_seqs=args.max_num_seqs,
     )

     sampling_params = SamplingParams(
@@ -67,17 +71,28 @@ def main(args: argparse.Namespace):

     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
-            with torch.profiler.profile(
-                    activities=[
-                        torch.profiler.ProfilerActivity.CPU,
-                        torch.profiler.ProfilerActivity.CUDA,
-                    ],
-                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                        str(profile_dir))) as p:
-                llm.generate(dummy_inputs,
-                             sampling_params=sampling_params,
-                             use_tqdm=False)
-            print(p.key_averages())
+            if args.device == "xpu":
+                with torch.autograd.profiler_legacy.profile(enabled=True, use_xpu=True) as p:
+                    llm.generate(dummy_inputs,
+                                sampling_params=sampling_params,
+                                use_tqdm=False)
+                print("Sort by CPU time total...")
+                print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1))
+                print("Sort by XPU time total...")
+                print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=-1))
+            else:
+                with torch.profiler.profile(
+                        activities=[
+                            torch.profiler.ProfilerActivity.CPU,
+                            # torch.profiler.ProfilerActivity.XPU,
+                            torch.profiler.ProfilerActivity.CUDA,
+                        ],
+                        on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                            str(profile_dir))) as p:
+                    llm.generate(dummy_inputs,
+                                sampling_params=sampling_params,
+                                use_tqdm=False)
+                print(p.key_averages())
         else:
             start_time = time.perf_counter()
             llm.generate(dummy_inputs,
@@ -281,5 +296,20 @@ if __name__ == '__main__':
         type=str,
         default=None,
         help='Target URL to which OpenTelemetry traces will be sent.')
+    parser.add_argument(
+        "--load-in-low-bit",
+        type=str,
+        choices=["sym_int4", "fp8", "fp8_e4m3", "fp16", "fp6"],
+        default="sym_int4",
+        help="Low-bit format quantization with IPEX-LLM")
+    parser.add_argument('--max-num-batched-tokens',
+                        type=int,
+                        default=4096,
+                        help='maximum number of batched tokens per iteration')
+
+    parser.add_argument('--max-num-seqs',
+                        type=int,
+                        default=256,
+                        help='Maximum number of sequences per iteration.')
     args = parser.parse_args()
     main(args)
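
A minimal, self-contained sketch of the profiling branch added above, using a small matmul in place of llm.generate(). This is an illustration only: it assumes an XPU build of PyTorch with intel_extension_for_pytorch installed, which is what lets the legacy autograd profiler accept use_xpu=True and provide the "self_xpu_time_total" sort key.

import torch

def profile_once(device: str, profile_dir: str) -> None:
    x = torch.randn(1024, 1024, device=device)
    if device == "xpu":
        # XPU path: the legacy autograd profiler prints summary tables to stdout.
        with torch.autograd.profiler_legacy.profile(enabled=True, use_xpu=True) as p:
            torch.matmul(x, x)
        print(p.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1))
        print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=-1))
    else:
        # CUDA path: torch.profiler writes a TensorBoard trace into profile_dir.
        with torch.profiler.profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA,
                ],
                on_trace_ready=torch.profiler.tensorboard_trace_handler(profile_dir)) as p:
            torch.matmul(x, x)
        print(p.key_averages())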

Result:

[Screenshot: profiler output from the benchmark run]

Example:

python /llm/benchmark_latency.py \
  --model /llm/models/Llama-2-7b-chat-hf \
  --tensor-parallel-size 1 \
  --input-len 32 \
  --output-len 128 \
  --batch-size 5 \
  --n 1 \
  --num-iters-warmup 3 \
  --num-iters 5 \
  --trust-remote-code \
  --dtype half \
  --enforce-eager \
  --device xpu \
  --block-size 8 \
  --use-v2-block-manager \
  --gpu-memory-utilization 0.90 \
  --load-in-low-bit fp8 \
  --max-num-batched-tokens 4096 \
  --profile \
  --profile-result-dir /llm/benchmark_results
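
For reference, the same configuration can be expressed directly in Python. This is a hedged sketch, not a documented API: the keyword set mirrors the engine arguments the modified script forwards (including load_in_low_bit from this PR), and the model path, prompt, and output handling are placeholders.

from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM
from vllm import SamplingParams

# Engine options mirroring the CLI flags in the example command above.
llm = LLM(
    model="/llm/models/Llama-2-7b-chat-hf",   # placeholder path from the example
    tensor_parallel_size=1,
    dtype="half",
    enforce_eager=True,
    device="xpu",
    block_size=8,
    gpu_memory_utilization=0.90,
    trust_remote_code=True,
    load_in_low_bit="fp8",                    # IPEX-LLM low-bit format added in this PR
    max_num_batched_tokens=4096,
    max_num_seqs=256,
)

# Sampling setup similar to benchmark_latency.py: fixed output length, EOS ignored.
sampling_params = SamplingParams(n=1, temperature=1.0, top_p=1.0,
                                 ignore_eos=True, max_tokens=128)

outputs = llm.generate(["San Francisco is a"],
                       sampling_params=sampling_params,
                       use_tqdm=False)
print(outputs[0].outputs[0].text)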

@gc-fu gc-fu requested a review from xiangyuT October 28, 2024 08:17

@xiangyuT (Contributor) left a comment:
LGTM

@gc-fu gc-fu requested review from hzjane and xiangyuT October 28, 2024 08:18
@gc-fu gc-fu merged commit 67014cb into main Oct 28, 2024
@gc-fu gc-fu deleted the add-pr branch October 28, 2024 08:20