
[Frontend] Add --logprobs argument to benchmark_serving.py #8191

Merged
2 changes: 2 additions & 0 deletions benchmarks/backend_request_func.py
@@ -24,6 +24,7 @@ class RequestFuncInput:
model: str
best_of: int = 1
use_beam_search: bool = False
logprobs: Optional[int] = None


@dataclass
@@ -236,6 +237,7 @@ async def async_request_openai_completions(
"temperature": 0.0,
"best_of": request_func_input.best_of,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
}
headers = {
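For context, here is a minimal sketch (not the PR's implementation) of how the new logprobs field surfaces in the request body that async_request_openai_completions sends; the helper name, model name, and prompt are illustrative placeholders:

from typing import Optional

def build_completions_payload(model: str,
                              prompt: str,
                              output_len: int,
                              logprobs: Optional[int] = None) -> dict:
    # Mirrors the payload fields touched by this diff; logprobs is forwarded
    # as-is, so a value of None reaches the server as a null field.
    return {
        "model": model,
        "prompt": prompt,
        "temperature": 0.0,
        "best_of": 1,
        "max_tokens": output_len,
        "logprobs": logprobs,
        "stream": True,
    }

payload = build_completions_payload("facebook/opt-125m", "Hello, my name is",
                                    output_len=128, logprobs=5)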
16 changes: 16 additions & 0 deletions benchmarks/benchmark_serving.py
@@ -21,6 +21,10 @@
when using tgi backend, add
--endpoint /generate_stream
to the end of the command above.

Use --logprobs <num logprobs> to specify the number of logprobs-per-token
to return as part of the request (or leave the argument unspecified
to default to 1 logprob-per-token).
"""
import argparse
import asyncio
@@ -318,6 +322,7 @@ async def benchmark(
model_id: str,
tokenizer: PreTrainedTokenizerBase,
input_requests: List[Tuple[str, int, int]],
logprobs: Optional[int],
best_of: int,
use_beam_search: bool,
request_rate: float,
@@ -339,6 +344,7 @@
api_url=api_url,
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
use_beam_search=use_beam_search,
)
@@ -358,6 +364,7 @@
api_url=base_url + "/start_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
use_beam_search=use_beam_search,
)
@@ -379,6 +386,7 @@
api_url=api_url,
prompt_len=prompt_len,
output_len=output_len,
logprobs=logprobs,
best_of=best_of,
use_beam_search=use_beam_search,
)
@@ -396,6 +404,7 @@
api_url=base_url + "/stop_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
use_beam_search=use_beam_search,
)
@@ -580,6 +589,7 @@ def main(args: argparse.Namespace):
model_id=model_id,
tokenizer=tokenizer,
input_requests=input_requests,
logprobs=args.logprobs,
best_of=args.best_of,
use_beam_search=args.use_beam_search,
request_rate=args.request_rate,
@@ -721,6 +731,12 @@ def main(args: argparse.Namespace):
help=
"Number of output tokens per request, used only for sonnet dataset.",
)
parser.add_argument(
"--logprobs",
type=int,
default=None,
help="Number of logprobs-per-token to return as part of the request.",
)
parser.add_argument(
"--sonnet-prefix-len",
type=int,
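Putting the benchmark_serving.py changes together, here is a condensed, illustrative sketch of the plumbing this PR adds (CLI flag -> benchmark() -> per-request RequestFuncInput); the signatures below are trimmed to the fields involved and are not the real ones:

import argparse
from dataclasses import dataclass
from typing import Optional

@dataclass
class RequestFuncInput:
    # Trimmed to the fields relevant to this change.
    prompt: str
    model: str
    output_len: int
    logprobs: Optional[int] = None

def benchmark(model_id: str, logprobs: Optional[int],
              prompt: str, output_len: int) -> RequestFuncInput:
    # Every request issued by the benchmark carries the same logprobs setting.
    return RequestFuncInput(prompt=prompt, model=model_id,
                            output_len=output_len, logprobs=logprobs)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--logprobs",
    type=int,
    default=None,
    help="Number of logprobs-per-token to return as part of the request.",
)
args = parser.parse_args(["--logprobs", "5"])
req = benchmark("facebook/opt-125m", args.logprobs, "Hello, my name is", 128)
print(req.logprobs)  # 5; stays None when --logprobs is omitted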
2 changes: 1 addition & 1 deletion tests/multi_step/test_correctness_llm.py
@@ -57,7 +57,7 @@ def test_multi_step_llm(
GPU -> CPU output transfer
num_prompts: number of example prompts under test
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> no logprobs
completions endpoint; `None` -> no logprobs
"""

prompts = example_prompts
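As a small, illustrative aside on the test docstring above (not code from this PR): a test parameterized by num_logprobs can treat None as "no logprobs requested", for example by emitting no logprobs override at all; the helper name here is hypothetical:

from typing import Optional

def sampling_overrides(num_logprobs: Optional[int]) -> dict:
    # None means the request asks for no logprobs; an integer asks for
    # that many logprobs per generated token.
    if num_logprobs is None:
        return {}
    return {"logprobs": num_logprobs}

assert sampling_overrides(None) == {}
assert sampling_overrides(5) == {"logprobs": 5}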