
[Frontend] Add --logprobs argument to benchmark_serving.py #8191

Merged
2 changes: 2 additions & 0 deletions benchmarks/backend_request_func.py
@@ -24,6 +24,7 @@ class RequestFuncInput:
model: str
best_of: int = 1
use_beam_search: bool = False
logprobs: Optional[int] = None


@dataclass
@@ -236,6 +237,7 @@ async def async_request_openai_completions(
"temperature": 0.0,
"best_of": request_func_input.best_of,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
}
headers = {
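For context, here is a minimal sketch (not the PR's implementation) of how the new logprobs field surfaces in the request body that async_request_openai_completions sends; the helper name, model name, and prompt are illustrative placeholders:

from typing import Optional

def build_completions_payload(model: str,
                              prompt: str,
                              output_len: int,
                              logprobs: Optional[int] = None) -> dict:
    # Mirrors the payload fields touched by this diff; logprobs is forwarded
    # as-is, so a value of None reaches the server as a null field.
    return {
        "model": model,
        "prompt": prompt,
        "temperature": 0.0,
        "best_of": 1,
        "max_tokens": output_len,
        "logprobs": logprobs,
        "stream": True,
    }

payload = build_completions_payload("facebook/opt-125m", "Hello, my name is",
                                    output_len=128, logprobs=5)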
16 changes: 16 additions & 0 deletions benchmarks/benchmark_serving.py
@@ -21,6 +21,10 @@
when using tgi backend, add
--endpoint /generate_stream
to the end of the command above.

Use --logprobs <num logprobs> to specify the number of logprobs-per-token
to return as part of the request (or leave the argument unspecified
to default to 1 logprob-per-token).
"""
import argparse
import asyncio
@@ -318,6 +322,7 @@ async def benchmark(
model_id: str,
tokenizer: PreTrainedTokenizerBase,
input_requests: List[Tuple[str, int, int]],
logprobs: Optional[int],
best_of: int,
use_beam_search: bool,
request_rate: float,
@@ -339,6 +344,7 @@
api_url=api_url,
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
use_beam_search=use_beam_search,
)
@@ -358,6 +364,7 @@
api_url=base_url + "/start_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
use_beam_search=use_beam_search,
)
@@ -379,6 +386,7 @@
api_url=api_url,
prompt_len=prompt_len,
output_len=output_len,
logprobs=logprobs,
best_of=best_of,
use_beam_search=use_beam_search,
)
@@ -396,6 +404,7 @@
api_url=base_url + "/stop_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
use_beam_search=use_beam_search,
)
@@ -580,6 +589,7 @@ def main(args: argparse.Namespace):
model_id=model_id,
tokenizer=tokenizer,
input_requests=input_requests,
logprobs=args.logprobs,
best_of=args.best_of,
use_beam_search=args.use_beam_search,
request_rate=args.request_rate,
@@ -721,6 +731,12 @@ def main(args: argparse.Namespace):
help=
"Number of output tokens per request, used only for sonnet dataset.",
)
parser.add_argument(
"--logprobs",
type=int,
default=None,
help="Number of logprobs-per-token to return as part of the request.",
)
parser.add_argument(
"--sonnet-prefix-len",
type=int,
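Putting the benchmark_serving.py changes together, here is a condensed, illustrative sketch of the plumbing this PR adds (CLI flag -> benchmark() -> per-request RequestFuncInput); the signatures below are trimmed to the fields involved and are not the real ones:

import argparse
from dataclasses import dataclass
from typing import Optional

@dataclass
class RequestFuncInput:
    # Trimmed to the fields relevant to this change.
    prompt: str
    model: str
    output_len: int
    logprobs: Optional[int] = None

def benchmark(model_id: str, logprobs: Optional[int],
              prompt: str, output_len: int) -> RequestFuncInput:
    # Every request issued by the benchmark carries the same logprobs setting.
    return RequestFuncInput(prompt=prompt, model=model_id,
                            output_len=output_len, logprobs=logprobs)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--logprobs",
    type=int,
    default=None,
    help="Number of logprobs-per-token to return as part of the request.",
)
args = parser.parse_args(["--logprobs", "5"])
req = benchmark("facebook/opt-125m", args.logprobs, "Hello, my name is", 128)
print(req.logprobs)  # 5; stays None when --logprobs is omitted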
2 changes: 1 addition & 1 deletion tests/multi_step/test_correctness_llm.py
@@ -57,7 +57,7 @@ def test_multi_step_llm(
GPU -> CPU output transfer
num_prompts: number of example prompts under test
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> no logprobs
completions endpoint; `None` -> no logprobs
"""

prompts = example_prompts
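As a small, illustrative aside on the test docstring above (not code from this PR): a test parameterized by num_logprobs can treat None as "no logprobs requested", for example by emitting no logprobs override at all; the helper name here is hypothetical:

from typing import Optional

def sampling_overrides(num_logprobs: Optional[int]) -> dict:
    # None means the request asks for no logprobs; an integer asks for
    # that many logprobs per generated token.
    if num_logprobs is None:
        return {}
    return {"logprobs": num_logprobs}

assert sampling_overrides(None) == {}
assert sampling_overrides(5) == {"logprobs": 5}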