Skip to content

Commit

Permalink
add benchmark for fix length input and output (#5857)
Browse files Browse the repository at this point in the history
Co-authored-by: Roger Wang <[email protected]>
  • Loading branch information
haichuan1221 and ywang96 authored Jul 7, 2024
1 parent 6206dcb commit 333306a
Showing 1 changed file with 60 additions and 5 deletions.
65 changes: 60 additions & 5 deletions benchmarks/benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
--dataset-path <path to dataset> \
--request-rate <request_rate> \ # By default <request_rate> is inf
--num-prompts <num_prompts> # By default <num_prompts> is 1000
    When using the TGI backend, add
--endpoint /generate_stream
to the end of the command above.
Expand Down Expand Up @@ -77,7 +77,6 @@ def sample_sharegpt_requests(
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")

# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
Expand Down Expand Up @@ -185,6 +184,31 @@ def sample_sonnet_requests(
return sampled_requests


def sample_random_requests(
        input_len: int, output_len: int, num_prompts: int, range_ratio: float,
        tokenizer: "PreTrainedTokenizerBase") -> List[Tuple[str, int, int]]:
    """Synthesize ``num_prompts`` random requests with near-fixed lengths.

    Input and output token counts are drawn uniformly from
    ``[int(target_len * range_ratio), target_len]`` for each request, so
    ``range_ratio=1.0`` yields exactly fixed-length requests. Prompt text is
    built by decoding pseudo-random token ids with the tokenizer (the decoded
    text may not re-tokenize to exactly ``input_len`` tokens — it is a
    best-effort synthetic prompt).

    Returns:
        A list of ``(prompt_text, prompt_len, output_len)`` tuples.
    """
    input_lens = np.random.randint(
        int(input_len * range_ratio),
        input_len + 1,
        size=num_prompts,
    )
    output_lens = np.random.randint(
        int(output_len * range_ratio),
        output_len + 1,
        size=num_prompts,
    )
    # Per-request starting offset into the vocabulary so prompts differ.
    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
    input_requests = []
    # Bug fix: iterate over the `num_prompts` parameter, not the module-level
    # `args` namespace (which is not in scope inside this function).
    for i in range(num_prompts):
        prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
                                   for j in range(input_lens[i])])
        input_requests.append(
            (prompt, int(input_lens[i]), int(output_lens[i])))

    return input_requests


async def get_request(
input_requests: List[Tuple[str, int, int]],
request_rate: float,
Expand All @@ -196,6 +220,7 @@ async def get_request(
if request_rate == float("inf"):
# If the request rate is infinity, then we don't need to wait.
continue

# Sample the request interval from the exponential distribution.
interval = np.random.exponential(1.0 / request_rate)
# The next request will be sent after the interval.
Expand All @@ -219,7 +244,7 @@ def calculate_metrics(
# We use the tokenizer to count the number of output tokens for all
# serving backends instead of looking at len(outputs[i].itl) since
# multiple output tokens may be bundled together
# Note: this may inflate the output token count slightly
        # Note: this may inflate the output token count slightly
output_len = len(
tokenizer(outputs[i].generated_text,
add_special_tokens=False).input_ids)
Expand Down Expand Up @@ -456,6 +481,15 @@ def main(args: argparse.Namespace):
for prompt, prompt_formatted, prompt_len,
output_len in input_requests]

elif args.dataset_name == "random":
input_requests = sample_random_requests(
input_len=args.input_len,
output_len=args.output_len,
num_prompts=args.num_prompts,
range_ratio=args.range_ratio,
tokenizer=tokenizer,
)

else:
raise ValueError(f"Unknown dataset: {args.dataset_name}")

Expand Down Expand Up @@ -549,7 +583,7 @@ def main(args: argparse.Namespace):
"--dataset-name",
type=str,
default="sharegpt",
choices=["sharegpt", "sonnet"],
choices=["sharegpt", "sonnet", "random"],
help="Name of the dataset to benchmark on.",
)
parser.add_argument("--dataset-path",
Expand All @@ -566,7 +600,7 @@ def main(args: argparse.Namespace):
"--tokenizer",
type=str,
help=
"Name or path of the tokenizer, if not using the default tokenizer.",
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
)
parser.add_argument(
"--best-of",
Expand Down Expand Up @@ -609,6 +643,27 @@ def main(args: argparse.Namespace):
help=
"Number of prefix tokens per request, used only for sonnet dataset.",
)
parser.add_argument(
"--random-input-len",
type=int,
default=1024,
help=
"Number of input tokens per request, used only for random sampling.",
)
parser.add_argument(
"--random-output-len",
type=int,
default=128,
help=
"Number of output tokens per request, used only for random sampling.",
)
parser.add_argument(
"--random-range-ratio",
type=float,
default=1.0,
help="Range of sampled ratio of input/output length, "
"used only for random sampling.",
)
parser.add_argument(
"--request-rate",
type=float,
Expand Down

0 comments on commit 333306a

Please sign in to comment.