diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index fc578b4972863..a9adda040ca61 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -6,18 +6,21 @@
 from typing import List, Optional, Tuple
 
 import torch
-from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          PreTrainedTokenizerBase)
 from tqdm import tqdm
 
-from vllm import LLM, SamplingParams
-from vllm.transformers_utils.tokenizer import get_tokenizer
-
 
 def sample_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int],
 ) -> List[Tuple[str, int, int]]:
+    if fixed_output_len is not None:
+        if fixed_output_len < 4:
+            raise ValueError("output_len too small")
+
     # Load the dataset.
     with open(dataset_path) as f:
         dataset = json.load(f)
@@ -35,6 +38,8 @@ def sample_requests(
     tokenized_dataset = []
     for i in range(len(dataset)):
         output_len = len(completion_token_ids[i])
+        if fixed_output_len is not None:
+            output_len = fixed_output_len
         tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
 
     # Filter out too long sequences.
@@ -66,6 +71,7 @@ def run_vllm(
     trust_remote_code: bool,
     dtype: str,
 ) -> float:
+    from vllm import LLM, SamplingParams
     llm = LLM(
         model=model,
         tokenizer=tokenizer,
@@ -160,14 +166,37 @@ def run_hf(
     return end - start
 
 
+def run_mii(
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tensor_parallel_size: int,
+    output_len: int,
+) -> float:
+    from mii import pipeline
+    llm = pipeline(model, tensor_parallel=tensor_parallel_size)
+    prompts = [prompt for prompt, _, _ in requests]
+
+    start = time.perf_counter()
+    llm(prompts, max_new_tokens=output_len)
+    end = time.perf_counter()
+    return end - start
+
+
 def main(args: argparse.Namespace):
     print(args)
     random.seed(args.seed)
 
     # Sample the requests.
-    tokenizer = get_tokenizer(args.tokenizer,
-                              trust_remote_code=args.trust_remote_code)
-    requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer, trust_remote_code=args.trust_remote_code)
+    if args.dataset is None:
+        # Synthesize a prompt with the given input length.
+        prompt = "hi" * (args.input_len - 1)
+        requests = [(prompt, args.input_len, args.output_len)
+                    for _ in range(args.num_prompts)]
+    else:
+        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
+                                   args.output_len)
 
     if args.backend == "vllm":
         elapsed_time = run_vllm(requests, args.model, args.tokenizer,
@@ -179,6 +208,9 @@ def main(args: argparse.Namespace):
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
                               args.use_beam_search, args.hf_max_batch_size,
                               args.trust_remote_code)
+    elif args.backend == "mii":
+        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
+                               args.output_len)
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
     total_num_tokens = sum(prompt_len + output_len
@@ -191,12 +223,21 @@ def main(args: argparse.Namespace):
     parser = argparse.ArgumentParser(description="Benchmark the throughput.")
     parser.add_argument("--backend",
                         type=str,
-                        choices=["vllm", "hf"],
+                        choices=["vllm", "hf", "mii"],
                         default="vllm")
     parser.add_argument("--dataset",
                         type=str,
-                        required=True,
+                        default=None,
                         help="Path to the dataset.")
+    parser.add_argument("--input-len",
+                        type=int,
+                        default=None,
+                        help="Input prompt length for each request")
+    parser.add_argument("--output-len",
+                        type=int,
+                        default=None,
+                        help="Output length for each request. Overrides the "
+                        "output length from the dataset.")
     parser.add_argument("--model", type=str, default="facebook/opt-125m")
     parser.add_argument("--tokenizer", type=str, default=None)
     parser.add_argument('--quantization',
@@ -231,6 +272,13 @@ def main(args: argparse.Namespace):
                         'for FP32 and FP16 models, and BF16 precision '
                         'for BF16 models.')
     args = parser.parse_args()
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+    if args.dataset is None:
+        assert args.input_len is not None
+        assert args.output_len is not None
+    else:
+        assert args.input_len is None
 
     if args.backend == "vllm":
         if args.hf_max_batch_size is not None:
@@ -240,7 +288,18 @@ def main(args: argparse.Namespace):
             raise ValueError("HF max batch size is required for HF backend.")
         if args.quantization is not None:
             raise ValueError("Quantization is only for vLLM backend.")
-    if args.tokenizer is None:
-        args.tokenizer = args.model
-
+    elif args.backend == "mii":
+        if args.dtype != "auto":
+            raise ValueError("dtype must be auto for MII backend.")
+        if args.n != 1:
+            raise ValueError("n must be 1 for MII backend.")
+        if args.use_beam_search:
+            raise ValueError("Beam search is not supported for MII backend.")
+        if args.quantization is not None:
+            raise ValueError("Quantization is only for vLLM backend.")
+        if args.hf_max_batch_size is not None:
+            raise ValueError("HF max batch size is only for HF backend.")
+        if args.tokenizer != args.model:
+            raise ValueError("Tokenizer must be the same as the model for MII "
+                             "backend.")
     main(args)
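
Usage note (not part of the patch): with this change the script can be driven either from a dataset, as before, with --output-len optionally overriding the per-request output length, or entirely synthetically via --input-len/--output-len when --dataset is omitted, and "mii" becomes a third --backend choice. The invocations below are illustrative sketches only; the dataset path is a placeholder, the model is just the script's default, and the MII run assumes the deepspeed-mii package is installed (the patch requires --dtype auto and tokenizer == model for that backend).

    # Dataset-driven run on the vLLM backend (unchanged usage)
    python benchmarks/benchmark_throughput.py --backend vllm \
        --dataset <path/to/dataset.json> --model facebook/opt-125m

    # Synthetic prompts on the new MII backend
    python benchmarks/benchmark_throughput.py --backend mii \
        --model facebook/opt-125m --input-len 256 --output-len 256 \
        --num-prompts 1000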