Benchmarking - Add tensor_parallel_size arg for multi-gpu benchmarking (#66)

SUMMARY:
- Add tensor_parallel_size arg for multi-gpu benchmarking.
- Update the benchmark config files to use all available gpus by default.
- Other minor changes:
  - Fix script args outer-product enumeration.
  - Limit dataset sample-space when using sharegpt dataset.
  - Remove redundant benchmarks.
  - Standardize benchmarking result JSON.

TEST PLAN:
Run manual benchmark jobs
- multi-gpu benchmark: https://github.com/neuralmagic/nm-vllm/actions/runs/8086009234/job/22094787943
- single-gpu benchmark: https://github.com/neuralmagic/nm-vllm/actions/runs/8086016169/job/22094812742

(The benchmarks didn't run to completion as Hugging Face went down midway, but the artifacts seem reasonable for what did run.)

---------

Co-authored-by: Varun Sundar Rabindranath <[email protected]>
neuralmagic · Feb 28, 2024 · 0367fc2 · 0367fc2
1 parent 13787e6
commit 0367fc2
Show file tree

Hide file tree

Showing 8 changed files with 146 additions and 75 deletions.
diff --git a/neuralmagic/benchmarks/common.py b/neuralmagic/benchmarks/common.py
@@ -44,8 +44,9 @@ def script_args_to_cla(config: NamedTuple) -> Iterable[list[str]]:
     key_args_cla = list(map(lambda k: f"--{k}", key_args))
 
     # Remove empty lists from arg_lists and remove key args from keys
-    arg_lists = filter(lambda arg_list: len(arg_list) != 0, arg_lists)
-    keys = filter(lambda k: k not in key_args, keys)
+    arg_lists = list(filter(lambda arg_list: len(arg_list) != 0, arg_lists))
+    keys = list(filter(lambda k: k not in key_args, keys))
+    assert len(keys) == len(arg_lists)
 
     for args in itertools.product(*arg_lists):
         cla = key_args_cla

diff --git a/neuralmagic/benchmarks/configs/benchmark_serving.json b/neuralmagic/benchmarks/configs/benchmark_serving.json
@@ -8,6 +8,7 @@
 				"mistralai/Mistral-7B-Instruct-v0.2",
 				"NousResearch/Llama-2-7b-chat-hf"
 			],
+			"use_all_available_gpus" : "",
 			"max_model_lens": [
 				4096
 			],
@@ -27,36 +28,6 @@
 					"sharegpt"
 				]
 			}
-		},
-		{
-			"description": "Benchmark vllm serving",
-			"models": [
-				"facebook/opt-125m",
-				"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-				"mistralai/Mistral-7B-Instruct-v0.2",
-				"NousResearch/Llama-2-7b-chat-hf"
-			],
-			"max_model_lens": [
-				4096
-			],
-			"sparsity": [],
-			"script_name": "benchmark_serving",
-			"script_args": {
-				"num-prompts_": [
-					50,
-					100
-				],
-				"request-rate_": [
-					0.5,
-					"inf"
-				],
-				"best-of": [
-					1
-				],
-				"dataset": [
-					"sharegpt"
-				]
-			}
 		}
 	]
-}
+}
diff --git a/neuralmagic/benchmarks/configs/benchmark_throughput.json b/neuralmagic/benchmarks/configs/benchmark_throughput.json
@@ -20,9 +20,6 @@
 				"output-len": [
 					128
 				],
-				"tensor-parallel-size": [
-					1
-				],
 				"n": [
 					1
 				],
@@ -34,7 +31,8 @@
 				],
 				"dtype": [
 					"auto"
-				]
+				],
+				"use-all-available-gpus_" : []
 			}
 		},
 		{
@@ -64,9 +62,6 @@
 				"output-len": [
 					1
 				],
-				"tensor-parallel-size": [
-					1
-				],
 				"n": [
 					1
 				],
@@ -78,7 +73,8 @@
 				],
 				"dtype": [
 					"auto"
-				]
+				],
+				"use-all-available-gpus_" : []
 			}
 		},
 		{
@@ -101,9 +97,6 @@
 				"output-len": [
 					128
 				],
-				"tensor-parallel-size": [
-					1
-				],
 				"n": [
 					1
 				],
@@ -120,7 +113,8 @@
 				],
 				"dtype": [
 					"auto"
-				]
+				],
+				"use-all-available-gpus_" : []
 			}
 		}
 	]

diff --git a/neuralmagic/benchmarks/datasets_registry.py b/neuralmagic/benchmarks/datasets_registry.py
@@ -38,6 +38,8 @@ def make_dataset_triples(prompts: List[str], completions: List[str],
 
         # Make into dataset tripe.
         dataset.append((prompt, prompt_len, output_len))
+        if (len(dataset) >= dataset_args.num_samples * 2):
+            break
 
     # Sample num_requests from the list.
     print(len(dataset))

diff --git a/neuralmagic/benchmarks/run_benchmark_serving.py b/neuralmagic/benchmarks/run_benchmark_serving.py
@@ -9,12 +9,32 @@
 
 from neuralmagic.tools.call_cmd import call_cmd
 from neuralmagic.benchmarks.common import download_model, max_model_length_from_model_id, script_args_to_cla, benchmark_configs
-from neuralmagic.benchmarks.scripts.common import warmup_server
+from neuralmagic.benchmarks.scripts.common import warmup_server, num_available_gpus
 
 BENCH_SERVER_HOST = "localhost"
 BENCH_SERVER_PORT = 9000
 
 
+def get_tensor_parallel_size(config: NamedTuple) -> int:
+
+    num_tp_directives = [
+        hasattr(config, 'tensor_parallel_size'),
+        hasattr(config, 'use_all_available_gpus')
+    ].count(True)
+    if num_tp_directives == 0:
+        # by default - use just one GPU
+        return 1
+
+    # must have exactly one directive
+    assert num_tp_directives == 1
+
+    tensor_parallel_size = config.tensor_parallel_size if hasattr(
+        config, 'tensor_parallel_size') else num_available_gpus()
+    assert tensor_parallel_size > 0 and \
+           tensor_parallel_size <= num_available_gpus()
+    return tensor_parallel_size
+
+
 def is_server_running(host: str, port: int, timeout=300) -> bool:
 
     def try_connection() -> bool:
@@ -62,6 +82,8 @@ def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None:
             assert server_process is not None
             server_process.kill()
 
+    tensor_parallel_size = get_tensor_parallel_size(config)
+
     script_path = f"neuralmagic.benchmarks.scripts.{config.script_name}"
 
     sparsities = [None] if len(config.sparsity) == 0 else config.sparsity
@@ -84,10 +106,20 @@ def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None:
 
         for max_model_len in max_model_lens:
 
-            server_cmd = f"python3 -m vllm.entrypoints.api_server --model {model} --tokenizer {model} --max-model-len {max_model_len} --host {BENCH_SERVER_HOST} --port {BENCH_SERVER_PORT} --disable-log-requests"
-
+            server_args = {
+                "model": model,
+                "tokenizer": model,
+                "max-model-len": max_model_len,
+                "host": BENCH_SERVER_HOST,
+                "port": BENCH_SERVER_PORT,
+                "tensor-parallel-size": tensor_parallel_size,
+                "disable-log-requests": ""
+            }
             if sparsity:
-                server_cmd += f" --sparsity {sparsity} "
+                server_args["sparsity"] = sparsity
+
+            server_cmd = "python3 -m vllm.entrypoints.api_server " + \
+                            " ".join([f"--{k} {v}" for k, v in server_args.items()])
 
             for script_args in script_args_to_cla(config):
                 bench_cmd = (["python3", "-m"
@@ -98,9 +130,11 @@ def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None:
                              ["--host", f"{BENCH_SERVER_HOST}"])
 
                 if output_directory:
-                    bench_cmd = bench_cmd + [
-                        "--save-directory", f"{output_directory}"
-                    ]
+                    bench_cmd += (["--save-directory", f"{output_directory}"] +
+                                  ["--server-args", f"{server_args}"] + [
+                                      "--server-tensor-parallel-size",
+                                      f"{tensor_parallel_size}"
+                                  ])
 
                 run_bench(server_cmd, bench_cmd, model)
 

diff --git a/neuralmagic/benchmarks/scripts/benchmark_serving.py b/neuralmagic/benchmarks/scripts/benchmark_serving.py
@@ -33,7 +33,7 @@
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from neuralmagic.benchmarks.scripts.common import get_bench_environment, generate_synthetic_requests, print_benchmark_io
+from neuralmagic.benchmarks.scripts.common import instantiate_benchmark_results_dict, generate_synthetic_requests, print_benchmark_io
 from neuralmagic.benchmarks.datasets_registry import get_dataset, DatasetArgs
 
 from neuralmagic.benchmarks.scripts.backend_request_func import (
@@ -271,21 +271,19 @@ def main(args: argparse.Namespace):
     # Save config and results to json
     save_result = args.save_directory is not None
     if save_result:
-        result_json = {}
 
-        # Setup
         current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
+        result_json = instantiate_benchmark_results_dict(
+            benchmarking_script_name=Path(__file__).name,
+            tensor_parallel_size=args.server_tensor_parallel_size,
+            model=args.model,
+            tokenizer=args.tokenizer,
+            dataset=args.dataset)
         result_json["date"] = current_dt
-        result_json["bench_env"] = get_bench_environment()
-        result_json["backend"] = backend
-        result_json["version"] = args.version
-        result_json["model_id"] = model_id
-        result_json["tokenizer_id"] = tokenizer_id
-        result_json["best_of"] = args.best_of
-        result_json["use_beam_search"] = args.use_beam_search
-        result_json["num_prompts"] = num_prompts
+        result_json["script_args"] = vars(args)
 
-        # Traffic
+        # Populate derived-args for convenience
+        result_json["num_prompts"] = num_prompts
         result_json["request_rate"] =  \
             request_rate if request_rate < float("inf") else "inf"
 
@@ -388,11 +386,13 @@ def num_prompts_and_request_rate_t(arg) -> Num_Prompts_Request_Rate_T:
         action="store_true",
         help="Specify to disbale tqdm progress bar.",
     )
+
     parser.add_argument("--save-directory",
                         type=str,
                         default=None,
                         help="Output directory to store result file")
 
+    # Arguments defining num_prompts and qps
     parser.add_argument(
         "--num-prompts_",
         type=int,
@@ -418,6 +418,22 @@ def num_prompts_and_request_rate_t(arg) -> Num_Prompts_Request_Rate_T:
                             """,
                         default=None)
 
+    # Server command args
+    parser.add_argument(
+        "--server-tensor-parallel-size",
+        type=int,
+        default=None,
+        help=
+        "tensor-parallel-size that the benchmarking script was invoked with. It is useful to log this information when storing benchmarking results"
+    )
+    parser.add_argument(
+        "--server-args",
+        type=str,
+        default=None,
+        help=
+        "When we are logging the output, it is useful to log the arguments passed to the server"
+    )
+
     def args_sanity_check(args):
         # Sanity check real-dataset vs synthetic-dataset usecase
         if args.dataset is None:
@@ -433,6 +449,9 @@ def args_sanity_check(args):
             assert args.num_prompts_ is not None and args.request_rate_ is not None
         else:
             assert args.num_prompts_ is None and args.request_rate_ is None
+        # Sanity check required logging args
+        if args.save_directory is not None:
+            assert args.server_tensor_parallel_size is not None
 
     args = parser.parse_args()
     args_sanity_check(args)

diff --git a/neuralmagic/benchmarks/scripts/benchmark_throughput.py b/neuralmagic/benchmarks/scripts/benchmark_throughput.py
@@ -13,10 +13,18 @@
 from pathlib import Path
 from typing import List, Optional, Tuple
 from transformers import AutoTokenizer
-from neuralmagic.benchmarks.scripts.common import get_bench_environment, generate_synthetic_requests, warmup_vllm_engine
+from neuralmagic.benchmarks.scripts.common import instantiate_benchmark_results_dict, generate_synthetic_requests, warmup_vllm_engine, num_available_gpus
 from neuralmagic.benchmarks.datasets_registry import get_dataset, DatasetArgs
 
 
+def get_tensor_parallel_size(args: argparse.Namespace) -> int:
+    tensor_parallel_size = num_available_gpus() \
+        if args.use_all_available_gpus_ else args.tensor_parallel_size_
+    assert tensor_parallel_size > 0 and \
+           tensor_parallel_size <= num_available_gpus()
+    return tensor_parallel_size
+
+
 def run_vllm(
     requests: List[Tuple[str, int, int]],
     model: str,
@@ -98,7 +106,7 @@ def main(args: argparse.Namespace):
                             args.model,
                             args.tokenizer,
                             args.quantization,
-                            args.tensor_parallel_size,
+                            get_tensor_parallel_size(args),
                             args.seed,
                             args.n,
                             args.use_beam_search,
@@ -123,15 +131,19 @@ def main(args: argparse.Namespace):
     # Save config and results to json
     save_result = args.save_directory is not None
     if save_result:
-        result_json = {}
 
         # Setup
         current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
+        result_json = instantiate_benchmark_results_dict(
+            benchmarking_script_name=Path(__file__).name,
+            tensor_parallel_size=get_tensor_parallel_size(args),
+            model=args.model,
+            tokenizer=args.tokenizer,
+            dataset=args.dataset)
         result_json["date"] = current_dt
-        result_json["bench_env"] = get_bench_environment()
-        result_json.update(vars(args))
-        result_json["request_throughput (per second)"] = request_throughput
-        result_json["token_throughput (per second)"] = token_throughput
+        result_json["script_args"] = vars(args)
+        result_json["request_throughput"] = request_throughput
+        result_json["token_throughput"] = token_throughput
 
         model_id = args.model.replace('/', '_')
         # Save to file
@@ -168,7 +180,6 @@ def main(args: argparse.Namespace):
                         '-q',
                         choices=['awq', 'gptq', 'squeezellm', None],
                         default=None)
-    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
     parser.add_argument("--n",
                         type=int,
                         default=1,
@@ -204,6 +215,11 @@ def main(args: argparse.Namespace):
                         type=str,
                         default=None,
                         help="Output directory to store result file")
+
+    tp_group = parser.add_mutually_exclusive_group(required=True)
+    tp_group.add_argument("--tensor-parallel-size_", type=int, default=None)
+    tp_group.add_argument("--use-all-available-gpus_", action="store_true")
+
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model