This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit 0367fc2

Benchmarking - Add tensor_parallel_size arg for multi-gpu benchmarking (#66)

SUMMARY:
- Add a tensor_parallel_size arg for multi-GPU benchmarking (sketched below).
- Update the benchmark config files to use all available GPUs by default.
- Other minor changes:
  - Fix script-args outer-product enumeration.
  - Limit the dataset sample space when using the sharegpt dataset.
  - Remove redundant benchmarks.
  - Standardize the benchmarking result JSON.

TEST PLAN:
Ran manual benchmark jobs.
Multi-GPU benchmark:
https://github.com/neuralmagic/nm-vllm/actions/runs/8086009234/job/22094787943
Single-GPU benchmark:
https://github.com/neuralmagic/nm-vllm/actions/runs/8086016169/job/22094812742
(These benchmarks didn't run to completion because Hugging Face went down mid-way, but the artifacts look reasonable for what did run.)

---------

Co-authored-by: Varun Sundar Rabindranath <[email protected]>
varun-sundar-rabindranath and Varun Sundar Rabindranath authored Feb 28, 2024
1 parent 13787e6 commit 0367fc2
Showing 8 changed files with 146 additions and 75 deletions.
5 changes: 3 additions & 2 deletions neuralmagic/benchmarks/common.py
@@ -44,8 +44,9 @@ def script_args_to_cla(config: NamedTuple) -> Iterable[list[str]]:
key_args_cla = list(map(lambda k: f"--{k}", key_args))

# Remove empty lists from arg_lists and remove key args from keys
arg_lists = filter(lambda arg_list: len(arg_list) != 0, arg_lists)
keys = filter(lambda k: k not in key_args, keys)
arg_lists = list(filter(lambda arg_list: len(arg_list) != 0, arg_lists))
keys = list(filter(lambda k: k not in key_args, keys))
assert len(keys) == len(arg_lists)

for args in itertools.product(*arg_lists):
cla = key_args_cla
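The two filter(...) calls above return single-pass iterators; the fix materializes them as lists so they survive repeated use and permit the new length assertion. A minimal sketch of the single-pass pitfall this guards against (illustrative keys and values, not the repository's actual script args):

import itertools

keys = ["num-prompts", "request-rate"]
arg_lists = [[50, 100], [0.5, "inf"]]

lazy_keys = filter(lambda k: k != "unused", keys)  # single-pass iterator
for args in itertools.product(*arg_lists):
    # Exhausted after the first iteration, so later zips silently yield nothing.
    print(list(zip(lazy_keys, args)))

eager_keys = list(filter(lambda k: k != "unused", keys))  # materialized once
for args in itertools.product(*arg_lists):
    # Every pass now sees all keys, and len(eager_keys) can be asserted.
    print(list(zip(eager_keys, args)))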
33 changes: 2 additions & 31 deletions neuralmagic/benchmarks/configs/benchmark_serving.json
@@ -8,6 +8,7 @@
"mistralai/Mistral-7B-Instruct-v0.2",
"NousResearch/Llama-2-7b-chat-hf"
],
"use_all_available_gpus" : "",
"max_model_lens": [
4096
],
@@ -27,36 +28,6 @@
"sharegpt"
]
}
},
{
"description": "Benchmark vllm serving",
"models": [
"facebook/opt-125m",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"mistralai/Mistral-7B-Instruct-v0.2",
"NousResearch/Llama-2-7b-chat-hf"
],
"max_model_lens": [
4096
],
"sparsity": [],
"script_name": "benchmark_serving",
"script_args": {
"num-prompts_": [
50,
100
],
"request-rate_": [
0.5,
"inf"
],
"best-of": [
1
],
"dataset": [
"sharegpt"
]
}
}
]
}
}
18 changes: 6 additions & 12 deletions neuralmagic/benchmarks/configs/benchmark_throughput.json
@@ -20,9 +20,6 @@
"output-len": [
128
],
"tensor-parallel-size": [
1
],
"n": [
1
],
@@ -34,7 +31,8 @@
],
"dtype": [
"auto"
]
],
"use-all-available-gpus_" : []
}
},
{
@@ -64,9 +62,6 @@
"output-len": [
1
],
"tensor-parallel-size": [
1
],
"n": [
1
],
@@ -78,7 +73,8 @@
],
"dtype": [
"auto"
]
],
"use-all-available-gpus_" : []
}
},
{
Expand All @@ -101,9 +97,6 @@
"output-len": [
128
],
"tensor-parallel-size": [
1
],
"n": [
1
],
@@ -120,7 +113,8 @@
],
"dtype": [
"auto"
]
],
"use-all-available-gpus_" : []
}
}
]
2 changes: 2 additions & 0 deletions neuralmagic/benchmarks/datasets_registry.py
@@ -38,6 +38,8 @@ def make_dataset_triples(prompts: List[str], completions: List[str],

# Make into dataset tripe.
dataset.append((prompt, prompt_len, output_len))
if (len(dataset) >= dataset_args.num_samples * 2):
break

# Sample num_requests from the list.
print(len(dataset))
48 changes: 41 additions & 7 deletions neuralmagic/benchmarks/run_benchmark_serving.py
@@ -9,12 +9,32 @@

from neuralmagic.tools.call_cmd import call_cmd
from neuralmagic.benchmarks.common import download_model, max_model_length_from_model_id, script_args_to_cla, benchmark_configs
from neuralmagic.benchmarks.scripts.common import warmup_server
from neuralmagic.benchmarks.scripts.common import warmup_server, num_available_gpus

BENCH_SERVER_HOST = "localhost"
BENCH_SERVER_PORT = 9000


def get_tensor_parallel_size(config: NamedTuple) -> int:

num_tp_directives = [
hasattr(config, 'tensor_parallel_size'),
hasattr(config, 'use_all_available_gpus')
].count(True)
if num_tp_directives == 0:
# by default - use just one GPU
return 1

# must have exactly one directive
assert num_tp_directives == 1

tensor_parallel_size = config.tensor_parallel_size if hasattr(
config, 'tensor_parallel_size') else num_available_gpus()
assert tensor_parallel_size > 0 and \
tensor_parallel_size <= num_available_gpus()
return tensor_parallel_size


def is_server_running(host: str, port: int, timeout=300) -> bool:

def try_connection() -> bool:
@@ -62,6 +82,8 @@ def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None:
assert server_process is not None
server_process.kill()

tensor_parallel_size = get_tensor_parallel_size(config)

script_path = f"neuralmagic.benchmarks.scripts.{config.script_name}"

sparsities = [None] if len(config.sparsity) == 0 else config.sparsity
@@ -84,10 +106,20 @@ def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None:

for max_model_len in max_model_lens:

server_cmd = f"python3 -m vllm.entrypoints.api_server --model {model} --tokenizer {model} --max-model-len {max_model_len} --host {BENCH_SERVER_HOST} --port {BENCH_SERVER_PORT} --disable-log-requests"

server_args = {
"model": model,
"tokenizer": model,
"max-model-len": max_model_len,
"host": BENCH_SERVER_HOST,
"port": BENCH_SERVER_PORT,
"tensor-parallel-size": tensor_parallel_size,
"disable-log-requests": ""
}
if sparsity:
server_cmd += f" --sparsity {sparsity} "
server_args["sparsity"] = sparsity

server_cmd = "python3 -m vllm.entrypoints.api_server " + \
" ".join([f"--{k} {v}" for k, v in server_args.items()])

for script_args in script_args_to_cla(config):
bench_cmd = (["python3", "-m"
@@ -98,9 +130,11 @@ def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None:
["--host", f"{BENCH_SERVER_HOST}"])

if output_directory:
bench_cmd = bench_cmd + [
"--save-directory", f"{output_directory}"
]
bench_cmd += (["--save-directory", f"{output_directory}"] +
["--server-args", f"{server_args}"] + [
"--server-tensor-parallel-size",
f"{tensor_parallel_size}"
])

run_bench(server_cmd, bench_cmd, model)

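With the dictionary-based construction above, the assembled server command would look roughly like the following (the model and tensor-parallel size are illustrative values, not taken from a specific run):

server_args = {
    "model": "facebook/opt-125m",   # illustrative model from the configs
    "tokenizer": "facebook/opt-125m",
    "max-model-len": 4096,
    "host": "localhost",
    "port": 9000,
    "tensor-parallel-size": 2,      # illustrative multi-GPU value
    "disable-log-requests": "",     # flag-style arg serializes with an empty value
}
server_cmd = "python3 -m vllm.entrypoints.api_server " + \
    " ".join([f"--{k} {v}" for k, v in server_args.items()])
# -> python3 -m vllm.entrypoints.api_server --model facebook/opt-125m
#    --tokenizer facebook/opt-125m --max-model-len 4096 --host localhost
#    --port 9000 --tensor-parallel-size 2 --disable-log-requests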
43 changes: 31 additions & 12 deletions neuralmagic/benchmarks/scripts/benchmark_serving.py
@@ -33,7 +33,7 @@
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
from vllm.transformers_utils.tokenizer import get_tokenizer
from neuralmagic.benchmarks.scripts.common import get_bench_environment, generate_synthetic_requests, print_benchmark_io
from neuralmagic.benchmarks.scripts.common import instantiate_benchmark_results_dict, generate_synthetic_requests, print_benchmark_io
from neuralmagic.benchmarks.datasets_registry import get_dataset, DatasetArgs

from neuralmagic.benchmarks.scripts.backend_request_func import (
@@ -271,21 +271,19 @@ def main(args: argparse.Namespace):
# Save config and results to json
save_result = args.save_directory is not None
if save_result:
result_json = {}

# Setup
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
result_json = instantiate_benchmark_results_dict(
benchmarking_script_name=Path(__file__).name,
tensor_parallel_size=args.server_tensor_parallel_size,
model=args.model,
tokenizer=args.tokenizer,
dataset=args.dataset)
result_json["date"] = current_dt
result_json["bench_env"] = get_bench_environment()
result_json["backend"] = backend
result_json["version"] = args.version
result_json["model_id"] = model_id
result_json["tokenizer_id"] = tokenizer_id
result_json["best_of"] = args.best_of
result_json["use_beam_search"] = args.use_beam_search
result_json["num_prompts"] = num_prompts
result_json["script_args"] = vars(args)

# Traffic
# Populate derived-args for convenience
result_json["num_prompts"] = num_prompts
result_json["request_rate"] = \
request_rate if request_rate < float("inf") else "inf"

@@ -388,11 +386,13 @@ def num_prompts_and_request_rate_t(arg) -> Num_Prompts_Request_Rate_T:
action="store_true",
help="Specify to disbale tqdm progress bar.",
)

parser.add_argument("--save-directory",
type=str,
default=None,
help="Output directory to store result file")

# Arguments defining num_prompts and qps
parser.add_argument(
"--num-prompts_",
type=int,
Expand All @@ -418,6 +418,22 @@ def num_prompts_and_request_rate_t(arg) -> Num_Prompts_Request_Rate_T:
""",
default=None)

# Server command args
parser.add_argument(
"--server-tensor-parallel-size",
type=int,
default=None,
help=
"tensor-parallel-size that the benchmarking script was invoked with. It is useful to log this information when storing benchmarking results"
)
parser.add_argument(
"--server-args",
type=str,
default=None,
help=
"When we are logging the output, it is useful to log the arguments passed to the server"
)

def args_sanity_check(args):
# Sanity check real-dataset vs synthetic-dataset usecase
if args.dataset is None:
@@ -433,6 +449,9 @@ def args_sanity_check(args):
assert args.num_prompts_ is not None and args.request_rate_ is not None
else:
assert args.num_prompts_ is None and args.request_rate_ is None
# Sanity check required logging args
if args.save_directory is not None:
assert args.server_tensor_parallel_size is not None

args = parser.parse_args()
args_sanity_check(args)
32 changes: 24 additions & 8 deletions neuralmagic/benchmarks/scripts/benchmark_throughput.py
@@ -13,10 +13,18 @@
from pathlib import Path
from typing import List, Optional, Tuple
from transformers import AutoTokenizer
from neuralmagic.benchmarks.scripts.common import get_bench_environment, generate_synthetic_requests, warmup_vllm_engine
from neuralmagic.benchmarks.scripts.common import instantiate_benchmark_results_dict, generate_synthetic_requests, warmup_vllm_engine, num_available_gpus
from neuralmagic.benchmarks.datasets_registry import get_dataset, DatasetArgs


def get_tensor_parallel_size(args: argparse.Namespace) -> int:
tensor_parallel_size = num_available_gpus() \
if args.use_all_available_gpus_ else args.tensor_parallel_size_
assert tensor_parallel_size > 0 and \
tensor_parallel_size <= num_available_gpus()
return tensor_parallel_size


def run_vllm(
requests: List[Tuple[str, int, int]],
model: str,
@@ -98,7 +106,7 @@ def main(args: argparse.Namespace):
args.model,
args.tokenizer,
args.quantization,
args.tensor_parallel_size,
get_tensor_parallel_size(args),
args.seed,
args.n,
args.use_beam_search,
@@ -123,15 +131,19 @@
# Save config and results to json
save_result = args.save_directory is not None
if save_result:
result_json = {}

# Setup
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
result_json = instantiate_benchmark_results_dict(
benchmarking_script_name=Path(__file__).name,
tensor_parallel_size=get_tensor_parallel_size(args),
model=args.model,
tokenizer=args.tokenizer,
dataset=args.dataset)
result_json["date"] = current_dt
result_json["bench_env"] = get_bench_environment()
result_json.update(vars(args))
result_json["request_throughput (per second)"] = request_throughput
result_json["token_throughput (per second)"] = token_throughput
result_json["script_args"] = vars(args)
result_json["request_throughput"] = request_throughput
result_json["token_throughput"] = token_throughput

model_id = args.model.replace('/', '_')
# Save to file
@@ -168,7 +180,6 @@ def main(args: argparse.Namespace):
'-q',
choices=['awq', 'gptq', 'squeezellm', None],
default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
type=int,
default=1,
Expand Down Expand Up @@ -204,6 +215,11 @@ def main(args: argparse.Namespace):
type=str,
default=None,
help="Output directory to store result file")

tp_group = parser.add_mutually_exclusive_group(required=True)
tp_group.add_argument("--tensor-parallel-size_", type=int, default=None)
tp_group.add_argument("--use-all-available-gpus_", action="store_true")

args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
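The mutually exclusive group above means the throughput script must now be given exactly one tensor-parallelism directive on its command line. A standalone sketch of that behavior (only the two flags, not the full parser):

import argparse

parser = argparse.ArgumentParser()
tp_group = parser.add_mutually_exclusive_group(required=True)
tp_group.add_argument("--tensor-parallel-size_", type=int, default=None)
tp_group.add_argument("--use-all-available-gpus_", action="store_true")

# Pin an explicit size:
print(parser.parse_args(["--tensor-parallel-size_", "2"]))
# Or defer to every visible GPU:
print(parser.parse_args(["--use-all-available-gpus_"]))
# Passing both flags, or neither, makes argparse exit with a usage error.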
