diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index 6a18be947be99..4036b32a46bf7 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -13,9 +13,17 @@ This benchmark will be *triggered* upon:
 
 **Benchmarking Duration**: about 1hr.
 
-## Configuring the workload for the quick benchmark
+**For benchmarking developers**: please try your best to constrain the duration of benchmarking to less than 1.5 hr so that it won't take forever to run.
 
-The workload of the quick benchmark contains two parts: latency tests in `latency-tests.json`, throughput tests in `throughput-tests.json` and serving tests in `serving-tests.json`.
+
+## Configuring the workload
+
+The benchmarking workload contains three parts:
+- Latency tests in `latency-tests.json`.
+- Throughput tests in `throughput-tests.json`.
+- Serving tests in `serving-tests.json`.
+
+See [descriptions.md](tests/descriptions.md) for detailed descriptions.
 
 ### Latency test
 
@@ -23,7 +31,6 @@ Here is an example of one test inside `latency-tests.json`:
 
 ```json
 [
-    ...
     {
         "test_name": "latency_llama8B_tp1",
         "parameters": {
@@ -34,7 +41,6 @@ Here is an example of one test inside `latency-tests.json`:
             "num_iters": 15
         }
     },
-    ...
 ]
 ```
 
@@ -57,7 +63,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
 
 ```
 [
-    ...
     {
         "test_name": "serving_llama8B_tp1_sharegpt",
         "qps_list": [1, 4, 16, "inf"],
@@ -77,7 +82,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
             "num_prompts": 200
         }
     },
-    ...
 ]
 ```
 
@@ -92,7 +96,8 @@ The number of this test is less stable compared to the delay and latency benchma
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
 
 ## Visualizing the results
-The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table.
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. If you do not see the table, please wait till the benchmark finish running.
 
-The JSON file is also attached within each buildkite job for further analysis.
\ No newline at end of file
+The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
+The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking job.
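Note on the visualization step: as the script diff below shows, `convert-results-json-to-markdown.py` renders each results dataframe into a markdown table with `tabulate` and substitutes the tables into the placeholders of `descriptions.md`. Here is a minimal, self-contained sketch of that templating idea; the one-placeholder template, GPU name, and numbers are fabricated for illustration and are not the real files or results.

```python
import pandas as pd
from tabulate import tabulate

# Toy stand-in for tests/descriptions.md, reduced to a single placeholder.
template = "## Latency tests\n\n{latency_tests_markdown_table}\n"

# Fabricated result row, only to show the mechanics of the conversion.
df = pd.DataFrame([{
    "Test name": "latency_llama8B_tp1",
    "GPU": "A100-SXM4-80GB",
    "Mean latency (ms)": 1234.5,
}])

# Render a markdown (pipe-format) table and fill the template.
table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
print(template.format(latency_tests_markdown_table=table))
```

The real script does the same with three tables plus a json dump of the dataframes (see the `convert-results-json-to-markdown.py` diff below).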
diff --git a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
index 6cff6917f8ad5..021473f76d0e5 100644
--- a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
@@ -343,9 +343,9 @@ main() {
   QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
 
   # benchmarking
-  run_serving_tests $QUICK_BENCHMARK_ROOT/serving-tests.json
-  run_latency_tests $QUICK_BENCHMARK_ROOT/latency-tests.json
-  run_throughput_tests $QUICK_BENCHMARK_ROOT/throughput-tests.json
+  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
+  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
+  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
 
 
   # postprocess benchmarking results
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 75cff84347942..9aa8162d18d2d 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -1,4 +1,5 @@
 import json
+import os
 from pathlib import Path
 
 import pandas as pd
@@ -11,12 +12,13 @@
 latency_column_mapping = {
     "test_name": "Test name",
     "gpu_type": "GPU",
-    "avg_latency": "Average latency (s)",
-    "P10": "P10 (s)",
-    "P25": "P25 (s)",
-    "P50": "P50 (s)",
-    "P75": "P75 (s)",
-    "P90": "P90 (s)",
+    "avg_latency": "Mean latency (ms)",
+    # "P10": "P10 (s)",
+    # "P25": "P25 (s)",
+    "P50": "Median",
+    # "P75": "P75 (s)",
+    # "P90": "P90 (s)",
+    "P99": "P99",
 }
 
 # thoughput tests and the keys that will be printed into markdown
@@ -24,11 +26,11 @@
 throughput_results_column_mapping = {
     "test_name": "Test name",
     "gpu_type": "GPU",
-    "num_requests": "# of req.",
-    "total_num_tokens": "Total # of tokens",
-    "elapsed_time": "Elapsed time (s)",
+    # "num_requests": "# of req.",
+    # "total_num_tokens": "Total # of tokens",
+    # "elapsed_time": "Elapsed time (s)",
     "requests_per_second": "Tput (req/s)",
-    "tokens_per_second": "Tput (tok/s)",
+    # "tokens_per_second": "Tput (tok/s)",
 }
 
 # serving results and the keys that will be printed into markdown
@@ -36,120 +38,148 @@
 serving_column_mapping = {
     "test_name": "Test name",
     "gpu_type": "GPU",
-    "completed": "# of req.",
+    # "completed": "# of req.",
     "request_throughput": "Tput (req/s)",
-    "input_throughput": "Input Tput (tok/s)",
-    "output_throughput": "Output Tput (tok/s)",
+    # "input_throughput": "Input Tput (tok/s)",
+    # "output_throughput": "Output Tput (tok/s)",
     "mean_ttft_ms": "Mean TTFT (ms)",
     # do not say TTFT again to avoid the table getting too wide
     "median_ttft_ms": "Median",
     "p99_ttft_ms": "P99",
-    "mean_tpot_ms": "Mean TPOT (ms)",
-    "median_tpot_ms": "Median",
-    "p99_tpot_ms": "P99",
+    # "mean_tpot_ms": "Mean TPOT (ms)",
+    # "median_tpot_ms": "Median",
+    # "p99_tpot_ms": "P99",
     "mean_itl_ms": "Mean ITL (ms)",
     "median_itl_ms": "Median",
     "p99_itl_ms": "P99",
 }
 
-for test_file in results_folder.glob("*.json"):
-
-    with open(test_file, "r") as f:
-        raw_result = json.loads(f.read())
-
-    if "serving" in str(test_file):
-        # this result is generated via `benchmark_serving.py`
-
-        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
-            command = json.loads(f.read())
-        raw_result.update(command)
-
-        # update the test name of this result
-        raw_result.update({"test_name": test_file.stem})
-
-        # add the result to raw_result
-        serving_results.append(raw_result)
-        continue
-
-    elif "latency" in f.name:
-        # this result is generated via `benchmark_latency.py`
-
-        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
-            command = json.loads(f.read())
-        raw_result.update(command)
-
-        # update the test name of this result
-        raw_result.update({"test_name": test_file.stem})
-
-        # get different percentiles
-        for perc in [10, 25, 50, 75, 90]:
-            raw_result.update(
-                {f"P{perc}": raw_result["percentiles"][str(perc)]})
-
-        # add the result to raw_result
-        latency_results.append(raw_result)
-        continue
-
-    elif "throughput" in f.name:
-        # this result is generated via `benchmark_throughput.py`
-
-        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
-            command = json.loads(f.read())
-        raw_result.update(command)
-
-        # update the test name of this result
-        raw_result.update({"test_name": test_file.stem})
-
-        # add the result to raw_result
-        throughput_results.append(raw_result)
-        continue
-
-    print(f"Skipping {test_file}")
-
-latency_results = pd.DataFrame.from_dict(latency_results)
-serving_results = pd.DataFrame.from_dict(serving_results)
-throughput_results = pd.DataFrame.from_dict(throughput_results)
-
-# remapping the key, for visualization purpose
-if not latency_results.empty:
-    latency_results = latency_results[list(
-        latency_column_mapping.keys())].rename(columns=latency_column_mapping)
-if not serving_results.empty:
-    serving_results = serving_results[list(
-        serving_column_mapping.keys())].rename(columns=serving_column_mapping)
-if not throughput_results.empty:
-    throughput_results = throughput_results[list(
-        throughput_results_column_mapping.keys())].rename(
-            columns=throughput_results_column_mapping)
-
-# get markdown tables
-latency_md_table = tabulate(latency_results,
-                            headers='keys',
-                            tablefmt='pipe',
-                            showindex=False)
-serving_md_table = tabulate(serving_results,
-                            headers='keys',
-                            tablefmt='pipe',
-                            showindex=False)
-throughput_md_table = tabulate(throughput_results,
-                               headers='keys',
-                               tablefmt='pipe',
-                               showindex=False)
-
-# document the result
-with open(results_folder / "benchmark_results.md", "w") as f:
+
+def read_markdown(file):
+    if os.path.exists(file):
+        with open(file, "r") as f:
+            return f.read() + "\n"
+    else:
+        return f"{file} not found.\n"
+
+
+def results_to_json(latency, throughput, serving):
+    return json.dumps({
+        'latency': latency.to_dict(),
+        'throughput': throughput.to_dict(),
+        'serving': serving.to_dict()
+    })
+
+
+if __name__ == "__main__":
+
+    # collect results
+    for test_file in results_folder.glob("*.json"):
+
+        with open(test_file, "r") as f:
+            raw_result = json.loads(f.read())
+
+        if "serving" in str(test_file):
+            # this result is generated via `benchmark_serving.py`
+
+            # attach the benchmarking command to raw_result
+            with open(test_file.with_suffix(".commands"), "r") as f:
+                command = json.loads(f.read())
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # add the result to raw_result
+            serving_results.append(raw_result)
+            continue
+
+        elif "latency" in f.name:
+            # this result is generated via `benchmark_latency.py`
+
+            # attach the benchmarking command to raw_result
+            with open(test_file.with_suffix(".commands"), "r") as f:
+                command = json.loads(f.read())
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # get different percentiles
+            for perc in [10, 25, 50, 75, 90, 99]:
+                # Multiply 1000 to convert the time unit from s to ms
+                raw_result.update(
+                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
+            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
+
+            # add the result to raw_result
+            latency_results.append(raw_result)
+            continue
+
+        elif "throughput" in f.name:
+            # this result is generated via `benchmark_throughput.py`
+
+            # attach the benchmarking command to raw_result
+            with open(test_file.with_suffix(".commands"), "r") as f:
+                command = json.loads(f.read())
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # add the result to raw_result
+            throughput_results.append(raw_result)
+            continue
+
+        print(f"Skipping {test_file}")
+
+    latency_results = pd.DataFrame.from_dict(latency_results)
+    serving_results = pd.DataFrame.from_dict(serving_results)
+    throughput_results = pd.DataFrame.from_dict(throughput_results)
+
+    raw_results_json = results_to_json(latency_results, throughput_results,
+                                       serving_results)
+
+    # remapping the key, for visualization purpose
     if not latency_results.empty:
-        f.write("## Latency tests\n")
-        f.write(latency_md_table)
-        f.write("\n")
-    if not throughput_results.empty:
-        f.write("## Throughput tests\n")
-        f.write(throughput_md_table)
-        f.write("\n")
+        latency_results = latency_results[list(
+            latency_column_mapping.keys())].rename(
+                columns=latency_column_mapping)
     if not serving_results.empty:
-        f.write("## Serving tests\n")
-        f.write(serving_md_table)
-        f.write("\n")
+        serving_results = serving_results[list(
+            serving_column_mapping.keys())].rename(
+                columns=serving_column_mapping)
+    if not throughput_results.empty:
+        throughput_results = throughput_results[list(
+            throughput_results_column_mapping.keys())].rename(
+                columns=throughput_results_column_mapping)
+
+    processed_results_json = results_to_json(latency_results,
+                                             throughput_results,
+                                             serving_results)
+
+    # get markdown tables
+    latency_md_table = tabulate(latency_results,
+                                headers='keys',
+                                tablefmt='pipe',
+                                showindex=False)
+    serving_md_table = tabulate(serving_results,
+                                headers='keys',
+                                tablefmt='pipe',
+                                showindex=False)
+    throughput_md_table = tabulate(throughput_results,
+                                   headers='keys',
+                                   tablefmt='pipe',
+                                   showindex=False)
+
+    # document the result
+    with open(results_folder / "benchmark_results.md", "w") as f:
+
+        results = read_markdown(
+            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
+        results = results.format(
+            latency_tests_markdown_table=latency_md_table,
+            throughput_tests_markdown_table=throughput_md_table,
+            serving_tests_markdown_table=serving_md_table,
+            benchmarking_results_in_json_string=processed_results_json)
+        f.write(results)
diff --git a/.buildkite/nightly-benchmarks/tests/descriptions.md b/.buildkite/nightly-benchmarks/tests/descriptions.md
new file mode 100644
index 0000000000000..891e4917070d9
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/tests/descriptions.md
@@ -0,0 +1,67 @@
+
+## Latency tests
+
+This test suite aims to test vllm's end-to-end latency under a controlled setup.
+
+- Input length: 32 tokens.
+- Output length: 128 tokens.
+- Batch size: fixed (8).
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Evaluation metrics: end-to-end latency (mean, median, p99).
+
+### Latency benchmarking results
+
+{latency_tests_markdown_table}
+
+## Throughput tests
+
+This test suite aims to test vllm's throughput.
+
+- Input length: randomly sample 200 prompts from the ShareGPT dataset (with fixed random seed).
+- Output length: the corresponding output length of these 200 prompts.
+- Batch size: dynamically determined by vllm to achieve maximum throughput.
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Evaluation metrics: throughput.
+
+### Throughput benchmarking results
+
+{throughput_tests_markdown_table}
+
+## Serving tests
+
+This test suite aims to test vllm's real serving metrics.
+
+- Input length: randomly sample 200 prompts from the ShareGPT dataset (with fixed random seed).
+- Output length: the corresponding output length of these 200 prompts.
+- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
+- **Average QPS (queries per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a Poisson process (with fixed random seed).
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Evaluation metrics: throughput, TTFT (time to first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
+
+### Serving benchmarking results
+
+{serving_tests_markdown_table}
+
+## json version of the benchmarking tables
+
+This section contains the data of the markdown tables above in JSON format.
+You can load the benchmarking tables into pandas dataframes as follows:
+
+```python
+import json
+import pandas as pd
+
+benchmarking_results_json = """The json string"""
+benchmarking_results = json.loads(benchmarking_results_json)
+latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
+throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
+serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
+```
+
+The json string for all benchmarking tables:
+```json
+{benchmarking_results_in_json_string}
+```
+
+You can also check the raw experiment data in the Artifacts tab of the Buildkite page.
+
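A note on the serving tests described above: Poisson arrivals at a given average QPS are equivalent to drawing exponentially distributed gaps between consecutive requests with mean 1/QPS and sending each request at the cumulative sum of those gaps. The sketch below only illustrates that relationship; the variable names and sampling code are assumptions for illustration, not the benchmark's actual implementation.

```python
import numpy as np

# Illustrative Poisson arrival schedule at an average rate of `qps` requests/s.
np.random.seed(0)            # fixed random seed, as in the serving tests
qps = 4.0
num_prompts = 200
# Gaps between consecutive requests are exponentially distributed with mean 1/qps.
gaps = np.random.exponential(1.0 / qps, size=num_prompts)
send_times = np.cumsum(gaps)  # time (in seconds) at which each request is issued
print(send_times[:5])
```

With QPS = inf the schedule degenerates to sending all requests at once, which is the burst case described above.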
diff --git a/.buildkite/nightly-benchmarks/latency-tests.json b/.buildkite/nightly-benchmarks/tests/latency-tests.json
similarity index 99%
rename from .buildkite/nightly-benchmarks/latency-tests.json
rename to .buildkite/nightly-benchmarks/tests/latency-tests.json
index 294a8c439c3ae..06488cd79110a 100644
--- a/.buildkite/nightly-benchmarks/latency-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json
@@ -29,4 +29,4 @@
             "num-iters": 15
         }
     }
-]
+]
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json
similarity index 99%
rename from .buildkite/nightly-benchmarks/serving-tests.json
rename to .buildkite/nightly-benchmarks/tests/serving-tests.json
index bb674661279d3..86a0fefa339f7 100644
--- a/.buildkite/nightly-benchmarks/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@@ -56,4 +56,4 @@
             "num_prompts": 200
         }
     }
-]
+]
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/throughput-tests.json b/.buildkite/nightly-benchmarks/tests/throughput-tests.json
similarity index 99%
rename from .buildkite/nightly-benchmarks/throughput-tests.json
rename to .buildkite/nightly-benchmarks/tests/throughput-tests.json
index db4f908d79971..41ac135748704 100644
--- a/.buildkite/nightly-benchmarks/throughput-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json
@@ -32,4 +32,4 @@
             "backend": "vllm"
         }
     }
-]
+]
\ No newline at end of file
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 11d1bf7a4c58f..767afd21aeacf 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -98,7 +98,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
     for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
         latencies.append(run_to_completion(profile_dir=None))
     latencies = np.array(latencies)
-    percentages = [10, 25, 50, 75, 90]
+    percentages = [10, 25, 50, 75, 90, 99]
     percentiles = np.percentile(latencies, percentages)
     print(f'Avg latency: {np.mean(latencies)} seconds')
    for percentage, percentile in zip(percentages, percentiles):
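The `benchmark_latency.py` change above adds p99 to the reported percentiles; the results converter then scales the selected values to milliseconds. Below is a self-contained sketch of the same reporting logic, using fabricated per-iteration timings in place of real measurements.

```python
import numpy as np

# Fabricated per-iteration latencies in seconds (stand-ins for real measurements).
latencies = np.array([1.02, 0.98, 1.10, 1.05, 0.97, 1.21, 1.00, 1.03])

percentages = [10, 25, 50, 75, 90, 99]
percentiles = np.percentile(latencies, percentages)

print(f'Avg latency: {np.mean(latencies)} seconds')
for percentage, percentile in zip(percentages, percentiles):
    print(f'{percentage}% percentile latency: {percentile} seconds')

# The converter later multiplies selected values by 1000 to report milliseconds.
print(f'P99 latency: {1000 * percentiles[-1]:.1f} ms')
```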