diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index 6a18be947be99..4036b32a46bf7 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -13,9 +13,17 @@ This benchmark will be *triggered* upon:
 
 **Benchmarking Duration**: about 1hr.
 
-## Configuring the workload for the quick benchmark
+**For benchmarking developers**: please try your best to constrain the duration of benchmarking to less than 1.5 hr so that it won't take forever to run.
 
-The workload of the quick benchmark contains two parts: latency tests in `latency-tests.json`, throughput tests in `throughput-tests.json` and serving tests in `serving-tests.json`.
+
+## Configuring the workload
+
+The benchmarking workload contains three parts:
+- Latency tests in `latency-tests.json`.
+- Throughput tests in `throughput-tests.json`.
+- Serving tests in `serving-tests.json`.
+
+See [descriptions.md](tests/descriptions.md) for detailed descriptions.
 
 ### Latency test
 
@@ -23,7 +31,6 @@ Here is an example of one test inside `latency-tests.json`:
 
 ```json
 [
-    ...
     {
         "test_name": "latency_llama8B_tp1",
         "parameters": {
@@ -34,7 +41,6 @@ Here is an example of one test inside `latency-tests.json`:
             "num_iters": 15
         }
     },
-    ...
 ]
 ```
 
@@ -57,7 +63,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
 
 ```
 [
-    ...
     {
         "test_name": "serving_llama8B_tp1_sharegpt",
         "qps_list": [1, 4, 16, "inf"],
@@ -77,7 +82,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
             "num_prompts": 200
         }
     },
-    ...
 ]
 ```
 
@@ -92,7 +96,8 @@ The number of this test is less stable compared to the delay and latency benchma
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
 
 ## Visualizing the results
-The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table.
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. If you do not see the table, please wait till the benchmark finish running.
 
-The JSON file is also attached within each buildkite job for further analysis.
\ No newline at end of file
+The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
+The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking job.
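Note on the visualization step: as the script diff below shows, `convert-results-json-to-markdown.py` renders each results dataframe into a markdown table with `tabulate` and substitutes the tables into the placeholders of `descriptions.md`. Here is a minimal, self-contained sketch of that templating idea; the one-placeholder template, GPU name, and numbers are fabricated for illustration and are not the real files or results.

```python
import pandas as pd
from tabulate import tabulate

# Toy stand-in for tests/descriptions.md, reduced to a single placeholder.
template = "## Latency tests\n\n{latency_tests_markdown_table}\n"

# Fabricated result row, only to show the mechanics of the conversion.
df = pd.DataFrame([{
    "Test name": "latency_llama8B_tp1",
    "GPU": "A100-SXM4-80GB",
    "Mean latency (ms)": 1234.5,
}])

# Render a markdown (pipe-format) table and fill the template.
table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
print(template.format(latency_tests_markdown_table=table))
```

The real script does the same with three tables plus a json dump of the dataframes (see the `convert-results-json-to-markdown.py` diff below).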
diff --git a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
index 6cff6917f8ad5..021473f76d0e5 100644
--- a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
@@ -343,9 +343,9 @@ main() {
   QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
 
   # benchmarking
-  run_serving_tests $QUICK_BENCHMARK_ROOT/serving-tests.json
-  run_latency_tests $QUICK_BENCHMARK_ROOT/latency-tests.json
-  run_throughput_tests $QUICK_BENCHMARK_ROOT/throughput-tests.json
+  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
+  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
+  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
 
 
   # postprocess benchmarking results
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 75cff84347942..9aa8162d18d2d 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -1,4 +1,5 @@
 import json
+import os
 from pathlib import Path
 
 import pandas as pd
@@ -11,12 +12,13 @@
 latency_column_mapping = {
     "test_name": "Test name",
     "gpu_type": "GPU",
-    "avg_latency": "Average latency (s)",
-    "P10": "P10 (s)",
-    "P25": "P25 (s)",
-    "P50": "P50 (s)",
-    "P75": "P75 (s)",
-    "P90": "P90 (s)",
+    "avg_latency": "Mean latency (ms)",
+    # "P10": "P10 (s)",
+    # "P25": "P25 (s)",
+    "P50": "Median",
+    # "P75": "P75 (s)",
+    # "P90": "P90 (s)",
+    "P99": "P99",
 }
 
 # thoughput tests and the keys that will be printed into markdown
@@ -24,11 +26,11 @@
 throughput_results_column_mapping = {
     "test_name": "Test name",
     "gpu_type": "GPU",
-    "num_requests": "# of req.",
-    "total_num_tokens": "Total # of tokens",
-    "elapsed_time": "Elapsed time (s)",
+    # "num_requests": "# of req.",
+    # "total_num_tokens": "Total # of tokens",
+    # "elapsed_time": "Elapsed time (s)",
     "requests_per_second": "Tput (req/s)",
-    "tokens_per_second": "Tput (tok/s)",
+    # "tokens_per_second": "Tput (tok/s)",
 }
 
 # serving results and the keys that will be printed into markdown
@@ -36,120 +38,148 @@
 serving_column_mapping = {
     "test_name": "Test name",
     "gpu_type": "GPU",
-    "completed": "# of req.",
+    # "completed": "# of req.",
     "request_throughput": "Tput (req/s)",
-    "input_throughput": "Input Tput (tok/s)",
-    "output_throughput": "Output Tput (tok/s)",
+    # "input_throughput": "Input Tput (tok/s)",
+    # "output_throughput": "Output Tput (tok/s)",
     "mean_ttft_ms": "Mean TTFT (ms)",
     # do not say TTFT again to avoid the table getting too wide
     "median_ttft_ms": "Median",
     "p99_ttft_ms": "P99",
-    "mean_tpot_ms": "Mean TPOT (ms)",
-    "median_tpot_ms": "Median",
-    "p99_tpot_ms": "P99",
+    # "mean_tpot_ms": "Mean TPOT (ms)",
+    # "median_tpot_ms": "Median",
+    # "p99_tpot_ms": "P99",
     "mean_itl_ms": "Mean ITL (ms)",
     "median_itl_ms": "Median",
     "p99_itl_ms": "P99",
 }
 
-for test_file in results_folder.glob("*.json"):
-
-    with open(test_file, "r") as f:
-        raw_result = json.loads(f.read())
-
-    if "serving" in str(test_file):
-        # this result is generated via `benchmark_serving.py`
-
-        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
-            command = json.loads(f.read())
-        raw_result.update(command)
-
-        # update the test name of this result
-        raw_result.update({"test_name": test_file.stem})
-
-        # add the result to raw_result
-        serving_results.append(raw_result)
-        continue
-
-    elif "latency" in f.name:
-        # this result is generated via `benchmark_latency.py`
-
-        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
-            command = json.loads(f.read())
-        raw_result.update(command)
-
-        # update the test name of this result
-        raw_result.update({"test_name": test_file.stem})
-
-        # get different percentiles
-        for perc in [10, 25, 50, 75, 90]:
-            raw_result.update(
-                {f"P{perc}": raw_result["percentiles"][str(perc)]})
-
-        # add the result to raw_result
-        latency_results.append(raw_result)
-        continue
-
-    elif "throughput" in f.name:
-        # this result is generated via `benchmark_throughput.py`
-
-        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
-            command = json.loads(f.read())
-        raw_result.update(command)
-
-        # update the test name of this result
-        raw_result.update({"test_name": test_file.stem})
-
-        # add the result to raw_result
-        throughput_results.append(raw_result)
-        continue
-
-    print(f"Skipping {test_file}")
-
-latency_results = pd.DataFrame.from_dict(latency_results)
-serving_results = pd.DataFrame.from_dict(serving_results)
-throughput_results = pd.DataFrame.from_dict(throughput_results)
-
-# remapping the key, for visualization purpose
-if not latency_results.empty:
-    latency_results = latency_results[list(
-        latency_column_mapping.keys())].rename(columns=latency_column_mapping)
-if not serving_results.empty:
-    serving_results = serving_results[list(
-        serving_column_mapping.keys())].rename(columns=serving_column_mapping)
-if not throughput_results.empty:
-    throughput_results = throughput_results[list(
-        throughput_results_column_mapping.keys())].rename(
-            columns=throughput_results_column_mapping)
-
-# get markdown tables
-latency_md_table = tabulate(latency_results,
-                            headers='keys',
-                            tablefmt='pipe',
-                            showindex=False)
-serving_md_table = tabulate(serving_results,
-                            headers='keys',
-                            tablefmt='pipe',
-                            showindex=False)
-throughput_md_table = tabulate(throughput_results,
-                               headers='keys',
-                               tablefmt='pipe',
-                               showindex=False)
-
-# document the result
-with open(results_folder / "benchmark_results.md", "w") as f:
+
+def read_markdown(file):
+    if os.path.exists(file):
+        with open(file, "r") as f:
+            return f.read() + "\n"
+    else:
+        return f"{file} not found.\n"
+
+
+def results_to_json(latency, throughput, serving):
+    return json.dumps({
+        'latency': latency.to_dict(),
+        'throughput': throughput.to_dict(),
+        'serving': serving.to_dict()
+    })
+
+
+if __name__ == "__main__":
+
+    # collect results
+    for test_file in results_folder.glob("*.json"):
+
+        with open(test_file, "r") as f:
+            raw_result = json.loads(f.read())
+
+        if "serving" in str(test_file):
+            # this result is generated via `benchmark_serving.py`
+
+            # attach the benchmarking command to raw_result
+            with open(test_file.with_suffix(".commands"), "r") as f:
+                command = json.loads(f.read())
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # add the result to raw_result
+            serving_results.append(raw_result)
+            continue
+
+        elif "latency" in f.name:
+            # this result is generated via `benchmark_latency.py`
+
+            # attach the benchmarking command to raw_result
+            with open(test_file.with_suffix(".commands"), "r") as f:
+                command = json.loads(f.read())
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # get different percentiles
+            for perc in [10, 25, 50, 75, 90, 99]:
+                # Multiply 1000 to convert the time unit from s to ms
+                raw_result.update(
+                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
+            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
+
+            # add the result to raw_result
+            latency_results.append(raw_result)
+            continue
+
+        elif "throughput" in f.name:
+            # this result is generated via `benchmark_throughput.py`
+
+            # attach the benchmarking command to raw_result
+            with open(test_file.with_suffix(".commands"), "r") as f:
+                command = json.loads(f.read())
+            raw_result.update(command)
+
+            # update the test name of this result
+            raw_result.update({"test_name": test_file.stem})
+
+            # add the result to raw_result
+            throughput_results.append(raw_result)
+            continue
+
+        print(f"Skipping {test_file}")
+
+    latency_results = pd.DataFrame.from_dict(latency_results)
+    serving_results = pd.DataFrame.from_dict(serving_results)
+    throughput_results = pd.DataFrame.from_dict(throughput_results)
+
+    raw_results_json = results_to_json(latency_results, throughput_results,
+                                       serving_results)
+
+    # remapping the key, for visualization purpose
     if not latency_results.empty:
-        f.write("## Latency tests\n")
-        f.write(latency_md_table)
-        f.write("\n")
-    if not throughput_results.empty:
-        f.write("## Throughput tests\n")
-        f.write(throughput_md_table)
-        f.write("\n")
+        latency_results = latency_results[list(
+            latency_column_mapping.keys())].rename(
+                columns=latency_column_mapping)
     if not serving_results.empty:
-        f.write("## Serving tests\n")
-        f.write(serving_md_table)
-        f.write("\n")
+        serving_results = serving_results[list(
+            serving_column_mapping.keys())].rename(
+                columns=serving_column_mapping)
+    if not throughput_results.empty:
+        throughput_results = throughput_results[list(
+            throughput_results_column_mapping.keys())].rename(
+                columns=throughput_results_column_mapping)
+
+    processed_results_json = results_to_json(latency_results,
+                                             throughput_results,
+                                             serving_results)
+
+    # get markdown tables
+    latency_md_table = tabulate(latency_results,
+                                headers='keys',
+                                tablefmt='pipe',
+                                showindex=False)
+    serving_md_table = tabulate(serving_results,
+                                headers='keys',
+                                tablefmt='pipe',
+                                showindex=False)
+    throughput_md_table = tabulate(throughput_results,
+                                   headers='keys',
+                                   tablefmt='pipe',
+                                   showindex=False)
+
+    # document the result
+    with open(results_folder / "benchmark_results.md", "w") as f:
+
+        results = read_markdown(
+            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
+        results = results.format(
+            latency_tests_markdown_table=latency_md_table,
+            throughput_tests_markdown_table=throughput_md_table,
+            serving_tests_markdown_table=serving_md_table,
+            benchmarking_results_in_json_string=processed_results_json)
+        f.write(results)
diff --git a/.buildkite/nightly-benchmarks/tests/descriptions.md b/.buildkite/nightly-benchmarks/tests/descriptions.md
new file mode 100644
index 0000000000000..891e4917070d9
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/tests/descriptions.md
@@ -0,0 +1,67 @@
+
+## Latency tests
+
+This test suite aims to test vllm's end-to-end latency under a controlled setup.
+
+- Input length: 32 tokens.
+- Output length: 128 tokens.
+- Batch size: fixed (8).
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Evaluation metrics: end-to-end latency (mean, median, p99).
+
+### Latency benchmarking results
+
+{latency_tests_markdown_table}
+
+## Throughput tests
+
+This test suite aims to test vllm's throughput.
+
+- Input length: randomly sample 200 prompts from the ShareGPT dataset (with fixed random seed).
+- Output length: the corresponding output length of these 200 prompts.
+- Batch size: dynamically determined by vllm to achieve maximum throughput.
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Evaluation metrics: throughput.
+
+### Throughput benchmarking results
+
+{throughput_tests_markdown_table}
+
+## Serving tests
+
+This test suite aims to test vllm's real serving metrics.
+
+- Input length: randomly sample 200 prompts from the ShareGPT dataset (with fixed random seed).
+- Output length: the corresponding output length of these 200 prompts.
+- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
+- **Average QPS (queries per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a Poisson process (with fixed random seed).
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Evaluation metrics: throughput, TTFT (time to first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
+
+### Serving benchmarking results
+
+{serving_tests_markdown_table}
+
+## json version of the benchmarking tables
+
+This section contains the data of the markdown tables above in JSON format.
+You can load the benchmarking tables into pandas dataframes as follows:
+
+```python
+import json
+import pandas as pd
+
+benchmarking_results_json = """The json string"""
+benchmarking_results = json.loads(benchmarking_results_json)
+latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
+throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
+serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
+```
+
+The json string for all benchmarking tables:
+```json
+{benchmarking_results_in_json_string}
+```
+
+You can also check the raw experiment data in the Artifacts tab of the Buildkite page.
+
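A note on the serving tests described above: Poisson arrivals at a given average QPS are equivalent to drawing exponentially distributed gaps between consecutive requests with mean 1/QPS and sending each request at the cumulative sum of those gaps. The sketch below only illustrates that relationship; the variable names and sampling code are assumptions for illustration, not the benchmark's actual implementation.

```python
import numpy as np

# Illustrative Poisson arrival schedule at an average rate of `qps` requests/s.
np.random.seed(0)            # fixed random seed, as in the serving tests
qps = 4.0
num_prompts = 200
# Gaps between consecutive requests are exponentially distributed with mean 1/qps.
gaps = np.random.exponential(1.0 / qps, size=num_prompts)
send_times = np.cumsum(gaps)  # time (in seconds) at which each request is issued
print(send_times[:5])
```

With QPS = inf the schedule degenerates to sending all requests at once, which is the burst case described above.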
diff --git a/.buildkite/nightly-benchmarks/latency-tests.json b/.buildkite/nightly-benchmarks/tests/latency-tests.json
similarity index 99%
rename from .buildkite/nightly-benchmarks/latency-tests.json
rename to .buildkite/nightly-benchmarks/tests/latency-tests.json
index 294a8c439c3ae..06488cd79110a 100644
--- a/.buildkite/nightly-benchmarks/latency-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json
@@ -29,4 +29,4 @@
             "num-iters": 15
         }
     }
-]
+]
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json
similarity index 99%
rename from .buildkite/nightly-benchmarks/serving-tests.json
rename to .buildkite/nightly-benchmarks/tests/serving-tests.json
index bb674661279d3..86a0fefa339f7 100644
--- a/.buildkite/nightly-benchmarks/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@@ -56,4 +56,4 @@
             "num_prompts": 200
         }
     }
-]
+]
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/throughput-tests.json b/.buildkite/nightly-benchmarks/tests/throughput-tests.json
similarity index 99%
rename from .buildkite/nightly-benchmarks/throughput-tests.json
rename to .buildkite/nightly-benchmarks/tests/throughput-tests.json
index db4f908d79971..41ac135748704 100644
--- a/.buildkite/nightly-benchmarks/throughput-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json
@@ -32,4 +32,4 @@
             "backend": "vllm"
         }
     }
-]
+]
\ No newline at end of file
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 11d1bf7a4c58f..767afd21aeacf 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -98,7 +98,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
     for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
         latencies.append(run_to_completion(profile_dir=None))
     latencies = np.array(latencies)
-    percentages = [10, 25, 50, 75, 90]
+    percentages = [10, 25, 50, 75, 90, 99]
     percentiles = np.percentile(latencies, percentages)
     print(f'Avg latency: {np.mean(latencies)} seconds')
    for percentage, percentile in zip(percentages, percentiles):
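The `benchmark_latency.py` change above adds p99 to the reported percentiles; the results converter then scales the selected values to milliseconds. Below is a self-contained sketch of the same reporting logic, using fabricated per-iteration timings in place of real measurements.

```python
import numpy as np

# Fabricated per-iteration latencies in seconds (stand-ins for real measurements).
latencies = np.array([1.02, 0.98, 1.10, 1.05, 0.97, 1.21, 1.00, 1.03])

percentages = [10, 25, 50, 75, 90, 99]
percentiles = np.percentile(latencies, percentages)

print(f'Avg latency: {np.mean(latencies)} seconds')
for percentage, percentile in zip(percentages, percentiles):
    print(f'{percentage}% percentile latency: {percentile} seconds')

# The converter later multiplies selected values by 1000 to report milliseconds.
print(f'P99 latency: {1000 * percentiles[-1]:.1f} ms')
```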