From be41d264db9489bea4b66587e2b8c4feb3d1fae3 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 30 Apr 2024 16:53:05 -0700 Subject: [PATCH] add refactored LLM benchmark code, initial commit --- .../inference/llm-bench/src/__init__.py | 0 .../inference/llm-bench/src/arg_parsing.py | 39 +++ .../llm-bench/src/benchmark_runner.py | 181 ++++++++++++++ .../llm-bench/src/clients/__init__.py | 7 + .../llm-bench/src/clients/azure_ml_client.py | 66 +++++ .../inference/llm-bench/src/clients/base.py | 35 +++ .../llm-bench/src/clients/dummy_client.py | 37 +++ .../llm-bench/src/clients/fastgen_client.py | 79 ++++++ .../llm-bench/src/clients/vllm_client.py | 82 +++++++ benchmarks/inference/llm-bench/src/config.py | 12 + benchmarks/inference/llm-bench/src/prompt.py | 61 +++++ .../inference/llm-bench/src/response.py | 6 + .../inference/llm-bench/src/sample_input.py | 225 ++++++++++++++++++ benchmarks/inference/llm-bench/src/status.py | 8 + 14 files changed, 838 insertions(+) create mode 100644 benchmarks/inference/llm-bench/src/__init__.py create mode 100644 benchmarks/inference/llm-bench/src/arg_parsing.py create mode 100644 benchmarks/inference/llm-bench/src/benchmark_runner.py create mode 100644 benchmarks/inference/llm-bench/src/clients/__init__.py create mode 100644 benchmarks/inference/llm-bench/src/clients/azure_ml_client.py create mode 100644 benchmarks/inference/llm-bench/src/clients/base.py create mode 100644 benchmarks/inference/llm-bench/src/clients/dummy_client.py create mode 100644 benchmarks/inference/llm-bench/src/clients/fastgen_client.py create mode 100644 benchmarks/inference/llm-bench/src/clients/vllm_client.py create mode 100644 benchmarks/inference/llm-bench/src/config.py create mode 100644 benchmarks/inference/llm-bench/src/prompt.py create mode 100644 benchmarks/inference/llm-bench/src/response.py create mode 100644 benchmarks/inference/llm-bench/src/sample_input.py create mode 100644 benchmarks/inference/llm-bench/src/status.py diff --git a/benchmarks/inference/llm-bench/src/__init__.py b/benchmarks/inference/llm-bench/src/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/inference/llm-bench/src/arg_parsing.py b/benchmarks/inference/llm-bench/src/arg_parsing.py new file mode 100644 index 000000000..9c61989e1 --- /dev/null +++ b/benchmarks/inference/llm-bench/src/arg_parsing.py @@ -0,0 +1,39 @@ +import argparse +from .clients import client_config_classes +from .benchmark_runner import BenchmarkConfig + +def parse_args_to_configs(): + def add_model(parser, model): + fields = model.model_fields + for name, field in fields.items(): + nargs = None + field_type = field.annotation + if getattr(field.annotation, "_name", "") == "List": + nargs = "+" + field_type = field.annotation.__args__[0] + parser.add_argument( + f"--{name}", + dest=name, + nargs=nargs, + type=field_type, + required=getattr(field, "required", False), + default=getattr(field, "default", None), + help=getattr(field, "description", ""), + ) + parser = argparse.ArgumentParser() + add_model(parser, BenchmarkConfig) + args, remaining_args = parser.parse_known_args() + unused_args = set(remaining_args) + benchmark_config = BenchmarkConfig(**vars(args)) + + client_config_class = client_config_classes[benchmark_config.api] + parser = argparse.ArgumentParser() + add_model(parser, client_config_class) + args, remaining_args = parser.parse_known_args() + unused_args = unused_args.intersection(remaining_args) + client_config = client_config_class(**vars(args)) + + if unused_args: + raise 
ValueError(f"Unused arguments: {unused_args}") + + return benchmark_config, client_config \ No newline at end of file diff --git a/benchmarks/inference/llm-bench/src/benchmark_runner.py b/benchmarks/inference/llm-bench/src/benchmark_runner.py new file mode 100644 index 000000000..d3315ff9a --- /dev/null +++ b/benchmarks/inference/llm-bench/src/benchmark_runner.py @@ -0,0 +1,181 @@ +from .config import BaseConfigModel +from .prompt import PromptGenerator, PromptConfig +from .clients import client_classes +from typing import List, Optional +from pydantic import Field +from pathlib import Path +import multiprocessing +import threading +import queue +import time +import yaml +import itertools +from tqdm import tqdm +from loguru import logger + + +class BenchmarkConfig(BaseConfigModel): + model: str = Field(..., description="HuggingFace.co model name") + api: str = "azure_ml" + warmup_requests: int = 1 + result_dir: Path = Path("./results") + use_threading: bool = False + config_files: List[Path] = [] + num_clients: List[int] = [1, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32] + prompt_generator_seed: Optional[int] = None + num_requests_per_client: int = 16 + max_prompt_length: int = 4000 + prompt_length: List[int] = [2600] + prompt_length_var: float = 0.3 + max_new_tokens: List[int] = [60] + max_new_tokens_var: float = 0.3 + streaming: bool = False + +class BenchmarkRunner(): + def __init__(self, benchmark_config: BaseConfigModel, client_config: BaseConfigModel) -> None: + logger.info("Initializing Benchmark Runner") + self.config = benchmark_config + self.client_config = client_config + self.client_class = client_classes[self.config.api] + + self.runnable_cls = multiprocessing.Process + self.barrier_cls = multiprocessing.Barrier + self.queue_cls = multiprocessing.Queue + if self.config.use_threading: + self.runnable_cls = threading.Thread + self.barrier_cls = threading.Barrier + self.queue_cls = queue.Queue + + def _generate_prompts(self, prompt_config: PromptConfig, num_clients: int) -> None: + logger.info("Generating Prompts") + prompt_generator = PromptGenerator(prompt_config) + warmup_prompts = self.config.warmup_requests * num_clients + workload_prompts = self.config.num_requests_per_client * num_clients + for prompt in prompt_generator(warmup_prompts + workload_prompts): + self.query_queue.put(prompt) + logger.info(f"Generated {warmup_prompts} warmup and {workload_prompts} workload prompts.") + + def _launch_clients(self, num_clients): + logger.info(f"Launching {num_clients} client(s)") + self.barrier = self.barrier_cls(num_clients + 1) + processes = [ + self.runnable_cls( + target=self._run_client, + args=( + self.barrier, + self.query_queue, + self.result_queue, + self.client_class, + self.client_config, + self.config.warmup_requests, + ), + ) + for _ in range(num_clients) + ] + for p in processes: + p.start() + + total_prompts = num_clients * self.config.num_requests_per_client + pbar = tqdm(total=total_prompts) + + self.barrier.wait() # Barrier 1 for master process + + num_results = 0 + while num_results != total_prompts: + num_results = self.result_queue.qsize() + pbar.update(num_results - pbar.n) + time.sleep(1) + pbar.close() + + self.barrier.wait() # Barrier 2 for master process + + + @staticmethod + def _run_client(barrier, query_queue, result_queue, client_class, client_config, warmup_requests): + client = client_class(client_config) + + for _ in range(warmup_requests): + prompt = query_queue.get(timeout=1.0) + request_kwargs = client.prepare_request(prompt) + raw_response = 
client.send_request(request_kwargs)
+            response = client.process_response(raw_response)
+
+        barrier.wait() # Barrier 1 for client process
+        try:
+            while not query_queue.empty():
+                prompt = query_queue.get(timeout=1.0)
+                request_kwargs = client.prepare_request(prompt)
+                start_time = time.time()
+                raw_response = client.send_request(request_kwargs)
+                end_time = time.time()
+                response = client.process_response(raw_response)
+                response.request_time = end_time - start_time
+                result_queue.put_nowait(response)
+        except queue.Empty:
+            pass
+
+        barrier.wait() # Barrier 2 for client process
+
+    def _benchmark_settings(self):
+        prompt_config_keys = list(PromptConfig.model_fields.keys()) + ["num_clients"]
+
+        configs_list = []
+        for f in self.config.config_files:
+            logger.info(f"Generating benchmark run settings from config file: {f}")
+            with open(f, "r") as fh:
+                file_config = yaml.safe_load(fh)
+            for key in prompt_config_keys:
+                if key not in file_config:
+                    file_config[key] = getattr(self.config, key)
+            configs_list.append(file_config)
+
+        if not configs_list:
+            logger.info("Generating benchmark run settings from command line args")
+            configs_list.append({key: getattr(self.config, key) for key in prompt_config_keys})
+
+        all_config_product = []
+        for config in configs_list:
+            for k, v in config.items():
+                if not isinstance(v, (list, tuple)):
+                    config[k] = [v]
+            for vals in itertools.product(*[config[k] for k in prompt_config_keys]):
+                all_config_product.append({k: v for k, v in zip(prompt_config_keys, vals)})
+
+        logger.info(f"Generated {len(all_config_product)} benchmark run setting(s)")
+
+        for config in all_config_product:
+            num_clients = config.pop("num_clients")
+            prompt_config = PromptConfig(**config)
+            yield num_clients, prompt_config
+
+    def _clear_queues(self):
+        self.query_queue = self.queue_cls()
+        self.result_queue = self.queue_cls()
+
+    def _save_results(self, num_clients, prompt_config):
+        response_details = []
+        while len(response_details) != num_clients * self.config.num_requests_per_client:
+            res = self.result_queue.get()
+            # vLLM returns concatenated tokens
+            response_details.append(res)
+        return response_details
+
+    def run(self):
+        self.client_class.start_service(self.client_config)
+        for num_clients, prompt_config in self._benchmark_settings():
+            logger.info(f"Running benchmark with {num_clients} client(s) and prompt config: {prompt_config}")
+            self._clear_queues()
+            self._generate_prompts(prompt_config=prompt_config, num_clients=num_clients)
+            #self._prepare_requests()
+            self._launch_clients(num_clients=num_clients)
+            #self._process_responses()
+            rd = self._save_results(prompt_config=prompt_config, num_clients=num_clients)
+            logger.info(f"Collected {len(rd)} response(s)")
+        self.client_class.stop_service(self.client_config)
+
+
+if __name__ == "__main__":
+    from .arg_parsing import parse_args_to_configs
+    benchmark_config, client_config = parse_args_to_configs()
+    benchmark_runner = BenchmarkRunner(benchmark_config, client_config)
+    benchmark_runner.run()
\ No newline at end of file
diff --git a/benchmarks/inference/llm-bench/src/clients/__init__.py b/benchmarks/inference/llm-bench/src/clients/__init__.py
new file mode 100644
index 000000000..61a4a0e89
--- /dev/null
+++ b/benchmarks/inference/llm-bench/src/clients/__init__.py
@@ -0,0 +1,7 @@
+from .azure_ml_client import AzureMLClientConfig, AzureMLClient
+from .fastgen_client import FastGenClientConfig, FastGenClient
+from .vllm_client import vLLMClientConfig, vLLMClient
+from .dummy_client import DummyClientConfig, DummyClient
+
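+# Registry mapping each supported --api value to its config model and client implementation.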
+client_config_classes = {"dummy": DummyClientConfig,"azure_ml": AzureMLClientConfig, "fastgen": FastGenClientConfig, "vllm": vLLMClientConfig} +client_classes = {"dummy": DummyClient, "azure_ml": AzureMLClient, "fastgen": FastGenClient, "vllm": vLLMClient} \ No newline at end of file diff --git a/benchmarks/inference/llm-bench/src/clients/azure_ml_client.py b/benchmarks/inference/llm-bench/src/clients/azure_ml_client.py new file mode 100644 index 000000000..de54f2e6f --- /dev/null +++ b/benchmarks/inference/llm-bench/src/clients/azure_ml_client.py @@ -0,0 +1,66 @@ +from .base import BaseClient +from ..config import BaseConfigModel +from ..status import Status +from ..prompt import Prompt +from ..response import Response + +import requests +import json +from typing import Any, Dict, Optional + + +class AzureMLClientConfig(BaseConfigModel): + api_url: str = "" + api_key: str = "" + deployment_name: str = "" + +class AzureMLClient(BaseClient): + def __init__(self, config: AzureMLClientConfig) -> None: + self.api_url = config.api_url + self.api_key = config.api_key + self.deployment_name = config.deployment_name + + @staticmethod + def start_service(config: AzureMLClientConfig) -> Status: + pass + + @staticmethod + def stop_service(config: AzureMLClientConfig) -> Status: + pass + + def prepare_request(self, prompt: Prompt) -> Dict[str, Any]: + if prompt.streaming: + raise ValueError("AzureMLClient does not support streaming prompts.") + + headers = { + "Content-Type": "application/json", + "Authorization": ("Bearer " + self.api_key), + "azureml-model-deployment": self.deployment_name, + } + pload = { + "input_data": { + "input_string": [ + prompt.text, + ], + "parameters": { + "max_tokens": prompt.max_new_tokens, + "return_full_text": prompt.return_full_text, + }, + } + } + return {"url": self.api_url, "headers": headers, "json": pload, "timeout": 180} + + def send_request(self, request_kwargs: Dict[str, Any]) -> Any: + while True: + try: # Sometimes the AML endpoint will return an error, so we send the request again + response = requests.post(**request_kwargs) + output = json.loads(response.content) + break + except Exception as e: + print(f"Connection failed with {e}. 
Retrying AML request") + + return output + + def process_response(self, raw_response: Any) -> Response: + response_text = raw_response[0] + return Response(response_text) \ No newline at end of file diff --git a/benchmarks/inference/llm-bench/src/clients/base.py b/benchmarks/inference/llm-bench/src/clients/base.py new file mode 100644 index 000000000..8e5ec26dd --- /dev/null +++ b/benchmarks/inference/llm-bench/src/clients/base.py @@ -0,0 +1,35 @@ +from abc import ABC, abstractmethod + +from ..prompt import Prompt +from ..response import Response +from ..status import Status +from ..config import BaseConfigModel + +from typing import Any, Dict + +class BaseClient(ABC): + @abstractmethod + def __init__(self): + pass + + @staticmethod + @abstractmethod + def start_service(config: BaseConfigModel) -> Status: + pass + + @staticmethod + @abstractmethod + def stop_service(config: BaseConfigModel) -> Status: + pass + + @abstractmethod + def prepare_request(self, prompt: Prompt) -> Dict[str, Any]: + pass + + @abstractmethod + def send_request(self, request_kwargs: Dict[str,Any]) -> Any: + pass + + @abstractmethod + def process_response(self, raw_response: Any) -> Response: + pass \ No newline at end of file diff --git a/benchmarks/inference/llm-bench/src/clients/dummy_client.py b/benchmarks/inference/llm-bench/src/clients/dummy_client.py new file mode 100644 index 000000000..b40771c4b --- /dev/null +++ b/benchmarks/inference/llm-bench/src/clients/dummy_client.py @@ -0,0 +1,37 @@ +from abc import ABC, abstractmethod + +from ..config import BaseConfigModel +from ..prompt import Prompt +from ..response import Response +from ..status import Status +from .base import BaseClient + +from typing import Any, Dict +import time +import random + +class DummyClientConfig(BaseConfigModel): + pass + +class DummyClient(BaseClient): + def __init__(self, config: DummyClientConfig) -> None: + pass + + @staticmethod + def start_service(config: DummyClientConfig) -> Status: + return Status("OK") + + @staticmethod + def stop_service(config: DummyClientConfig) -> Status: + return Status("OK") + + def prepare_request(self, prompt: Prompt) -> Dict[str, Any]: + return {"input_text": prompt.text, "max_new_tokens": prompt.max_new_tokens} + + def send_request(self, request_kwargs: Dict[str,Any]) -> Any: + time.sleep(random.uniform(1, 2)) + #time.sleep(1) + return request_kwargs["input_text"]*2 + + def process_response(self, raw_response: Any) -> Response: + return Response(raw_response) \ No newline at end of file diff --git a/benchmarks/inference/llm-bench/src/clients/fastgen_client.py b/benchmarks/inference/llm-bench/src/clients/fastgen_client.py new file mode 100644 index 000000000..7bd008288 --- /dev/null +++ b/benchmarks/inference/llm-bench/src/clients/fastgen_client.py @@ -0,0 +1,79 @@ +from .base import BaseClient +from ..status import Status +from ..prompt import Prompt +from ..response import Response +from ..config import BaseConfigModel +from pydantic import Field +from typing import Optional, Dict, Any +import time + +class FastGenClientConfig(BaseConfigModel): + model: str = Field(..., description="HuggingFace.co model name") + deployment_name: str = "fastgen-benchmark-deployment" + tp_size: int = 1 + num_replicas: int = 1 + max_ragged_batch_size: int = 768 + quantization_mode: Optional[str] = None + +class FastGenClient(BaseClient): + def __init__(self, config: FastGenClientConfig): + import mii + self.mii_client = mii.client(config.deployment_name) + self.streaming = config.streaming + + @staticmethod + def 
start_service(config: FastGenClientConfig) -> Status: + import mii + from deepspeed.inference import RaggedInferenceEngineConfig, DeepSpeedTPConfig + from deepspeed.inference.v2.ragged import DSStateManagerConfig + + tp_config = DeepSpeedTPConfig(tp_size=config.tp_size) + mgr_config = DSStateManagerConfig( + max_ragged_batch_size=config.max_ragged_batch_size, + max_ragged_sequence_count=config.max_ragged_batch_size, + ) + inference_config = RaggedInferenceEngineConfig( + tensor_parallel=tp_config, state_manager=mgr_config + ) + mii.serve( + config.model, + deployment_name=config.deployment_name, + tensor_parallel=config.tp_size, + inference_engine_config=inference_config, + replica_num=config.num_replicas, + quantization_mode=config.quantization_mode + ) + + @staticmethod + def stop_service(config: FastGenClientConfig) -> Status: + import mii + mii.client(config.deployment_name).terminate_server() + + def _streaming_callback(self, raw_response) -> None: + self.streaming_response_tokens.append(raw_response[0].generated_text) + time_now = time.time() + self.streaming_token_gen_time.append(time_now - time_last_token) + time_last_token = time_now + + def prepare_request(self, prompt: Prompt) -> Dict[str, Any]: + request_kwargs = {"prompts": prompt.text, "max_new_tokens": prompt.max_new_tokens} + if self.streaming: + self.streaming_response_tokens = [] + self.streaming_token_gen_time = [] + self.streaming_time_last_token = None + request_kwargs["streaming_fn"] = self._streaming_callback + return request_kwargs + + def send_request(self, request_kwargs: Dict[str,Any]) -> Any: + if self.streaming: + self.streaming_time_last_token = time.time() + response = self.mii_client(**request_kwargs) + if self.streaming: + response = self.streaming_response_tokens + + return response + + def process_response(self, raw_response: Any) -> Response: + if not self.streaming: + raw_response = raw_response[0].generated_text + return Response(raw_response) \ No newline at end of file diff --git a/benchmarks/inference/llm-bench/src/clients/vllm_client.py b/benchmarks/inference/llm-bench/src/clients/vllm_client.py new file mode 100644 index 000000000..f31f65ad1 --- /dev/null +++ b/benchmarks/inference/llm-bench/src/clients/vllm_client.py @@ -0,0 +1,82 @@ +from .base import BaseClient +from ..status import Status +from ..prompt import Prompt +from ..response import Response +from ..config import BaseConfigModel +from pydantic import Field +from typing import Optional, Dict, Any +import subprocess +import time +import requests +import json + +class vLLMClientConfig(BaseConfigModel): + model: str = Field(..., description="HuggingFace.co model name") + tp_size: int = 1 + port: int = 26500 + +class vLLMClient(BaseClient): + def __init__(self, config: vLLMClientConfig) -> None: + pass + + @staticmethod + def start_service(config: vLLMClientConfig) -> Status: + vllm_cmd = ( + "python", + "-m", + "vllm.entrypoints.api_server", + "--host", + "127.0.0.1", + "--port", + str(config.port), + "--tensor-parallel-size", + str(config.tp_size), + "--model", + config.model, + ) + p = subprocess.Popen( + vllm_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, close_fds=True + ) + start_time = time.time() + timeout_after = 60 * 5 # 5 minutes + while True: + line = p.stderr.readline().decode("utf-8") + if "Application startup complete" in line: + break + if "error" in line.lower(): + p.terminate() + #self.stop_service(config) + raise RuntimeError(f"Error starting VLLM server: {line}") + if time.time() - start_time > timeout_after: + 
p.terminate() + #self.stop_service(config) + raise TimeoutError("Timed out waiting for VLLM server to start") + time.sleep(0.01) + + @staticmethod + def stop_service(config: vLLMClientConfig) -> Status: + vllm_cmd = ("pkill", "-f", "vllm.entrypoints.api_server") + p = subprocess.Popen(vllm_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p.wait() + + def prepare_request(self, prompt: Prompt) -> Dict[str, Any]: + api_url = "http://localhost:26500/generate" + headers = {"User-Agent": "Benchmark Client"} + pload = { + "prompt": prompt.text, + "n": 1, + "use_beam_search": False, + "temperature": 1.0, + "top_p": 0.9, + "max_tokens": prompt.max_new_tokens, + "ignore_eos": False, + } + return {"url": api_url, "headers": headers, "json": pload, "timeout": 180} + + def send_request(self, request_kwargs: Dict[str,Any]) -> Any: + response = requests.post(**request_kwargs) + output = json.loads(response.content) + return output + + def process_response(self, raw_response: Any) -> Response: + return Response(raw_response["text"]) \ No newline at end of file diff --git a/benchmarks/inference/llm-bench/src/config.py b/benchmarks/inference/llm-bench/src/config.py new file mode 100644 index 000000000..74e4042fb --- /dev/null +++ b/benchmarks/inference/llm-bench/src/config.py @@ -0,0 +1,12 @@ +from pydantic import BaseModel, ConfigDict + +class BaseConfigModel(BaseModel): + model_config = ConfigDict( + validate_default=True, + validate_assignment=True, + use_enum_values=True, + populate_by_name=True, + extra="forbid", + arbitrary_types_allowed=True, + #protected_namespaces=(), + ) \ No newline at end of file diff --git a/benchmarks/inference/llm-bench/src/prompt.py b/benchmarks/inference/llm-bench/src/prompt.py new file mode 100644 index 000000000..b85d8229e --- /dev/null +++ b/benchmarks/inference/llm-bench/src/prompt.py @@ -0,0 +1,61 @@ +from typing import Iterable, Optional +from .config import BaseConfigModel +from dataclasses import dataclass +import numpy as np +from transformers import AutoTokenizer + +import os +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +@dataclass +class Prompt: + text: str + num_prompt_tokens: int + max_new_tokens: int + streaming: bool = False + return_full_text: bool = False + +class PromptConfig(BaseConfigModel): + model: str + max_prompt_length: int + prompt_length: int + prompt_length_var: float + max_new_tokens: int + max_new_tokens_var: float + streaming: bool + +class PromptGenerator(): + def __init__(self, config: PromptConfig) -> None: + self.model = config.model + self.max_prompt_length = config.max_prompt_length + self.prompt_length = config.prompt_length + self.prompt_length_var = config.prompt_length_var + self.max_new_tokens = config.max_new_tokens + self.max_new_tokens_var = config.max_new_tokens_var + #TODO: Make this better + from .sample_input import all_text + self.input_text = all_text + self._load_tokenizer() + + def _load_input_text(self) -> None: + pass + + def _load_tokenizer(self) -> None: + self.tokenizer = AutoTokenizer.from_pretrained(self.model) + + def __call__(self, num_prompts: Optional[int] = None) -> Iterable[Prompt]: + tokenized_input = self.tokenizer.batch_encode_plus( + [self.input_text], return_tensors="pt", padding=False + )["input_ids"][0] + + if num_prompts is None: + num_prompts = self.config.num_prompts + + for i in range(num_prompts): + prompt_length = min(int(np.random.normal(self.prompt_length, self.prompt_length_var)), self.max_prompt_length) + max_new_tokens = int(np.random.normal(self.max_new_tokens, 
self.max_new_tokens_var)) + yield Prompt( + text=self.tokenizer.decode(tokenized_input[i:prompt_length + i]), + num_prompt_tokens=prompt_length, + max_new_tokens=max_new_tokens, + ) diff --git a/benchmarks/inference/llm-bench/src/response.py b/benchmarks/inference/llm-bench/src/response.py new file mode 100644 index 000000000..834315d2f --- /dev/null +++ b/benchmarks/inference/llm-bench/src/response.py @@ -0,0 +1,6 @@ +from dataclasses import dataclass + +@dataclass +class Response: + text: str + request_time: float = 0 \ No newline at end of file diff --git a/benchmarks/inference/llm-bench/src/sample_input.py b/benchmarks/inference/llm-bench/src/sample_input.py new file mode 100644 index 000000000..bae18ce62 --- /dev/null +++ b/benchmarks/inference/llm-bench/src/sample_input.py @@ -0,0 +1,225 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This is a sample input consisting of: +# Code & Text + +all_text = """Deep learning involves the use of neural networks, which are computational models inspired by the structure and functioning of the human brain. These networks consist of interconnected nodes called neurons. Each neuron takes input, performs a computation, and produces an output. + During training, the neural network learns to make accurate predictions by adjusting its internal parameters. This adjustment is done using an optimization algorithm called gradient descent. Gradient descent calculates the gradients of a loss function, which measures the discrepancy between the predicted output of the network and the desired output. These gradients indicate the direction and magnitude of parameter updates that will minimize the loss. + The learning rate is an important hyperparameter in gradient descent. It determines the step size taken during parameter updates. A higher learning rate can lead to faster convergence, but it risks overshooting the optimal solution. On the other hand, a lower learning rate may converge more slowly, but it can result in more precise updates. + Activation functions are applied to the output of each neuron in a neural network. They introduce non-linearities, enabling the network to learn complex patterns and relationships in the data. Popular activation functions include the rectified linear unit (ReLU), sigmoid, and hyperbolic tangent (tanh). + By adjusting the parameters of the neural network during training, deep learning models learn to represent and generalize from complex data patterns. They have achieved remarkable success in various tasks, including image recognition, speech recognition, and natural language processing. + Here are the key fundamentals of deep learning for training large language models: + Neural Networks: At the heart of deep learning are artificial neural networks, which are inspired by the structure and functioning of biological neurons in the human brain. These networks consist of interconnected layers of artificial neurons called nodes or units. The nodes receive input, perform computations, and pass the results to the next layer. + Representation Learning: Deep learning models excel at learning meaningful representations of data. In the context of language, the models can automatically learn hierarchical representations of text, capturing complex relationships and semantic structures. 
+ Feedforward and Backpropagation: Deep learning models typically use feedforward neural networks, where information flows from the input layer through intermediate hidden layers to the output layer. The network makes predictions based on the input data, and the prediction error is then backpropagated through the network. Backpropagation calculates gradients that indicate how each parameter in the network should be adjusted to minimize the error. + Activation Functions: Activation functions introduce non-linearities to neural networks, enabling them to learn complex patterns. Common activation functions include the rectified linear unit (ReLU), sigmoid, and hyperbolic tangent (tanh). These functions determine the output of each neuron based on its weighted inputs. + Loss Functions: During training, a loss function is used to measure the discrepancy between the predicted output of the neural network and the desired output. In language modeling tasks, common loss functions include cross-entropy loss, which quantifies the difference in probability distributions. + Optimization Algorithms: Optimization algorithms determine how the network's parameters are updated based on the calculated gradients during backpropagation. Stochastic Gradient Descent (SGD) is a widely used algorithm that iteratively updates the parameters in the direction that minimizes the loss. Variants of SGD, such as Adam or RMSprop, adaptively adjust the learning rate to accelerate convergence. + Regularization Techniques: Deep learning models are prone to overfitting, where they memorize the training data but fail to generalize well to unseen examples. Regularization techniques such as dropout and weight decay are commonly used to prevent overfitting and improve generalization by adding constraints to the model's parameters. + Training on Large-Scale Datasets: Deep learning models, including large language models, require substantial amounts of labeled training data to learn effectively. Large-scale datasets are crucial to expose the model to diverse language patterns and ensure it captures a broad understanding of language. + Parallel Computing: Training large language models is computationally demanding. To accelerate the training process, parallel computing techniques, such as using multiple GPUs or distributed computing systems, are employed. These techniques allow for efficient processing of large datasets and speeding up the training iterations. + Transfer Learning and Fine-tuning: Transfer learning is a technique where a pre-trained model, trained on a large-scale dataset, is used as a starting point for a new task or dataset. Fine-tuning involves adjusting the pre-trained model's parameters on the new dataset to adapt it to the specific task at hand. This approach significantly reduces the training time and data requirements for new models. + The training process of a large language model typically involves the following steps: + Data Collection: A diverse and comprehensive dataset is collected, which typically consists of a vast range of text from sources like books, websites, articles, and other textual resources. The quality and variety of the dataset are crucial to ensure the model learns a broad understanding of language. + Preprocessing: The collected text data is preprocessed to clean and normalize it. This step involves removing irrelevant characters or symbols, converting the text to a consistent format, and organizing it into smaller units such as sentences or paragraphs. 
+ Tokenization: The preprocessed text is divided into individual tokens, which can be as small as words or even subword units. Tokenization helps in representing and processing the text efficiently during training. + Architecture Design: The model architecture, often based on the transformer architecture, is defined. Transformers are neural network models that excel in capturing long-range dependencies in sequential data, making them well-suited for language modeling tasks. + Model Initialization: The model parameters are randomly initialized to start the training process. These parameters will be adjusted iteratively during training to optimize the model's performance. + Training Loop: The model is trained using a large-scale computational infrastructure. The training loop typically involves several iterations over the dataset, known as epochs. During each epoch, the model processes the input data, generates predictions, and compares them with the expected output. The discrepancy between the predicted and expected output is used to compute a loss, which quantifies the model's performance. + Backpropagation and Optimization: Backpropagation is employed to calculate the gradients of the model's parameters with respect to the loss. These gradients indicate the direction and magnitude of the parameter updates needed to minimize the loss. Optimization algorithms, such as stochastic gradient descent (SGD) or its variants, are then used to update the model's parameters based on the computed gradients. + Iterative Refinement: Steps 6 and 7 are repeated for multiple epochs, gradually refining the model's performance. The model's ability to generate coherent and contextually relevant responses improves as it learns from the dataset. + Evaluation: The trained model is evaluated on a separate dataset to assess its performance and identify areas for improvement. Various metrics, such as perplexity or accuracy, can be used to evaluate the model's language generation capabilities. + Fine-tuning and Iteration: Based on the evaluation results, the model may undergo fine-tuning or further iterations of training to enhance its performance. This process helps in addressing specific limitations or biases and aligning the model's output more closely with desired expectations. + It's important to note that training a large language model from scratch is a computationally intensive process that requires substantial computational resources, including powerful hardware like GPUs or specialized hardware accelerators, and large-scale distributed systems to handle the massive amount of data and model parameters involved. + Here are ten highly recommended books that can help you learn deep learning: + "Deep Learning" by Ian Goodfellow, Yoshua Bengio, and Aaron Courville: + This comprehensive book covers the fundamental concepts of deep learning, including neural networks, optimization algorithms, and regularization techniques. It also explores advanced topics like generative models and deep reinforcement learning. + "Deep Learning with Python" by François Chollet: + Written by the creator of the Keras deep learning library, this book provides a practical introduction to deep learning with Python. It covers essential concepts, tools, and techniques, and includes hands-on examples and case studies. 
+ "Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow" by Aurélien Géron: + This book offers a hands-on approach to learning machine learning and deep learning using popular Python libraries such as Scikit-Learn, Keras, and TensorFlow. It covers various algorithms and provides practical examples and exercises. + "Deep Learning for Computer Vision" by Rajalingappaa Shanmugamani: + Focusing on deep learning techniques for computer vision tasks, this book explores topics such as convolutional neural networks (CNNs), image classification, object detection, and image generation. It includes code examples using Python and popular deep learning frameworks. + "Deep Learning: A Practitioner's Approach" by Josh Patterson and Adam Gibson: + This book offers a practical guide to implementing deep learning solutions using the Deeplearning4j library. It covers key concepts, architectures, and techniques, and includes code examples and case studies. + "Grokking Deep Learning" by Andrew Trask: + Geared towards beginners, this book provides an intuitive and accessible introduction to deep learning concepts. It covers neural networks, backpropagation, gradient descent, and other fundamental topics with clear explanations and visualizations. + "Deep Learning for Natural Language Processing" by Palash Goyal, Sumit Pandey, and Karan Jain: + Focusing on deep learning techniques for natural language processing (NLP), this book explores topics like word embeddings, recurrent neural networks (RNNs), and sequence-to-sequence models. It includes code examples using Python and popular NLP libraries. + "Deep Reinforcement Learning" by Pieter Abbeel and John Schulman: + This book provides an in-depth exploration of deep reinforcement learning, a subfield that combines deep learning with reinforcement learning. It covers topics like Q-learning, policy gradients, and deep Q-networks (DQNs) and provides practical examples. + "Deep Learning for Time Series Forecasting" by N.D. Lewis: + Focusing on deep learning techniques for time series data, this book covers topics such as recurrent neural networks (RNNs), long short-term memory (LSTM) networks, and attention models. It includes code examples using Python and popular deep learning frameworks. + "Interpretable Deep Learning" by Christoph Molnar: + This book delves into the challenges and techniques for interpreting and understanding deep learning models. It covers model visualization, feature importance, and other methods for explaining and interpreting deep learning predictions. + These books cover a range of deep learning topics and provide valuable insights and practical guidance for learning and applying deep learning techniques. Choose the ones that align with your interests and learning style to enhance your understanding of deep learning. + Here are 10 popular GitHub projects that can be useful for building large language models (LLMs) or working with natural language processing (NLP) tasks: + TensorFlow: An open-source deep learning framework that provides tools and resources for building and training LLMs. It offers extensive support for various neural network architectures and has a large community. + PyTorch: Another popular deep learning framework that provides a dynamic computational graph and a wide range of tools for building LLMs. It is known for its user-friendly interface and flexibility. 
+ Hugging Face Transformers: A library that provides pre-trained models and a high-level API for natural language understanding (NLU) tasks, including LLMs. It supports popular models like GPT, BERT, and RoBERTa. + Fairseq: A library developed by Facebook AI Research that focuses on sequence modeling tasks, including LLMs. It offers pre-trained models and tools for training and evaluating models using sequence-to-sequence architectures. + AllenNLP: A powerful NLP research library that simplifies the process of building and evaluating deep learning models. It offers pre-built components for common NLP tasks and supports LLMs with various architectures. + OpenAI GPT-3: Although not available on GitHub, OpenAI's GPT-3 language model is widely recognized and can be accessed via the OpenAI API. It offers state-of-the-art language generation capabilities and can be used for various NLP tasks. + BERT: A pre-trained language model developed by Google Research that has achieved exceptional results on various NLP benchmarks. The official implementation is available on GitHub and can be fine-tuned for specific tasks. + spaCy: A popular Python library for NLP tasks that provides efficient and scalable tools for tokenization, named entity recognition, part-of-speech tagging, and more. It integrates well with deep learning frameworks. + FastText: A library developed by Facebook Research that provides efficient tools for text classification and word representation learning. It offers pre-trained word embeddings and supports training LLMs for classification tasks. + NLTK (Natural Language Toolkit): A comprehensive library for NLP tasks in Python. It provides various modules for tokenization, stemming, tagging, parsing, and more. Although it doesn't focus explicitly on LLMs, it is widely used for preprocessing text data in NLP pipelines. + These projects offer a range of resources, pre-trained models, and tools that can assist you in building and working with large language models. Make sure to review the documentation and examples provided by each project to understand their capabilities and how they can be integrated into your workflow. + Here are some popular backend libraries that are commonly used for deep learning: + TensorFlow: Developed by Google's Brain Team, TensorFlow is one of the most widely used deep learning frameworks. It provides a flexible and comprehensive ecosystem for building and deploying machine learning models. TensorFlow offers high-level APIs for easy model construction, as well as lower-level APIs for fine-grained control. It supports distributed computing and has extensive community support. + PyTorch: Developed by Facebook's AI Research lab, PyTorch is known for its simplicity and dynamic computational graph. It allows for intuitive model construction and debugging. PyTorch is widely used in both research and industry due to its flexibility, support for dynamic networks, and strong GPU acceleration capabilities. + Keras: Initially developed as a user-friendly deep learning library, Keras is now integrated as the official high-level API in TensorFlow. It provides a user-friendly and modular interface for building neural networks. Keras abstracts away many complexities and allows users to build models with just a few lines of code. It supports multiple backends, including TensorFlow and Theano. + Theano: Although its development has been discontinued, Theano was one of the first widely-used deep learning libraries. 
It allows for efficient mathematical operations on multi-dimensional arrays and supports GPU acceleration. Theano was influential in shaping the deep learning landscape and served as a precursor to subsequent frameworks. + Caffe: Developed by the Berkeley Vision and Learning Center (BVLC), Caffe is a popular deep learning framework known for its efficiency and simplicity. It is particularly suitable for convolutional neural networks (CNNs) and image-related tasks. Caffe has a clean and expressive architecture description language that makes it easy to define and train deep models. + MXNet: MXNet is an open-source deep learning framework developed by Apache. It offers a flexible and efficient interface for building and deploying neural networks. MXNet provides a hybrid frontend that allows users to seamlessly switch between symbolic and imperative programming. It is known for its scalability and supports multiple programming languages. + Chainer: Chainer is a flexible deep learning framework that focuses on dynamic neural networks. It allows for intuitive model construction using imperative programming, making it easy to define complex architectures and manipulate data within the network. Chainer is known for its "define-by-run" approach, which facilitates dynamic computations. + Microsoft Cognitive Toolkit (CNTK): CNTK is a deep learning framework developed by Microsoft. It provides a highly efficient and scalable implementation of deep neural networks. CNTK supports both declarative and imperative programming models, making it suitable for both research and production-level deployments. + Deeplearning4j: Deeplearning4j is an open-source deep learning library that focuses on scalability and performance. It is designed to integrate with the Java ecosystem and supports distributed computing. Deeplearning4j provides tools for building various types of neural networks and offers integration with other popular libraries like Hadoop and Spark. + PaddlePaddle: PaddlePaddle (PArallel Distributed Deep LEarning) is a deep learning framework developed by Baidu. It emphasizes scalability and supports large-scale distributed training. PaddlePaddle provides a rich set of built-in models and algorithms, making it accessible to both beginners and advanced users. + Each of these backend libraries offers unique features, performance characteristics, and levels of abstraction. The choice of a backend library depends on factors such as your programming language preferences, the complexity of your models, the availability of community support, and the specific requirements of your deep learning project. 
+ Here's an example code snippet that demonstrates how to create a GPT-Neox20B model using the Hugging Face Transformers library and start fine-tuning it with sample data from the '/tmp/wikitext' directory: + + import torch + from transformers import GPTNeoForCausalLM, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments + + # Load the GPT-Neo model and tokenizer + model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B") + tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B") + + # Set the path to the training data + data_path = "/tmp/wikitext" + + # Define the dataset and data collator + dataset = TextDataset(tokenizer=tokenizer, file_path=data_path, block_size=128) + data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) + + # Define the training arguments + training_args = TrainingArguments( + output_dir="./output_dir", # Directory to save the fine-tuned model and logs + overwrite_output_dir=True, + num_train_epochs=3, # Number of training epochs + per_device_train_batch_size=8, + save_steps=500, # Number of updates steps before saving the model + save_total_limit=2, # Limit the total number of saved models + prediction_loss_only=True, + ) + + # Initialize the Trainer + trainer = Trainer( + model=model, + args=training_args, + data_collator=data_collator, + train_dataset=dataset, + ) + + # Start fine-tuning + trainer.train() + + # Save the fine-tuned model + trainer.save_model("./fine-tuned-model") + + To deploy the GPT-Neo model on an HTTP server, you can use a framework like Flask or FastAPI. Here's a basic example using Flask: + + Install Flask: + + pip install flask + + Create a Python file, let's call it app.py, and import the necessary libraries: + + from flask import Flask, request, jsonify + from transformers import GPTNeoForCausalLM, GPT2Tokenizer + + # Initialize Flask app + app = Flask(__name__) + + # Load the pretrained GPT-Neo model and tokenizer + model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B") + tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B") + Define an endpoint for model inference: + + python + Copy code + @app.route("/generate", methods=["POST"]) + def generate_text(): + data = request.get_json() + input_text = data["input_text"] + + # Tokenize the input text + input_ids = tokenizer.encode(input_text, return_tensors="pt") + + # Generate text with the model + output = model.generate(input_ids, max_length=100, num_return_sequences=1) + generated_text = tokenizer.decode(output[0], skip_special_tokens=True) + + return jsonify({"generated_text": generated_text}) + Run the Flask app: + + python + Copy code + if __name__ == "__main__": + app.run(host="0.0.0.0", port=5000) + The code that includes the text generation sampling functions and provides a commented example: + + import requests + import numpy as np + + class TextGeneratorAPI: + def __init__(self, server_url): + self.server_url = server_url + + def generate_text(self, input_text, sampling_algorithm="greedy", temperature=0.7): + url = f"{self.server_url}/generate" + payload = { + "input_text": input_text, + "sampling_algorithm": sampling_algorithm, + "temperature": temperature + } + response = requests.post(url, json=payload) + generated_text = response.json()["generated_text"] + return generated_text + + def greedy_sampling(self, logits): + return np.argmax(logits) + + def random_sampling(self, logits): + probabilities = np.exp(logits / temperature) + probabilities = probabilities / 
np.sum(probabilities) + return np.random.choice(len(logits), p=probabilities) + + def top_k_sampling(self, logits, k=10): + indices = np.argsort(logits)[-k:] + probabilities = np.exp(logits[indices] / temperature) + probabilities = probabilities / np.sum(probabilities) + return np.random.choice(indices, p=probabilities) + + def top_p_sampling(self, logits, p=0.9): + sorted_logits = np.sort(logits)[::-1] + cumulative_probs = np.cumsum(np.exp(sorted_logits) / temperature) + indices = np.arange(len(sorted_logits)) + selected_indices = indices[cumulative_probs <= p] + probabilities = np.exp(logits[selected_indices] / temperature) + probabilities = probabilities / np.sum(probabilities) + return np.random.choice(selected_indices, p=probabilities) + In this updated code, the TextGeneratorAPI class includes the additional sampling functions: greedy_sampling, random_sampling, top_k_sampling, and top_p_sampling. These functions take logits (output of the model) as input and return the index of the selected token based on the respective sampling algorithm. + The greedy_sampling function selects the token with the highest probability (argmax) as the next token. The random_sampling function applies a temperature scaling to the logits and then samples from the resulting probability distribution. The top_k_sampling function selects from the top-k tokens with the highest probabilities. The top_p_sampling function selects from the tokens with cumulative probabilities below a certain threshold (top-p). + You can now use the updated TextGeneratorAPI class with the sampling functions. Here's an example: + + api = TextGeneratorAPI(server_url="http://localhost:5000") + + input_text = "Once upon a time" + + # Generate text using different sampling algorithms and temperatures + greedy_text = api.generate_text(input_text, sampling_algorithm="greedy") + random_text = api.generate_text(input_text, sampling_algorithm="random") + top_k_text = api.generate_text(input_text, sampling_algorithm="top_k", temperature=0.8) + top_p_text = api.generate_text(input_text, sampling_algorithm="top_p", temperature=0.9) + + print("Greedy Sampling:", greedy_text) + print("Random Sampling:", random_text) + print("Top-k Sampling:", top_k_text) + print("Top-p Sampling:", top_p_text) + Make sure to adjust the server_url with the appropriate URL of your HTTP server, and ensure that the server is running and accessible before making requests through the API. + """ diff --git a/benchmarks/inference/llm-bench/src/status.py b/benchmarks/inference/llm-bench/src/status.py new file mode 100644 index 000000000..f0503bd80 --- /dev/null +++ b/benchmarks/inference/llm-bench/src/status.py @@ -0,0 +1,8 @@ +from enum import Enum + +class Status(str, Enum): + OK = "OK" + FAIL = "FAIL" + + def __str__(self) -> str: + return self.value \ No newline at end of file