From 0dcb8223e63215156d857f7d7b88a3132ec44691 Mon Sep 17 00:00:00 2001
From: takipipo <69394786+takipipo@users.noreply.github.com>
Date: Wed, 24 Apr 2024 17:28:20 +0700
Subject: [PATCH 1/3] feat: add bedrock client

---
 src/llmperf/ray_clients/bedrock_client.py | 91 +++++++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 src/llmperf/ray_clients/bedrock_client.py

diff --git a/src/llmperf/ray_clients/bedrock_client.py b/src/llmperf/ray_clients/bedrock_client.py
new file mode 100644
index 0000000..c37a05b
--- /dev/null
+++ b/src/llmperf/ray_clients/bedrock_client.py
@@ -0,0 +1,91 @@
+import io
+import json
+import os
+import time
+from typing import Any, Dict
+
+import boto3
+import ray
+import json
+
+from llmperf import common_metrics
+from llmperf.models import RequestConfig
+from llmperf.ray_llm_client import LLMClient
+
+
+@ray.remote
+class BedrockClient(LLMClient):
+    """Client for AWS Bedrock Foundation Model on Llama-2-13b-chat"""
+
+    def __init__(self):
+        # Sagemaker doesn't return the number of tokens that are generated so we approximate it by
+        # using the llama tokenizer.
+        # self.tokenizer = LlamaTokenizerFast.from_pretrained(
+        #     "hf-internal-testing/llama-tokenizer"
+        # )
+
+    def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]:
+        if not os.environ.get("AWS_ACCESS_KEY_ID"):
+            raise ValueError("AWS_ACCESS_KEY_ID must be set.")
+        if not os.environ.get("AWS_SECRET_ACCESS_KEY"):
+            raise ValueError("AWS_SECRET_ACCESS_KEY must be set.")
+        if not os.environ.get("AWS_REGION_NAME"):
+            raise ValueError("AWS_REGION_NAME must be set.")
+
+        prompt = request_config.prompt
+        prompt, _ = prompt
+        model = request_config.model
+
+        bedrock_runtime = boto3.client(service_name="bedrock-runtime", region_name="us-west-2")
+
+        sampling_params = request_config.sampling_params
+
+        if "max_tokens" in sampling_params:
+            sampling_params["max_new_tokens"] = sampling_params["max_tokens"]
+            del sampling_params["max_tokens"]
+
+        body = {
+            "prompt": prompt,
+            "temperature": 0.5,
+            "top_p": 0.9,
+            "max_gen_len": 512,
+        }
+        time_to_next_token = []
+        tokens_received = 0
+        ttft = 0
+        error_response_code = None
+        generated_text = ""
+        error_msg = ""
+        output_throughput = 0
+        total_request_time = 0
+        metrics = {}
+
+        start_time = time.monotonic()
+        most_recent_received_token_time = time.monotonic()
+        try:
+            response = bedrock_runtime.invoke_model(modelId="meta.llama2-13b-chat-v1", body = json.dumps(body))
+            total_request_time = time.monotonic() - start_time
+
+            response_body = json.loads(response["body"].read())
+            tokens_received = response_body["generation_token_count"]
+            prompt_token = response_body["prompt_token_count"]
+
+            output_throughput = tokens_received / total_request_time
+
+        except Exception as e:
+            print(f"Warning Or Error: {e}")
+            print(error_response_code)
+            error_msg = str(e)
+            error_response_code = 500
+
+        metrics[common_metrics.ERROR_MSG] = error_msg
+        metrics[common_metrics.ERROR_CODE] = error_response_code
+        metrics[common_metrics.INTER_TOKEN_LAT] = 0
+        metrics[common_metrics.TTFT] = 0
+        metrics[common_metrics.E2E_LAT] = total_request_time
+        metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput
+        metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_token
+        metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received
+        metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_token
+
+        return metrics, generated_text, request_config
\ No newline at end of file

From 75123576ca3ef6714bfafba25fe146764b5393c3 Mon Sep 17 00:00:00 2001
From: takipipo <69394786+takipipo@users.noreply.github.com>
Date: Wed, 24 Apr 2024 17:34:22 +0700
Subject: [PATCH 2/3] feat: add bedrock choice in common.py

---
 src/llmperf/common.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llmperf/common.py b/src/llmperf/common.py
index 3efefa1..c7f908a 100644
--- a/src/llmperf/common.py
+++ b/src/llmperf/common.py
@@ -5,6 +5,7 @@
 )
 from llmperf.ray_clients.sagemaker_client import SageMakerClient
 from llmperf.ray_clients.vertexai_client import VertexAIClient
+from llmperf.ray_clients.bedrock_client import BedrockClient
 from llmperf.ray_llm_client import LLMClient
 
 
@@ -28,6 +29,8 @@ def construct_clients(llm_api: str, num_clients: int) -> List[LLMClient]:
         clients = [SageMakerClient.remote() for _ in range(num_clients)]
     elif llm_api == "vertexai":
         clients = [VertexAIClient.remote() for _ in range(num_clients)]
+    elif llm_api == "bedrock":
+        clients = [BedrockClient.remote() for _ in range(num_clients)]
     elif llm_api in SUPPORTED_APIS:
         clients = [LiteLLMClient.remote() for _ in range(num_clients)]
     else:

From 251a0b494f00e71613422beea128ebfebf54294d Mon Sep 17 00:00:00 2001
From: takipipo <69394786+takipipo@users.noreply.github.com>
Date: Thu, 25 Apr 2024 14:28:18 +0700
Subject: [PATCH 3/3] feat(bedrock): utilize max token len from request config

---
 .gitignore                                |  3 +-
 src/llmperf/common.py                     |  2 +-
 src/llmperf/ray_clients/bedrock_client.py | 47 +++++++++++------------
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/.gitignore b/.gitignore
index 54047ad..bd9cc94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -244,4 +244,5 @@
 output/
 result_outputs/
 __pycache__
-**/__pycache__/
\ No newline at end of file
+**/__pycache__/
+env
\ No newline at end of file
diff --git a/src/llmperf/common.py b/src/llmperf/common.py
index c7f908a..fe5ef8f 100644
--- a/src/llmperf/common.py
+++ b/src/llmperf/common.py
@@ -3,7 +3,7 @@
 from llmperf.ray_clients.openai_chat_completions_client import (
     OpenAIChatCompletionsClient,
 )
-from llmperf.ray_clients.sagemaker_client import SageMakerClient
+# from llmperf.ray_clients.sagemaker_client import SageMakerClient
 from llmperf.ray_clients.vertexai_client import VertexAIClient
 from llmperf.ray_clients.bedrock_client import BedrockClient
 from llmperf.ray_llm_client import LLMClient
diff --git a/src/llmperf/ray_clients/bedrock_client.py b/src/llmperf/ray_clients/bedrock_client.py
index c37a05b..a6b91a1 100644
--- a/src/llmperf/ray_clients/bedrock_client.py
+++ b/src/llmperf/ray_clients/bedrock_client.py
@@ -3,6 +3,7 @@
 import os
 import time
 from typing import Any, Dict
+from pprint import pprint
 
 import boto3
 import ray
@@ -17,39 +18,33 @@
 class BedrockClient(LLMClient):
     """Client for AWS Bedrock Foundation Model on Llama-2-13b-chat"""
 
-    def __init__(self):
-        # Sagemaker doesn't return the number of tokens that are generated so we approximate it by
-        # using the llama tokenizer.
-        # self.tokenizer = LlamaTokenizerFast.from_pretrained(
-        #     "hf-internal-testing/llama-tokenizer"
-        # )
-
     def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]:
         if not os.environ.get("AWS_ACCESS_KEY_ID"):
             raise ValueError("AWS_ACCESS_KEY_ID must be set.")
         if not os.environ.get("AWS_SECRET_ACCESS_KEY"):
             raise ValueError("AWS_SECRET_ACCESS_KEY must be set.")
-        if not os.environ.get("AWS_REGION_NAME"):
-            raise ValueError("AWS_REGION_NAME must be set.")
+        if not os.environ.get("AWS_REGION"):
+            raise ValueError("AWS_REGION must be set.")
 
         prompt = request_config.prompt
         prompt, _ = prompt
         model = request_config.model
 
-        bedrock_runtime = boto3.client(service_name="bedrock-runtime", region_name="us-west-2")
-
+        bedrock_runtime = boto3.client(
+            service_name="bedrock-runtime", region_name="us-west-2"
+        )
+
         sampling_params = request_config.sampling_params
-
-        if "max_tokens" in sampling_params:
-            sampling_params["max_new_tokens"] = sampling_params["max_tokens"]
-            del sampling_params["max_tokens"]
-
+
         body = {
             "prompt": prompt,
-            "temperature": 0.5,
-            "top_p": 0.9,
-            "max_gen_len": 512,
+            "max_gen_len": None,
         }
+        # use max gen length from sampling_params
+        if "max_tokens" in sampling_params:
+            body["max_gen_len"] = sampling_params["max_tokens"]
+            del sampling_params["max_tokens"]
+
         time_to_next_token = []
         tokens_received = 0
         ttft = 0
@@ -63,15 +58,19 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]:
         start_time = time.monotonic()
         most_recent_received_token_time = time.monotonic()
         try:
-            response = bedrock_runtime.invoke_model(modelId="meta.llama2-13b-chat-v1", body = json.dumps(body))
+            response = bedrock_runtime.invoke_model(
+                modelId="meta.llama2-13b-chat-v1", body=json.dumps(body)
+            )
             total_request_time = time.monotonic() - start_time
-
+
             response_body = json.loads(response["body"].read())
             tokens_received = response_body["generation_token_count"]
             prompt_token = response_body["prompt_token_count"]
-
+
+            pprint(response_body)
+
             output_throughput = tokens_received / total_request_time
-
+
         except Exception as e:
             print(f"Warning Or Error: {e}")
             print(error_response_code)
@@ -88,4 +87,4 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]:
         metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received
         metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_token
 
-        return metrics, generated_text, request_config
\ No newline at end of file
+        return metrics, generated_text, request_config
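Not part of the patch series above: a minimal smoke-test sketch of how the new "bedrock" option could be exercised once the patches are applied. It assumes valid AWS credentials are exported and that RequestConfig accepts the same fields the other llmperf clients use (model, a (prompt text, prompt length) tuple, sampling_params); the model and prompt values are illustrative only, since the client hard-codes modelId "meta.llama2-13b-chat-v1" and region "us-west-2".

    # Hypothetical usage sketch (not part of the patches): exercise the new
    # "bedrock" client through construct_clients(). Assumes AWS_ACCESS_KEY_ID,
    # AWS_SECRET_ACCESS_KEY and AWS_REGION are exported, and that RequestConfig
    # takes the same fields used by the other llmperf clients.
    import ray

    from llmperf.common import construct_clients
    from llmperf.models import RequestConfig

    ray.init(ignore_reinit_error=True)

    # After PATCH 2/3, llm_api="bedrock" returns BedrockClient ray actors.
    clients = construct_clients(llm_api="bedrock", num_clients=1)

    request_config = RequestConfig(
        model="meta.llama2-13b-chat-v1",          # the client hard-codes this modelId anyway
        prompt=("What is machine learning?", 5),  # (prompt text, prompt length) tuple, as unpacked in llm_request
        sampling_params={"max_tokens": 256},      # forwarded to the Bedrock body as max_gen_len (PATCH 3/3)
    )

    # llm_request() returns (metrics, generated_text, request_config).
    metrics, generated_text, _ = ray.get(clients[0].llm_request.remote(request_config))
    print(metrics)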