Added vLLM-Inferentia example to terraform ec2-examples
boringgeek committed Dec 28, 2024
1 parent 77cbfd0 commit 1fa34e9
Showing 9 changed files with 3,448 additions and 3,440 deletions.
6,771 changes: 3,385 additions & 3,386 deletions cdk/examples/generative_ai_rag/assets/reinvent.txt

Large diffs are not rendered by default.

34 changes: 17 additions & 17 deletions cdk/examples/generative_ai_rag/web-app/pages/rag_integration.py
@@ -61,7 +61,7 @@ def get_prompt_template(retrieved_passages: List[str]) -> str:
return f"""
You are an AI assistant answering questions about AWS re:Invent 2024 session information and general queries.
Your task is to analyze the user's question, categorize it, and provide an appropriate response.
Each session's information includes the following fields:
- Title
- Session Code
- Description
@@ -83,12 +83,12 @@ def get_prompt_template(retrieved_passages: List[str]) -> str:
2. REINVENT_INFORMATION
- This question type is for questions requesting information about specific sessions
- This question type is for questions requesting information about sessions held at specific venues and times
3. REINVENT_RECOMMENDATION
- This question type is for session recommendation questions for specific topics or interests
Second, analyze the user's question and provide a response based on the question type.
1. GENERAL
- Ignore the content in the retrieved passages.
- Provide a direct answer to the question based on your general knowledge.
@@ -117,12 +117,12 @@ def get_prompt_template(retrieved_passages: List[str]) -> str:
IMPORTANT:
- Always base your answers on the provided data and refrain from offering uncertain information.
- Your final response should only contain the actual answer to the user's question.
- Do not include any explanation of your thought process, categorization, or analysis in the final response.
- If retrieved passages are empty and question type is not GENERAL, respond with "Sorry. I couldn't find any related information."
- Do not modify fields data in the retrieved passages.
- If no session fully meets the conditions, recommend similar sessions and be sure to explain the reason.
CRITICAL RESPONSE FORMAT:
- You MUST format your entire response EXACTLY as follows, with no exceptions:
@@ -147,7 +147,7 @@ def get_prompt_template(retrieved_passages: List[str]) -> str:
[/QUESTION_TYPE]
[RESPONSE]
Based on your question, I recommend the following session:
1. Responsible generative AI tabletop: Governance and oversight [REPEAT]
- Session Code: GHJ208-R1
- Session Type: Gamified learning
@@ -195,14 +195,14 @@ def main():
if not knowledge_base_id:
st.info("Could not retrieve the knowledge base ID from Parameter Store")
st.stop()

agent_client = boto3.client('bedrock-agent-runtime')
bedrock_runtime_client = boto3.client('bedrock-runtime')

try:
# Retrieve relevant passages from the knowledge base
retrieved_results = retrieve_from_knowledge_base(agent_client, knowledge_base_id, prompt)

# Extract and format the retrieved passages
retrieved_passages = [result['content']['text'] for result in retrieved_results]
formatted_passages = "\n\n".join(f"Passage {i+1}:\n{passage}" for i, passage in enumerate(retrieved_passages))
@@ -213,7 +213,7 @@ def main():
response_started = False

message_placeholder = st.chat_message("assistant").empty()

# Generate the final response using the invoke_model API
system_prompt = get_prompt_template(formatted_passages)

@@ -230,7 +230,7 @@ def main():
elif response_started:
# If we're past the [RESPONSE] tag, continue accumulating the response content
response_content += chunk

# Display the response content if we've started collecting it
if response_started:
# Remove the [/RESPONSE] tag if present and display the content
@@ -240,14 +240,14 @@ def main():
import re
question_type_match = re.search(r'\[QUESTION_TYPE\](.*?)\[/QUESTION_TYPE\]', full_response, re.DOTALL)
response_match = re.search(r'\[RESPONSE\](.*?)\[/RESPONSE\]', full_response, re.DOTALL)

question_type = question_type_match.group(1).strip() if question_type_match else "UNKNOWN"
final_response = response_match.group(1).strip() if response_match else "I apologize. There was an issue generating an appropriate response."

message_placeholder.markdown(final_response)

st.session_state.messages.append({"role": "assistant", "content": final_response})

# Display citations only for non-general questions
if question_type not in ["GENERAL", "UNKNOWN"]:
with st.expander("Data Sources"):
@@ -264,10 +264,10 @@ def main():
st.warning("Session has expired. Starting a new session. Please enter your question again.")
else:
st.error("An error occurred while processing the response. Please check the logs for details.")

msg = "I encountered an issue while processing the response. Could you please rephrase your prompt or try a different question?"
st.session_state.messages.append({"role": "assistant", "content": msg})
st.chat_message("assistant").write(msg)

if __name__ == "__main__":
main()
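
The streaming handler above ultimately depends on the model honoring the `[QUESTION_TYPE]`/`[RESPONSE]` tag format that the prompt template enforces. As a minimal, self-contained sketch (using a hypothetical sample response, not real model output), the extraction logic behaves like this:

```python
import re

# Hypothetical model output following the tag format the prompt enforces.
full_response = """[QUESTION_TYPE]
REINVENT_RECOMMENDATION
[/QUESTION_TYPE]
[RESPONSE]
Based on your question, I recommend the following session:
1. Responsible generative AI tabletop: Governance and oversight [REPEAT]
[/RESPONSE]"""

# Same regexes as rag_integration.py: re.DOTALL lets '.' span newlines,
# and the non-greedy '(.*?)' stops at the first closing tag.
question_type_match = re.search(r'\[QUESTION_TYPE\](.*?)\[/QUESTION_TYPE\]', full_response, re.DOTALL)
response_match = re.search(r'\[RESPONSE\](.*?)\[/RESPONSE\]', full_response, re.DOTALL)

question_type = question_type_match.group(1).strip() if question_type_match else "UNKNOWN"
final_response = response_match.group(1).strip() if response_match else "No response generated."

print(question_type)   # REINVENT_RECOMMENDATION
print(final_response)  # recommendation text only, tags stripped
```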
2 changes: 1 addition & 1 deletion cdk/examples/other_stack/bedrock_stack.py
@@ -46,7 +46,7 @@ def __init__(
self.knowledge_base_data_source = bedrock.S3DataSource(self, 'KnowledgeBaseDataSource',
bucket=self.bucket,
knowledge_base=self.knowledge_base,
data_source_name='ReinventSessionInformationText',
chunking_strategy= bedrock.ChunkingStrategy.hierarchical(
overlap_tokens=60,
max_parent_token_size=1500,
2 changes: 1 addition & 1 deletion terraform/ec2-examples/vllm-inferentia/Dockerfile
@@ -16,4 +16,4 @@ RUN sed -i "/parser.add_argument('--block-size',/ {N;N;N;N;N;s/\[8, 16, 32\]/[8,
RUN python3 -m pip install ray
RUN pip install -U "triton>=3.0.0" # quoted so the shell does not treat ">=" as a redirection
# Set the entry point
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
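
Since the entrypoint is vLLM's OpenAI-compatible API server, an image built from this Dockerfile would typically be exercised along these lines. This is a hedged sketch, not part of the commit: the `vllm-neuron:latest` tag, the model ID placeholder, and the Neuron flags are assumptions to adjust for your build and instance size.

```bash
# Hypothetical smoke test on an inf2 host: pass the Neuron device through
# to the container and hand the usual vLLM server flags to the entrypoint.
docker run --rm -p 8000:8000 \
  --device=/dev/neuron0 \
  -e HUGGING_FACE_HUB_TOKEN=<YOUR HUGGING FACE API KEY> \
  vllm-neuron:latest \
  --model <MODEL ID> \
  --device neuron \
  --tensor-parallel-size 2 \
  --max-model-len 8192
```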
2 changes: 1 addition & 1 deletion terraform/ec2-examples/vllm-inferentia/README.md
@@ -58,7 +58,7 @@ ssh -i your-key.pem ec2-user@<ec2-public-ip>
```bash
export ECR_REPO_NAME=vllm-neuron
export AWS_REGION=us-west-2
export AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
```

#### 3. Create an ECR Repository
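
The step-3 commands are collapsed in this view; under the standard ECR workflow, the variables exported above would feed into something like the following (an illustrative assumption, not the README's verbatim content):

```bash
# Create the repository, authenticate Docker to ECR, then build and push.
aws ecr create-repository --repository-name "$ECR_REPO_NAME" --region "$AWS_REGION"
aws ecr get-login-password --region "$AWS_REGION" \
  | docker login --username AWS --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com"
docker build -t "$ECR_REPO_NAME" .
docker tag "$ECR_REPO_NAME:latest" "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$ECR_REPO_NAME:latest"
docker push "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$ECR_REPO_NAME:latest"
```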
57 changes: 28 additions & 29 deletions terraform/ec2-examples/vllm-inferentia/main.tf
@@ -1,14 +1,13 @@
provider "aws" {
region = local.region
}
data "aws_caller_identity" "current" {}
locals {
name = "ecs-demo-vllm-inferentia" # Defaul name of the project
region = "us-west-2" # Default region
instance_type = "inf2.8xlarge" # Default instance size - if you change this - you will need to modify the cpu/memory details in the task definition
vllm_container_image = "<ECR IMAGE URI>" # ECR Image URI you created when building and pushing your image
hugging_face_api_key = "<YOUR HUGGIN FACE API KEY>" # Your Hugging Face API Key
user_data = <<-EOT
name = "ecs-demo-vllm-inferentia" # Defaul name of the project
region = "us-west-2" # Default region
instance_type = "inf2.8xlarge" # Default instance size - if you change this - you will need to modify the cpu/memory details in the task definition
vllm_container_image = "<ECR IMAGE URI>" # ECR Image URI you created when building and pushing your image
hugging_face_api_key = "<YOUR HUGGIN FACE API KEY>" # Your Hugging Face API Key
user_data = <<-EOT
#!/bin/bash
cat <<'EOF' >> /etc/ecs/ecs.config
ECS_CLUSTER=${local.name}
@@ -33,8 +32,8 @@ locals {
################################################################################

module "alb_sg" {
source = "terraform-aws-modules/security-group/aws"
version = "~> 4.0"
source = "terraform-aws-modules/security-group/aws"
version = "~> 4.0"
name = "${local.name}-alb"
description = "Security group for ALB"
vpc_id = data.aws_vpc.core_infra.id
@@ -55,15 +54,15 @@ module "alb_sg" {
# ECS Task / Autoscaling Security Group
################################################################################
module "autoscaling_sg" {
source = "terraform-aws-modules/security-group/aws"
version = "~> 4.0"
source = "terraform-aws-modules/security-group/aws"
version = "~> 4.0"
name = "${local.name}-ecs-tasks"
description = "Autoscaling group security group"
vpc_id = data.aws_vpc.core_infra.id
ingress_with_source_security_group_id = [
{
      from_port = 8000
      to_port   = 8000
protocol = "tcp"
description = "Allow traffic from ALB"
source_security_group_id = module.alb_sg.security_group_id
@@ -78,8 +77,8 @@ module "autoscaling_sg" {
################################################################################
# Cluster Config
module "ecs_cluster" {
source = "terraform-aws-modules/ecs/aws//modules/cluster"
version = "~> 5.0"
source = "terraform-aws-modules/ecs/aws//modules/cluster"
version = "~> 5.0"
cluster_name = local.name
# Capacity provider - autoscaling group
default_capacity_provider_use_fargate = false
@@ -103,17 +102,17 @@ module "ecs_cluster" {

# Autoscaling Policy
module "autoscaling" {
source = "terraform-aws-modules/autoscaling/aws"
version = "~> 6.5"
name = "${local.name}-asg"
image_id = jsondecode(data.aws_ssm_parameter.ecs_neuron_optimized_ami.value)["image_id"]
instance_type = local.instance_type
source = "terraform-aws-modules/autoscaling/aws"
version = "~> 6.5"
name = "${local.name}-asg"
image_id = jsondecode(data.aws_ssm_parameter.ecs_neuron_optimized_ami.value)["image_id"]
instance_type = local.instance_type
security_groups = [module.autoscaling_sg.security_group_id]
user_data = base64encode(local.user_data)
ignore_desired_capacity_changes = true
  create_iam_instance_profile = true
  iam_role_name               = local.name
  iam_role_description        = "ECS role for ${local.name}"
iam_role_policies = {
AmazonEC2ContainerServiceforEC2Role = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role"
AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
@@ -129,9 +128,9 @@ module "autoscaling" {
# Configure block device mapping
block_device_mappings = [
{
device_name = "/dev/xvda" # Root volume device name
device_name = "/dev/xvda" # Root volume device name
ebs = {
        volume_size = 100 # 100GB storage
volume_type = "gp3" # General Purpose SSD (gp3 is recommended over gp2)
delete_on_termination = true
encrypted = true
@@ -151,9 +150,9 @@ resource "aws_ecs_task_definition" "neuronx_vllm" {
requires_compatibilities = ["EC2"]
container_definitions = jsonencode([
{
name = "neuronx-vllm"
image = local.vllm_container_image
cpu = 32768
name = "neuronx-vllm"
image = local.vllm_container_image
cpu = 32768
memory = 65536
portMappings = [
{
@@ -316,4 +315,4 @@ data "aws_vpc" "core_infra" {
name = "tag:Name"
values = ["core-infra"]
}
}
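
With the two placeholder locals (`vllm_container_image` and `hugging_face_api_key`) filled in, the stack deploys with the standard Terraform workflow (a sketch that assumes the `core-infra` VPC looked up by the data source above already exists):

```bash
terraform init    # installs the AWS provider pinned in versions.tf
terraform plan    # review: ASG, ECS cluster, task definition, security groups
terraform apply   # provision the cluster, capacity, and task resources
```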
10 changes: 10 additions & 0 deletions terraform/ec2-examples/vllm-inferentia/versions.tf
@@ -0,0 +1,10 @@
terraform {
required_version = ">= 1.0"

required_providers {
aws = {
source = "hashicorp/aws"
version = ">= 5.0"
}
}
}
4 changes: 2 additions & 2 deletions terraform/modules/codebuild/README.md
@@ -39,12 +39,12 @@ No modules.
| <a name="input_create_iam_role"></a> [create\_iam\_role](#input\_create\_iam\_role) | Set this variable to true if you want to create a role for AWS DevOps Tools | `bool` | `false` | no |
| <a name="input_description"></a> [description](#input\_description) | Short description of the project | `string` | `null` | no |
| <a name="input_ecr_repository"></a> [ecr\_repository](#input\_ecr\_repository) | The ECR repositories to which grant IAM access | `string` | n/a | yes |
| <a name="input_environment"></a> [environment](#input\_environment) | CodeBuild environment configuration details. At least one attribute is required since `environment` is a required by CodeBuild | `any` | <pre>{<br> "image": "aws/codebuild/standard:4.0"<br>}</pre> | no |
| <a name="input_environment"></a> [environment](#input\_environment) | CodeBuild environment configuration details. At least one attribute is required since `environment` is a required by CodeBuild | `any` | <pre>{<br/> "image": "aws/codebuild/standard:4.0"<br/>}</pre> | no |
| <a name="input_iam_role_name"></a> [iam\_role\_name](#input\_iam\_role\_name) | The name for the Role | `string` | n/a | yes |
| <a name="input_iam_role_use_name_prefix"></a> [iam\_role\_use\_name\_prefix](#input\_iam\_role\_use\_name\_prefix) | Determines whether the IAM role name (`iam_role_name`) is used as a prefix | `bool` | `true` | no |
| <a name="input_logs_config"></a> [logs\_config](#input\_logs\_config) | CodeBuild logs configuration details | `any` | `{}` | no |
| <a name="input_name"></a> [name](#input\_name) | CodeBuild Project name | `string` | n/a | yes |
| <a name="input_s3_bucket"></a> [s3\_bucket](#input\_s3\_bucket) | S3 bucket used for the artifact store | <pre>object({<br> s3_bucket_id = string<br> s3_bucket_arn = string<br> })</pre> | n/a | yes |
| <a name="input_s3_bucket"></a> [s3\_bucket](#input\_s3\_bucket) | S3 bucket used for the artifact store | <pre>object({<br/> s3_bucket_id = string<br/> s3_bucket_arn = string<br/> })</pre> | n/a | yes |
| <a name="input_service_role"></a> [service\_role](#input\_service\_role) | Amazon Resource Name (ARN) of the AWS Identity and Access Management (IAM) role that enables AWS CodeBuild to interact with dependent AWS services on behalf of the AWS account | `string` | n/a | yes |
| <a name="input_tags"></a> [tags](#input\_tags) | A map of tags to add to all resources | `map(string)` | `{}` | no |

6 changes: 3 additions & 3 deletions terraform/modules/codepipeline/README.md
@@ -35,13 +35,13 @@ No modules.

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_code_build_projects"></a> [code\_build\_projects](#input\_code\_build\_projects) | The Code Build projects to which grant IAM access | `list(string)` | <pre>[<br> "*"<br>]</pre> | no |
| <a name="input_code_deploy_resources"></a> [code\_deploy\_resources](#input\_code\_deploy\_resources) | The Code Deploy applications and deployment groups to which grant IAM access | `list(string)` | <pre>[<br> "*"<br>]</pre> | no |
| <a name="input_code_build_projects"></a> [code\_build\_projects](#input\_code\_build\_projects) | The Code Build projects to which grant IAM access | `list(string)` | <pre>[<br/> "*"<br/>]</pre> | no |
| <a name="input_code_deploy_resources"></a> [code\_deploy\_resources](#input\_code\_deploy\_resources) | The Code Deploy applications and deployment groups to which grant IAM access | `list(string)` | <pre>[<br/> "*"<br/>]</pre> | no |
| <a name="input_create_iam_role"></a> [create\_iam\_role](#input\_create\_iam\_role) | Set this variable to true if you want to create a role for AWS DevOps Tools | `bool` | `false` | no |
| <a name="input_iam_role_name"></a> [iam\_role\_name](#input\_iam\_role\_name) | The name for the Role | `string` | n/a | yes |
| <a name="input_iam_role_use_name_prefix"></a> [iam\_role\_use\_name\_prefix](#input\_iam\_role\_use\_name\_prefix) | Determines whether the IAM role name (`iam_role_name`) is used as a prefix | `bool` | `true` | no |
| <a name="input_name"></a> [name](#input\_name) | The CodePipeline pipeline name | `string` | n/a | yes |
| <a name="input_s3_bucket"></a> [s3\_bucket](#input\_s3\_bucket) | S3 bucket used for the artifact store | <pre>object({<br> s3_bucket_id = string<br> s3_bucket_arn = string<br> })</pre> | n/a | yes |
| <a name="input_s3_bucket"></a> [s3\_bucket](#input\_s3\_bucket) | S3 bucket used for the artifact store | <pre>object({<br/> s3_bucket_id = string<br/> s3_bucket_arn = string<br/> })</pre> | n/a | yes |
| <a name="input_service_role"></a> [service\_role](#input\_service\_role) | Amazon Resource Name (ARN) of the AWS Identity and Access Management (IAM) role that enables AWS CodeBuild to interact with dependent AWS services on behalf of the AWS account | `string` | n/a | yes |
| <a name="input_sns_topic"></a> [sns\_topic](#input\_sns\_topic) | The ARN of the SNS topic to use for pipline notifications | `string` | n/a | yes |
| <a name="input_stage"></a> [stage](#input\_stage) | Codepipeline Stage Configuration | `any` | `{}` | no |
