Merge branch 'awslabs:main' into main

ovaleanu authored Apr 8, 2024
2 parents 7f6c690 + c50c181 commit fc248b6
Showing 24 changed files with 567 additions and 45 deletions.
8 changes: 6 additions & 2 deletions ai-ml/jark-stack/terraform/README.md
@@ -20,13 +20,14 @@ Docs coming soon...
| Name | Version |
|------|---------|
| <a name="provider_aws"></a> [aws](#provider\_aws) | >= 3.72 |
| <a name="provider_aws.ecr"></a> [aws.ecr](#provider\_aws.ecr) | >= 3.72 |
| <a name="provider_kubernetes"></a> [kubernetes](#provider\_kubernetes) | >= 2.10 |

## Modules

| Name | Source | Version |
|------|--------|---------|
| <a name="module_data_addons"></a> [data\_addons](#module\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.1 |
| <a name="module_data_addons"></a> [data\_addons](#module\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.31.4 |
| <a name="module_ebs_csi_driver_irsa"></a> [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.20 |
| <a name="module_eks"></a> [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 |
| <a name="module_eks_blueprints_addons"></a> [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.2 |
@@ -41,13 +41,16 @@ Docs coming soon...
| [kubernetes_namespace_v1.jupyterhub](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource |
| [kubernetes_secret_v1.huggingface_token](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource |
| [kubernetes_storage_class.default_gp3](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/storage_class) | resource |
| [aws_availability_zones.available](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/availability_zones) | data source |
| [aws_ecrpublic_authorization_token.token](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ecrpublic_authorization_token) | data source |
| [aws_eks_cluster_auth.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster_auth) | data source |

## Inputs

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_eks_cluster_version"></a> [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.27"` | no |
| <a name="input_eks_cluster_version"></a> [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.29"` | no |
| <a name="input_enable_aws_efa_k8s_device_plugin"></a> [enable\_aws\_efa\_k8s\_device\_plugin](#input\_enable\_aws\_efa\_k8s\_device\_plugin) | Enable AWS EFA K8s Device Plugin | `bool` | `false` | no |
| <a name="input_huggingface_token"></a> [huggingface\_token](#input\_huggingface\_token) | Hugging Face Secret Token | `string` | `"DUMMY_TOKEN_REPLACE_ME"` | no |
| <a name="input_name"></a> [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"jark-stack"` | no |
| <a name="input_region"></a> [region](#input\_region) | region | `string` | `"us-west-2"` | no |
181 changes: 165 additions & 16 deletions ai-ml/jark-stack/terraform/addons.tf
@@ -105,28 +105,49 @@ module "eks_blueprints_addons" {
    values = [templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})]
  }

  helm_releases = {
    #---------------------------------------
    # NVIDIA Device Plugin Add-on
    #---------------------------------------
    nvidia-device-plugin = {
      description      = "A Helm chart for NVIDIA Device Plugin"
      namespace        = "nvidia-device-plugin"
      create_namespace = true
      chart            = "nvidia-device-plugin"
      chart_version    = "0.14.0"
      repository       = "https://nvidia.github.io/k8s-device-plugin"
      values           = [file("${path.module}/helm-values/nvidia-values.yaml")]
  #---------------------------------------
  # Karpenter Autoscaler for EKS Cluster
  #---------------------------------------
  enable_karpenter                  = true
  karpenter_enable_spot_termination = true
  karpenter_node = {
    iam_role_additional_policies = {
      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
    }
  }
  karpenter = {
    chart_version       = "v0.34.0"
    repository_username = data.aws_ecrpublic_authorization_token.token.user_name
    repository_password = data.aws_ecrpublic_authorization_token.token.password
  }
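  # Karpenter's Helm chart is pulled from ECR Public, which is why the release
  # authenticates with the aws_ecrpublic_authorization_token data source listed
  # under Resources in the README above.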

  #---------------------------------------
  # Argo Workflows & Argo Events
  #---------------------------------------
  enable_argo_workflows = true
  argo_workflows = {
    name       = "argo-workflows"
    namespace  = "argo-workflows"
    repository = "https://argoproj.github.io/argo-helm"
    values     = [templatefile("${path.module}/helm-values/argo-workflows-values.yaml", {})]
  }

  enable_argo_events = true
  argo_events = {
    name       = "argo-events"
    namespace  = "argo-events"
    repository = "https://argoproj.github.io/argo-helm"
    values     = [templatefile("${path.module}/helm-values/argo-events-values.yaml", {})]
  }

}

#---------------------------------------------------------------
# Data on EKS Kubernetes Addons
#---------------------------------------------------------------
module "data_addons" {
  source  = "aws-ia/eks-data-addons/aws"
  version = "~> 1.1" # ensure to update this to the latest/desired version
  version = "~> 1.31.4" # ensure to update this to the latest/desired version

  oidc_provider_arn = module.eks.oidc_provider_arn

@@ -140,19 +140,161 @@ module "data_addons" {
    values = [file("${path.module}/helm-values/jupyterhub-values.yaml")]
  }

  enable_volcano = true
  #---------------------------------------
  # Kuberay Operator
  #---------------------------------------
  enable_kuberay_operator = true
  kuberay_operator_helm_config = {
    version = "1.1.0"
    # Enabling Volcano as Batch scheduler for KubeRay Operator
    values = [
      <<-EOT
        batchScheduler:
          enabled: true
      EOT
    ]
  }
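  # With batchScheduler.enabled, RayCluster and RayJob pods can opt in to
  # Volcano gang scheduling by carrying the "ray.io/scheduler-name: volcano" label.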

  #---------------------------------------------------------------
  # KubeRay Operator Add-on
  # NVIDIA Device Plugin Add-on
  #---------------------------------------------------------------
  enable_kuberay_operator = true
  enable_nvidia_device_plugin = true
  nvidia_device_plugin_helm_config = {
    version = "v0.14.5"
    name    = "nvidia-device-plugin"
    values = [
      <<-EOT
        gfd:
          enabled: true
        nfd:
          worker:
            tolerations:
              - key: nvidia.com/gpu
                operator: Exists
                effect: NoSchedule
              - operator: "Exists"
      EOT
    ]
  }
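  # gfd.enabled turns on GPU Feature Discovery, which labels GPU nodes
  # (e.g. nvidia.com/gpu.product) so workloads can select specific hardware;
  # the NFD worker tolerations let discovery run on the tainted GPU nodes.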

  #---------------------------------------
  # EFA Device Plugin Add-on
  #---------------------------------------
  enable_aws_efa_k8s_device_plugin = true
  # IMPORTANT: Enable EFA only on nodes with EFA devices attached.
  # Otherwise, you'll encounter the "No devices found..." error. Restart the pod
  # after attaching an EFA device, or use a node selector to prevent incompatible scheduling.
  enable_aws_efa_k8s_device_plugin = var.enable_aws_efa_k8s_device_plugin
  aws_efa_k8s_device_plugin_helm_config = {
    values = [file("${path.module}/helm-values/aws-efa-k8s-device-plugin-values.yaml")]
  }
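  # Pods consume EFA interfaces by requesting the "vpc.amazonaws.com/efa"
  # extended resource that this plugin advertises on EFA-capable nodes.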

  #---------------------------------------------------------------
  # Karpenter Resources Add-on
  #---------------------------------------------------------------
  enable_karpenter_resources = true
  karpenter_resources_helm_config = {
    g5-gpu-karpenter = {
      values = [
        <<-EOT
          name: g5-gpu-karpenter
          clusterName: ${module.eks.cluster_name}
          ec2NodeClass:
            karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
            subnetSelectorTerms:
              id: ${module.vpc.private_subnets[2]}
            securityGroupSelectorTerms:
              tags:
                Name: ${module.eks.cluster_name}-node
            instanceStorePolicy: RAID0
          nodePool:
            labels:
              - type: karpenter
              - NodeGroupType: g5-gpu-karpenter
            taints:
              - key: nvidia.com/gpu
                value: "Exists"
                effect: "NoSchedule"
            requirements:
              - key: "karpenter.k8s.aws/instance-family"
                operator: In
                values: ["g5"]
              - key: "karpenter.k8s.aws/instance-size"
                operator: In
                values: ["2xlarge", "4xlarge", "8xlarge"]
              - key: "kubernetes.io/arch"
                operator: In
                values: ["amd64"]
              - key: "karpenter.sh/capacity-type"
                operator: In
                values: ["spot", "on-demand"]
            limits:
              cpu: 1000
            disruption:
              consolidationPolicy: WhenEmpty
              consolidateAfter: 180s
              expireAfter: 720h
            weight: 100
        EOT
      ]
    }
    x86-cpu-karpenter = {
      values = [
        <<-EOT
          name: x86-cpu-karpenter
          clusterName: ${module.eks.cluster_name}
          ec2NodeClass:
            karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
            subnetSelectorTerms:
              id: ${module.vpc.private_subnets[3]}
            securityGroupSelectorTerms:
              tags:
                Name: ${module.eks.cluster_name}-node
            instanceStorePolicy: RAID0
          nodePool:
            labels:
              - type: karpenter
              - NodeGroupType: x86-cpu-karpenter
            requirements:
              - key: "karpenter.k8s.aws/instance-family"
                operator: In
                values: ["m5"]
              - key: "karpenter.k8s.aws/instance-size"
                operator: In
                values: ["xlarge", "2xlarge", "4xlarge", "8xlarge"]
              - key: "kubernetes.io/arch"
                operator: In
                values: ["amd64"]
              - key: "karpenter.sh/capacity-type"
                operator: In
                values: ["spot", "on-demand"]
            limits:
              cpu: 1000
            disruption:
              consolidationPolicy: WhenEmpty
              consolidateAfter: 180s
              expireAfter: 720h
            weight: 100
        EOT
      ]
    }
  }
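  # Workloads target these pools with a nodeSelector on the labels above
  # (e.g. NodeGroupType: g5-gpu-karpenter) plus, for the GPU pool, a
  # toleration for the nvidia.com/gpu taint.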

  depends_on = [
    kubernetes_secret_v1.huggingface_token,
    kubernetes_config_map_v1.notebook
@@ -0,0 +1,26 @@
# https://hub.docker.com/layers/rayproject/ray-ml/2.10.0-py310-gpu/images/sha256-4181ed53b0b25a758b155312ca6ab29a65cb78cd57296d42cfbe4806a2b77df4?context=explore
# docker buildx build --platform=linux/amd64 -t ray2.10.0-py310-gpu-stablediffusion:v1.0 -f Dockerfile .

# Use Ray base image
FROM rayproject/ray-ml:2.10.0-py310-gpu

# Maintainer label
LABEL maintainer="DoEKS"

# Set environment variables to non-interactive (this prevents some prompts)
ENV DEBIAN_FRONTEND=noninteractive

# Switch back to a non-root user for the subsequent commands
USER $USER

# Install Ray Serve and other Python packages with specific versions
RUN pip install --no-cache-dir requests torch "diffusers==0.12.1" "transformers==4.25.1"

# Set a working directory
WORKDIR /serve_app

# Copy your Ray Serve script into the container
COPY ray_serve_sd.py /serve_app/ray_serve_sd.py

# Set the PYTHONPATH environment variable
ENV PYTHONPATH=/serve_app:$PYTHONPATH
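The ray_serve_sd.py that the image copies in is not part of this commit's visible hunks. As a rough orientation only, a Ray Serve script satisfying the contract the Gradio client below relies on (GET /imagine?prompt=... returning PNG bytes) could look like this sketch; the checkpoint name and GPU sizing are assumptions, not the blueprint's pinned values.

```python
# Hypothetical sketch of ray_serve_sd.py; the real script ships with the blueprint.
from io import BytesIO

from fastapi import FastAPI
from fastapi.responses import Response
from ray import serve

app = FastAPI()

@serve.deployment(ray_actor_options={"num_gpus": 1})
@serve.ingress(app)
class StableDiffusion:
    def __init__(self):
        import torch
        from diffusers import StableDiffusionPipeline

        # Checkpoint name is an assumption; pin whatever the blueprint actually uses.
        self.pipe = StableDiffusionPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
        ).to("cuda")

    @app.get("/imagine")
    async def imagine(self, prompt: str) -> Response:
        # Run the diffusion pipeline and return the result as PNG bytes.
        image = self.pipe(prompt).images[0]
        buf = BytesIO()
        image.save(buf, format="PNG")
        return Response(content=buf.getvalue(), media_type="image/png")

entrypoint = StableDiffusion.bind()
```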
@@ -0,0 +1,13 @@
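# Example build command (the tag mirrors the image referenced in the Deployment
# manifest below and is illustrative):
# docker buildx build --platform=linux/amd64 -t gradio-app:sd-v1.0 -f Dockerfile .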
# Use Python base image
FROM --platform=linux/amd64 python:3.9-slim

# Set working directory in the container
WORKDIR /app

# Copy the Python script into the container
COPY gradio-app.py /app/gradio-app.py

RUN pip install --no-cache-dir gradio requests Pillow

# Command to run the Python script
ENTRYPOINT ["python", "gradio-app.py"]
@@ -0,0 +1,32 @@
import gradio as gr
import requests
import os
from PIL import Image
from io import BytesIO

# Constants for model endpoint and service name
model_endpoint = os.environ.get("MODEL_ENDPOINT", "/imagine")
service_name = os.environ.get("SERVICE_NAME", "http://localhost:8000")

# Function to generate an image for a given prompt
def generate_image(prompt):
    # Create the URL for the inference request
    url = f"{service_name}{model_endpoint}"

    try:
        # Send the request to the model service
        response = requests.get(url, params={"prompt": prompt}, timeout=180)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return Image.open(BytesIO(response.content))

    except requests.exceptions.RequestException:
        # On a request failure (e.g., connection error or timeout), return a
        # solid red placeholder image so the failure is visible in the UI.
        return Image.new('RGB', (100, 100), color='red')

# Define the Gradio interface and serve it on all interfaces
demo = gr.Interface(fn=generate_image,
                    inputs=[gr.Textbox(label="Enter the Prompt")],
                    outputs=gr.Image(type='pil'))
demo.launch(server_name="0.0.0.0")
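For a quick smoke test outside the UI, the same endpoint can be exercised directly; this sketch assumes the defaults above and that the Stable Diffusion service is reachable on localhost:8000 (for example via a port-forward).

```python
import requests

# Query the Ray Serve endpoint the Gradio app wraps and save the PNG it returns.
resp = requests.get("http://localhost:8000/imagine",
                    params={"prompt": "a watercolor fox"}, timeout=180)
resp.raise_for_status()
with open("out.png", "wb") as f:
    f.write(resp.content)
```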
@@ -0,0 +1,58 @@
apiVersion: v1
kind: Namespace
metadata:
  name: gradio
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gradio-deployment
  namespace: gradio
  labels:
    app: gradio
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gradio
  template:
    metadata:
      labels:
        app: gradio
    spec:
      containers:
        - name: gradio
          # Update this image to the Gradio app image you want to deploy
          image: public.ecr.aws/data-on-eks/gradio-app:sd-v1.0
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 7860
          resources:
            requests:
              cpu: "512m"
              memory: "2048Mi"
            limits:
              cpu: "1"
              memory: "4096Mi"
          env:
            - name: MODEL_ENDPOINT
              value: "/imagine"
            # Please note that the service name is currently hardcoded to match the Stable Diffusion
            # service for this blueprint. If there are any updates or changes to the actual RayServe
            # deployment, you'll need to update the service name here accordingly.
            - name: SERVICE_NAME
              value: "http://stablediffusion-service.stablediffusion.svc.cluster.local:8000"
---
apiVersion: v1
kind: Service
metadata:
  name: gradio-service
  namespace: gradio
spec:
  selector:
    app: gradio
  ports:
    - name: http
      protocol: TCP
      port: 7860
      targetPort: 7860
  type: ClusterIP
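Once the Stable Diffusion RayServe service is running and this manifest is applied, the UI can be reached locally with `kubectl port-forward -n gradio svc/gradio-service 7860:7860` and then browsing to http://localhost:7860 (names assume the defaults above).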