feat: NVIDIA NIM on EKS Pattern #565

Merged Jul 11, 2024 · 17 commits (changes shown from 10 commits)
2 changes: 2 additions & 0 deletions ai-ml/nvidia-triton-server/.gitignore
@@ -0,0 +1,2 @@
nim-llm/
planfile
10 changes: 10 additions & 0 deletions ai-ml/nvidia-triton-server/README.md
@@ -19,6 +19,7 @@
|------|---------|
| <a name="provider_aws"></a> [aws](#provider\_aws) | >= 3.72 |
| <a name="provider_aws.ecr"></a> [aws.ecr](#provider\_aws.ecr) | >= 3.72 |
| <a name="provider_helm"></a> [helm](#provider\_helm) | >= 2.4.1 |
| <a name="provider_kubernetes"></a> [kubernetes](#provider\_kubernetes) | >= 2.10 |
| <a name="provider_null"></a> [null](#provider\_null) | >= 3.1 |
| <a name="provider_random"></a> [random](#provider\_random) | >= 3.1 |
@@ -29,6 +30,7 @@
|------|--------|---------|
| <a name="module_data_addons"></a> [data\_addons](#module\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.32.0 |
| <a name="module_ebs_csi_driver_irsa"></a> [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.20 |
| <a name="module_efs"></a> [efs](#module\_efs) | terraform-aws-modules/efs/aws | ~> 1.6 |
| <a name="module_eks"></a> [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 |
| <a name="module_eks_blueprints_addons"></a> [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.2 |
| <a name="module_s3_bucket"></a> [s3\_bucket](#module\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | 4.1.2 |
@@ -43,12 +45,17 @@
| [aws_iam_policy.triton](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource |
| [aws_secretsmanager_secret.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret) | resource |
| [aws_secretsmanager_secret_version.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret_version) | resource |
| [helm_release.nim_llm](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [kubernetes_annotations.disable_gp2](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/annotations) | resource |
| [kubernetes_namespace.nim](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace) | resource |
| [kubernetes_namespace_v1.triton](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource |
| [kubernetes_persistent_volume_claim_v1.efs_pvc](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/persistent_volume_claim_v1) | resource |
| [kubernetes_secret_v1.huggingface_token](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource |
| [kubernetes_secret_v1.triton](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource |
| [kubernetes_service_account_v1.triton](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account_v1) | resource |
| [kubernetes_storage_class.default_gp3](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/storage_class) | resource |
| [kubernetes_storage_class_v1.efs](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/storage_class_v1) | resource |
| [null_resource.download_nim_deploy](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource |
| [null_resource.sync_local_to_s3](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource |
| [random_password.grafana](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource |
| [aws_availability_zones.available](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/availability_zones) | data source |
@@ -65,8 +72,11 @@
| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_eks_cluster_version"></a> [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.30"` | no |
| <a name="input_enable_nvidia_nim"></a> [enable\_nvidia\_nim](#input\_enable\_nvidia\_nim) | Toggle to enable or disable NVIDIA NIM pattern resource creation | `bool` | `true` | no |
| <a name="input_enable_nvidia_triton_server"></a> [enable\_nvidia\_triton\_server](#input\_enable\_nvidia\_triton\_server) | Toggle to enable or disable NVIDIA Triton server resource creation | `bool` | `true` | no |
| <a name="input_huggingface_token"></a> [huggingface\_token](#input\_huggingface\_token) | Hugging Face Secret Token | `string` | `"DUMMY_TOKEN_REPLACE_ME"` | no |
| <a name="input_name"></a> [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"nvidia-triton-server"` | no |
| <a name="input_ngc_api_key"></a> [ngc\_api\_key](#input\_ngc\_api\_key) | NGC API Key | `string` | `"DUMMY_NGC_API_KEY_REPLACE_ME"` | no |
| <a name="input_region"></a> [region](#input\_region) | region | `string` | `"us-west-2"` | no |
| <a name="input_secondary_cidr_blocks"></a> [secondary\_cidr\_blocks](#input\_secondary\_cidr\_blocks) | Secondary CIDR blocks to be attached to VPC | `list(string)` | <pre>[<br> "100.64.0.0/16"<br>]</pre> | no |
| <a name="input_vpc_cidr"></a> [vpc\_cidr](#input\_vpc\_cidr) | VPC CIDR. This should be a valid private (RFC 1918) CIDR range | `string` | `"10.1.0.0/21"` | no |
19 changes: 19 additions & 0 deletions ai-ml/nvidia-triton-server/addons.tf
@@ -78,6 +78,11 @@ module "eks_blueprints_addons" {
vpc-cni = {}
}

#---------------------------------------
# AWS EFS CSI Add-on
#---------------------------------------
enable_aws_efs_csi_driver = true

#---------------------------------------
# AWS Load Balancer Controller Add-on
#---------------------------------------
@@ -140,6 +145,20 @@
],
}

helm_releases = {
"prometheus-adapter" = {
repository = "https://prometheus-community.github.io/helm-charts"
chart = "prometheus-adapter"
namespace = module.eks_blueprints_addons.kube_prometheus_stack.namespace
version = "4.10.0"
values = [
templatefile(
"${path.module}/helm-values/prometheus-adapter.yaml", {}
)
]
}
}

}

#---------------------------------------------------------------
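
Once the `prometheus-adapter` release above is installed, the custom metrics API should be registered against the in-cluster Prometheus. A quick sanity check, assuming kubectl points at the cluster (the APIService name and pod label below are the adapter's standard defaults, not values set in this pattern):

```bash
# Sketch: confirm the prometheus-adapter registered the custom metrics API.
# v1beta1.custom.metrics.k8s.io is the adapter's conventional APIService name (assumption).
kubectl get apiservice v1beta1.custom.metrics.k8s.io
kubectl get pods -n kube-prometheus-stack -l app.kubernetes.io/name=prometheus-adapter
```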
40 changes: 16 additions & 24 deletions ai-ml/nvidia-triton-server/helm-values/kube-prometheus.yaml
@@ -21,28 +21,20 @@ alertmanager:

grafana:
enabled: true
dashboardProviders:
dashboardproviders.yaml:
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: ''
type: file
disableDeletion: false
editable: true
options:
path: /var/lib/grafana/dashboards/default
defaultDashboardsEnabled: true
prometheus:
prometheusSpec:
retention: 5h
scrapeInterval: 30s
evaluationInterval: 30s
scrapeTimeout: 10s
serviceMonitorSelectorNilUsesHelmValues: false # required so Prometheus also selects ServiceMonitors created outside this Helm release (e.g. the NIM chart's ServiceMonitor)
storageSpec:
volumeClaimTemplate:
metadata:
name: data
spec:
storageClassName: ${storage_class_type}
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 50Gi
alertmanager:
enabled: false

grafana:
enabled: true
defaultDashboardsEnabled: true
dashboards:
default:
nim-llm-monitoring:
url: https://raw.githubusercontent.com/hustshawn/nim-llm-eks/main/monitoring/nim-llm-monitoring.json
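
The `nim-llm-monitoring` dashboard referenced above is loaded into Grafana by the kube-prometheus-stack chart. One way to reach it locally, assuming the stack runs in the `kube-prometheus-stack` namespace with the default service name; the Grafana admin password is stored in the AWS Secrets Manager secret this pattern creates, and `<grafana-secret-name>` is a placeholder:

```bash
# Sketch: open Grafana locally to view the NIM dashboard.
# Service/namespace names follow kube-prometheus-stack defaults (assumption);
# replace <grafana-secret-name> with the secret created by this pattern.
aws secretsmanager get-secret-value --secret-id <grafana-secret-name> \
  --query SecretString --output text
kubectl port-forward -n kube-prometheus-stack svc/kube-prometheus-stack-grafana 3000:80
# Browse to http://localhost:3000 and log in as admin with the retrieved password.
```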
53 changes: 53 additions & 0 deletions ai-ml/nvidia-triton-server/helm-values/nim-llm.yaml
@@ -0,0 +1,53 @@
# ref: https://github.com/NVIDIA/nim-deploy/blob/main/helm/nim-llm/values.yaml
image:
repository: nvcr.io/nim/meta/llama3-8b-instruct
tag: latest
model:
ngcAPIKey: ${ngc_api_key}
nimCache: /model-store
resources:
limits:
nvidia.com/gpu: 1
requests:
nvidia.com/gpu: 1
statefulSet:
enabled: true
persistence:
enabled: true
existingClaim: ${pvc_name}
nodeSelector:
NodeGroupType: g5-gpu-karpenter
type: karpenter
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"
metrics:
enabled: true
serviceMonitor:
enabled: true
additionalLabels:
release: prometheus
app: prometheus
autoscaling:
enabled: true
minReplicas: 1
maxReplicas: 5
scaleDownStabilizationSecs: 300
metrics:
- type: Pods
pods:
metric:
name: num_requests_running
target:
type: Value
averageValue: 5
ingress:
enabled: true
className: nginx
annotations: {}
hosts:
- paths:
- path: /
pathType: ImplementationSpecific
serviceType: openai
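
With `serviceType: openai`, the chart exposes an OpenAI-compatible API for the `meta/llama3-8b-instruct` model. A minimal smoke test, assuming the release is named `nim-llm` in the `nim` namespace and the service listens on port 8000 (service name and port are assumed chart defaults, not values set in this file):

```bash
# Sketch: exercise the OpenAI-compatible endpoint exposed by the NIM service.
# Service name/port are assumed chart defaults; adjust to what `kubectl get svc -n nim` reports.
kubectl port-forward -n nim svc/nim-llm 8000:8000 &
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "meta/llama3-8b-instruct",
        "messages": [{"role": "user", "content": "Hello, what can you do?"}],
        "max_tokens": 64
      }'
```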
14 changes: 14 additions & 0 deletions ai-ml/nvidia-triton-server/helm-values/prometheus-adapter.yaml
@@ -0,0 +1,14 @@
# ref: https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-adapter/values.yaml
prometheus:
url: http://kube-prometheus-stack-prometheus.kube-prometheus-stack
port: 9090
rules:
default: false
custom:
- seriesQuery: '{__name__=~"num_requests_running"}'
resources:
template: <<.Resource>>
name:
matches: "num_requests_running"
as: ""
metricsQuery: sum(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)
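
This rule surfaces the raw `num_requests_running` series scraped from the NIM pods as a pod-scoped custom metric, which is what the `autoscaling` block in nim-llm.yaml above scales on. A hedged check that the metric is actually being served for pods in the `nim` namespace (the URL path follows the standard custom.metrics.k8s.io layout):

```bash
# Sketch: query the custom metrics API for the HPA metric defined above.
kubectl get --raw \
  "/apis/custom.metrics.k8s.io/v1beta1/namespaces/nim/pods/*/num_requests_running" | jq .
```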
126 changes: 126 additions & 0 deletions ai-ml/nvidia-triton-server/nvidia-nim.tf
@@ -0,0 +1,126 @@
#---------------------------------------------------------------
# EFS
#---------------------------------------------------------------
module "efs" {
count = var.enable_nvidia_nim ? 1 : 0
source = "terraform-aws-modules/efs/aws"
version = "~> 1.6"

creation_token = local.name
name = local.name

# Mount targets / security group
mount_targets = {
for k, v in zipmap(local.azs, slice(module.vpc.private_subnets, length(module.vpc.private_subnets) - 2, length(module.vpc.private_subnets))) : k => { subnet_id = v }
}
security_group_description = "${local.name} EFS security group"
security_group_vpc_id = module.vpc.vpc_id
security_group_rules = {
vpc = {
# relying on the defaults provided for EFS/NFS (2049/TCP + ingress)
description = "NFS ingress from VPC private subnets"
cidr_blocks = module.vpc.private_subnets_cidr_blocks
}
}

tags = local.tags
}

resource "kubernetes_storage_class_v1" "efs" {
count = var.enable_nvidia_nim ? 1 : 0
metadata {
name = "efs"
}

storage_provisioner = "efs.csi.aws.com"
parameters = {
provisioningMode = "efs-ap" # Dynamic provisioning
fileSystemId = module.efs[count.index].id
directoryPerms = "777"
}

mount_options = [
"iam"
]

depends_on = [
module.eks_blueprints_addons.aws_efs_csi_driver
]
}

resource "kubernetes_namespace" "nim" {
count = var.enable_nvidia_nim ? 1 : 0
metadata {
name = "nim"
}

depends_on = [module.eks]
}

resource "kubernetes_persistent_volume_claim_v1" "efs_pvc" {
count = var.enable_nvidia_nim ? 1 : 0
metadata {
name = kubernetes_namespace.nim[count.index].metadata[0].name
namespace = "nim"
}
spec {
access_modes = ["ReadWriteMany"]
storage_class_name = kubernetes_storage_class_v1.efs[count.index].metadata[0].name
resources {
requests = {
storage = "100Gi"
}
}
}
}
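
After apply, the `efs` storage class and the 100Gi `ReadWriteMany` claim defined above should both be visible; a quick check (resource names match the Terraform above):

```bash
# Sketch: verify the EFS storage class and the model-store PVC created above.
kubectl get storageclass efs
kubectl get pvc -n nim   # the claim is named "nim"; it should report Bound once the CSI driver provisions an access point
```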

#---------------------------------------------------------------
# NIM LLM Helm Chart
#---------------------------------------------------------------

resource "null_resource" "download_nim_deploy" {
count = var.enable_nvidia_nim ? 1 : 0
# This trigger ensures the script runs only when the file doesn't exist
triggers = {
script_executed = fileexists("${path.module}/nim-llm/Chart.yaml") ? "false" : "true"
}

provisioner "local-exec" {
command = <<-EOT
if [ ! -d "${path.module}/nim-llm" ]; then
echo "Downloading nim-deploy repo ..."
TEMP_DIR=$(mktemp -d)
git clone https://github.com/NVIDIA/nim-deploy.git "$TEMP_DIR/nim-deploy"
cp -r "$TEMP_DIR/nim-deploy/helm/nim-llm" ${path.module}/nim-llm
rm -rf "$TEMP_DIR"
echo "Download completed."
else
echo "nim-llm directory already exists. Skipping download."
fi
EOT
}
}


resource "helm_release" "nim_llm" {
count = var.enable_nvidia_nim ? 1 : 0
name = "nim-llm"
chart = "${path.module}/nim-llm"
create_namespace = true
namespace = kubernetes_namespace.nim[count.index].metadata[0].name
timeout = 360
wait = false
values = [
templatefile(
"${path.module}/helm-values/nim-llm.yaml",
{
ngc_api_key = var.ngc_api_key
pvc_name = kubernetes_persistent_volume_claim_v1.efs_pvc[count.index].metadata[0].name
}
)
]

depends_on = [
null_resource.download_nim_deploy
]
}
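
Because the release is installed with `wait = false`, `terraform apply` returns before the model image is pulled and the NIM cache on EFS is populated, which can take a while on first start. A way to follow the rollout and confirm autoscaling is wired up (the `nim-llm` StatefulSet name is an assumption based on the release name and the chart's defaults):

```bash
# Sketch: watch the NIM rollout and the HPA after terraform apply finishes.
kubectl get pods -n nim -w                  # wait for the nim-llm pod(s) to become Ready
kubectl logs -n nim statefulset/nim-llm -f  # first start downloads the model into the EFS-backed cache
kubectl get hpa -n nim                      # should target num_requests_running with min 1 / max 5 replicas
```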