awslabs · vara-bonthu · Jul 11, 2024 · Jun 30, 2024 · Jun 30, 2024 · Jun 30, 2024
diff --git a/ai-ml/nvidia-triton-server/.gitignore b/ai-ml/nvidia-triton-server/.gitignore
@@ -0,0 +1,2 @@
+nim-llm/
+planfile
diff --git a/ai-ml/nvidia-triton-server/README.md b/ai-ml/nvidia-triton-server/README.md
@@ -19,6 +19,7 @@
 |------|---------|
 | <a name="provider_aws"></a> [aws](#provider\_aws) | >= 3.72 |
 | <a name="provider_aws.ecr"></a> [aws.ecr](#provider\_aws.ecr) | >= 3.72 |
+| <a name="provider_helm"></a> [helm](#provider\_helm) | >= 2.4.1 |
 | <a name="provider_kubernetes"></a> [kubernetes](#provider\_kubernetes) | >= 2.10 |
 | <a name="provider_null"></a> [null](#provider\_null) | >= 3.1 |
 | <a name="provider_random"></a> [random](#provider\_random) | >= 3.1 |
@@ -29,6 +30,7 @@
 |------|--------|---------|
 | <a name="module_data_addons"></a> [data\_addons](#module\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.32.0 |
 | <a name="module_ebs_csi_driver_irsa"></a> [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.20 |
+| <a name="module_efs"></a> [efs](#module\_efs) | terraform-aws-modules/efs/aws | ~> 1.6 |
 | <a name="module_eks"></a> [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 |
 | <a name="module_eks_blueprints_addons"></a> [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.2 |
 | <a name="module_s3_bucket"></a> [s3\_bucket](#module\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | 4.1.2 |
@@ -43,12 +45,17 @@
 | [aws_iam_policy.triton](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource |
 | [aws_secretsmanager_secret.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret) | resource |
 | [aws_secretsmanager_secret_version.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret_version) | resource |
+| [helm_release.nim_llm](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
 | [kubernetes_annotations.disable_gp2](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/annotations) | resource |
+| [kubernetes_namespace.nim](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace) | resource |
 | [kubernetes_namespace_v1.triton](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource |
+| [kubernetes_persistent_volume_claim_v1.efs_pvc](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/persistent_volume_claim_v1) | resource |
 | [kubernetes_secret_v1.huggingface_token](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource |
 | [kubernetes_secret_v1.triton](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource |
 | [kubernetes_service_account_v1.triton](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account_v1) | resource |
 | [kubernetes_storage_class.default_gp3](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/storage_class) | resource |
+| [kubernetes_storage_class_v1.efs](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/storage_class_v1) | resource |
+| [null_resource.download_nim_deploy](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource |
 | [null_resource.sync_local_to_s3](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource |
 | [random_password.grafana](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource |
 | [aws_availability_zones.available](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/availability_zones) | data source |
@@ -65,8 +72,11 @@
 | Name | Description | Type | Default | Required |
 |------|-------------|------|---------|:--------:|
 | <a name="input_eks_cluster_version"></a> [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.30"` | no |
+| <a name="input_enable_nvidia_nim"></a> [enable\_nvidia\_nim](#input\_enable\_nvidia\_nim) | Toggle to enable or disable NVIDIA NIM pattern resource creation | `bool` | `true` | no |
+| <a name="input_enable_nvidia_triton_server"></a> [enable\_nvidia\_triton\_server](#input\_enable\_nvidia\_triton\_server) | Toggle to enable or disable NVIDIA Triton server resource creation | `bool` | `true` | no |
 | <a name="input_huggingface_token"></a> [huggingface\_token](#input\_huggingface\_token) | Hugging Face Secret Token | `string` | `"DUMMY_TOKEN_REPLACE_ME"` | no |
 | <a name="input_name"></a> [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"nvidia-triton-server"` | no |
+| <a name="input_ngc_api_key"></a> [ngc\_api\_key](#input\_ngc\_api\_key) | NGC API Key | `string` | `"DUMMY_NGC_API_KEY_REPLACE_ME"` | no |
 | <a name="input_region"></a> [region](#input\_region) | region | `string` | `"us-west-2"` | no |
 | <a name="input_secondary_cidr_blocks"></a> [secondary\_cidr\_blocks](#input\_secondary\_cidr\_blocks) | Secondary CIDR blocks to be attached to VPC | `list(string)` | <pre>[<br>  "100.64.0.0/16"<br>]</pre> | no |
 | <a name="input_vpc_cidr"></a> [vpc\_cidr](#input\_vpc\_cidr) | VPC CIDR. This should be a valid private (RFC 1918) CIDR range | `string` | `"10.1.0.0/21"` | no |

diff --git a/ai-ml/nvidia-triton-server/addons.tf b/ai-ml/nvidia-triton-server/addons.tf
@@ -78,6 +78,11 @@ module "eks_blueprints_addons" {
     vpc-cni = {}
   }
 
+  #---------------------------------------
+  # AWS EFS CSI Add-on
+  #---------------------------------------
+  enable_aws_efs_csi_driver = true
+
   #---------------------------------------
   # AWS Load Balancer Controller Add-on
   #---------------------------------------
@@ -140,6 +145,20 @@ module "eks_blueprints_addons" {
     ],
   }
 
+  helm_releases = {
+    "prometheus-adapter" = {
+      repository = "https://prometheus-community.github.io/helm-charts"
+      chart      = "prometheus-adapter"
+      namespace  = module.eks_blueprints_addons.kube_prometheus_stack.namespace
+      version    = "4.10.0"
+      values = [
+        templatefile(
+          "${path.module}/helm-values/prometheus-adapter.yaml", {}
+        )
+      ]
+    }
+  }
+
 }
 
 #---------------------------------------------------------------

diff --git a/ai-ml/nvidia-triton-server/helm-values/kube-prometheus.yaml b/ai-ml/nvidia-triton-server/helm-values/kube-prometheus.yaml
@@ -21,28 +21,20 @@ alertmanager:
 
 grafana:
   enabled: true
+  dashboardProviders:
+    dashboardproviders.yaml:
+      apiVersion: 1
+      providers:
+      - name: 'default'
+        orgId: 1
+        folder: ''
+        type: file
+        disableDeletion: false
+        editable: true
+        options:
+          path: /var/lib/grafana/dashboards/default
   defaultDashboardsEnabled: true
-prometheus:
-  prometheusSpec:
-    retention: 5h
-    scrapeInterval: 30s
-    evaluationInterval: 30s
-    scrapeTimeout: 10s
-    serviceMonitorSelectorNilUsesHelmValues: false # This is required to use the serviceMonitorSelector
-    storageSpec:
-      volumeClaimTemplate:
-        metadata:
-          name: data
-        spec:
-          storageClassName: ${storage_class_type}
-          accessModes:
-          - ReadWriteOnce
-          resources:
-            requests:
-              storage: 50Gi
-alertmanager:
-  enabled: false
-
-grafana:
-  enabled: true
-  defaultDashboardsEnabled: true
+  dashboards:
+    default:
+      nim-llm-monitoring:
+        url: https://raw.githubusercontent.com/hustshawn/nim-llm-eks/main/monitoring/nim-llm-monitoring.json
diff --git a/ai-ml/nvidia-triton-server/helm-values/nim-llm.yaml b/ai-ml/nvidia-triton-server/helm-values/nim-llm.yaml
@@ -0,0 +1,53 @@
+# ref: https://github.com/NVIDIA/nim-deploy/blob/main/helm/nim-llm/values.yaml
+# This file is rendered by Terraform templatefile(); ngc_api_key and pvc_name
+# are substituted before the values are handed to Helm.
+image:
+  repository: nvcr.io/nim/meta/llama3-8b-instruct
+  # NOTE(review): "latest" is a floating tag; consider pinning a release tag.
+  tag: latest
+model:
+  # Quoted so the substituted key is always parsed as a YAML string, even if it
+  # contains characters that are significant to YAML.
+  # NOTE(review): the key is rendered in plain text into the release values.
+  ngcAPIKey: "${ngc_api_key}"
+  nimCache: /model-store
+# One GPU per pod.
+resources:
+  limits:
+    nvidia.com/gpu: 1
+  requests:
+    nvidia.com/gpu: 1
+statefulSet:
+  enabled: true
+# Use the claim created by Terraform instead of letting the chart create one.
+persistence:
+  enabled: true
+  existingClaim: ${pvc_name}
+# Presumably these labels match the GPU node pool defined elsewhere in this
+# blueprint; verify against the Karpenter configuration.
+nodeSelector:
+  NodeGroupType: g5-gpu-karpenter
+  type: karpenter
+tolerations:
+- key: "nvidia.com/gpu"
+  operator: "Exists"
+  effect: "NoSchedule"
+metrics:
+  enabled: true
+  serviceMonitor:
+    enabled: true
+    additionalLabels:
+      release: prometheus
+      app: prometheus
+# Horizontal autoscaling driven by the per-pod num_requests_running metric.
+autoscaling:
+  enabled: true
+  minReplicas: 1
+  maxReplicas: 5
+  scaleDownStabilizationSecs: 300
+  metrics:
+  - type: Pods
+    pods:
+      metric:
+        name: num_requests_running
+      target:
+        # Pods metrics are averaged across pods, so the target type is
+        # AverageValue to agree with the averageValue field below.
+        type: AverageValue
+        averageValue: 5
+# Ingress for the service, using the "nginx" ingress class.
+ingress:
+  enabled: true
+  className: nginx
+  annotations: {}
+  hosts:
+    # Single host entry with one catch-all path.
+    - paths:
+        - path: /
+          pathType: ImplementationSpecific
+          serviceType: openai
diff --git a/ai-ml/nvidia-triton-server/helm-values/prometheus-adapter.yaml b/ai-ml/nvidia-triton-server/helm-values/prometheus-adapter.yaml
@@ -0,0 +1,14 @@
+# ref: https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-adapter/values.yaml
+# Prometheus endpoint the adapter queries.
+prometheus:
+  url: "http://kube-prometheus-stack-prometheus.kube-prometheus-stack"
+  port: 9090
+rules:
+  # Default rule set is turned off; only the custom rule below is configured.
+  default: false
+  custom:
+    # Custom-metric rule for the num_requests_running series.
+    - seriesQuery: '{__name__=~"num_requests_running"}'
+      resources:
+        template: "<<.Resource>>"
+      name:
+        matches: "num_requests_running"
+        as: ""
+      metricsQuery: "sum(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
diff --git a/ai-ml/nvidia-triton-server/nvidia-nim.tf b/ai-ml/nvidia-triton-server/nvidia-nim.tf
@@ -0,0 +1,126 @@
+#---------------------------------------------------------------
+# EFS
+# File system behind the "efs" storage class; created only when
+# var.enable_nvidia_nim is true.
+#---------------------------------------------------------------
+module "efs" {
+  count   = var.enable_nvidia_nim ? 1 : 0
+  source  = "terraform-aws-modules/efs/aws"
+  version = "~> 1.6"
+
+  creation_token = local.name
+  name           = local.name
+
+  # Mount targets / security group
+  # One mount target per AZ, placed in the trailing private subnets. The slice
+  # width follows length(local.azs) so zipmap always receives exactly as many
+  # subnets as it has AZ keys (a fixed width only works for that many AZs).
+  mount_targets = {
+    for az, subnet in zipmap(
+      local.azs,
+      slice(
+        module.vpc.private_subnets,
+        length(module.vpc.private_subnets) - length(local.azs),
+        length(module.vpc.private_subnets)
+      )
+    ) : az => { subnet_id = subnet }
+  }
+  security_group_description = "${local.name} EFS security group"
+  security_group_vpc_id      = module.vpc.vpc_id
+  security_group_rules = {
+    vpc = {
+      # relying on the defaults provided for EFS/NFS (2049/TCP + ingress)
+      description = "NFS ingress from VPC private subnets"
+      cidr_blocks = module.vpc.private_subnets_cidr_blocks
+    }
+  }
+
+  tags = local.tags
+}
+
+# Storage class named "efs", served by the EFS CSI provisioner and bound to the
+# file system from module.efs. Created only when var.enable_nvidia_nim is true.
+resource "kubernetes_storage_class_v1" "efs" {
+  count = var.enable_nvidia_nim ? 1 : 0
+
+  # The EFS CSI driver add-on has to be in place before this class is created.
+  depends_on = [module.eks_blueprints_addons.aws_efs_csi_driver]
+
+  storage_provisioner = "efs.csi.aws.com"
+  mount_options       = ["iam"]
+
+  metadata {
+    name = "efs"
+  }
+
+  parameters = {
+    # count is 0 or 1, so the only possible index is 0.
+    fileSystemId     = module.efs[0].id
+    provisioningMode = "efs-ap" # Dynamic provisioning
+    directoryPerms   = "777"
+  }
+}
+
+# "nim" namespace; created only when var.enable_nvidia_nim is true and only
+# after the EKS module has been applied.
+resource "kubernetes_namespace" "nim" {
+  count      = var.enable_nvidia_nim ? 1 : 0
+  depends_on = [module.eks]
+
+  metadata {
+    name = "nim"
+  }
+}
+
+# ReadWriteMany claim on the "efs" storage class, passed to the nim-llm chart as
+# its existing claim. Created only when var.enable_nvidia_nim is true.
+#
+# The claim name is the literal "nim" (the same value it previously resolved
+# to), while the namespace now references the namespace resource so the claim
+# always lands in, and is ordered after, that namespace.
+resource "kubernetes_persistent_volume_claim_v1" "efs_pvc" {
+  count = var.enable_nvidia_nim ? 1 : 0
+  metadata {
+    name      = "nim"
+    namespace = kubernetes_namespace.nim[count.index].metadata[0].name
+  }
+  spec {
+    access_modes       = ["ReadWriteMany"]
+    storage_class_name = kubernetes_storage_class_v1.efs[count.index].metadata[0].name
+    resources {
+      requests = {
+        storage = "100Gi"
+      }
+    }
+  }
+}
+
+#---------------------------------------------------------------
+# NIM LLM Helm Chart
+#---------------------------------------------------------------
+
+# Fetches the nim-llm Helm chart from the NVIDIA nim-deploy repository into
+# <module>/nim-llm so helm_release.nim_llm can install it from a local path.
+# Requires git on the machine running Terraform.
+resource "null_resource" "download_nim_deploy" {
+  count = var.enable_nvidia_nim ? 1 : 0
+  # Evaluated at plan time: "true" while Chart.yaml is absent, "false" once it
+  # exists. Any change of this value re-creates the resource and re-runs the
+  # script, which itself skips the download when the chart is already present.
+  triggers = {
+    script_executed = fileexists("${path.module}/nim-llm/Chart.yaml") ? "false" : "true"
+  }
+
+  provisioner "local-exec" {
+    command = <<-EOT
+      # Abort on the first failing command so a failed clone/copy fails the apply.
+      set -e
+      CHART_DIR="${path.module}/nim-llm"
+      # Same condition as the trigger above: the chart counts as present only
+      # when Chart.yaml exists, so a partial directory is repaired.
+      if [ ! -f "$CHART_DIR/Chart.yaml" ]; then
+        echo "Downloading nim-deploy repo ..."
+        TEMP_DIR=$(mktemp -d)
+        # Remove the temporary clone on every exit path, including failures.
+        trap 'rm -rf "$TEMP_DIR"' EXIT
+        git clone --depth 1 https://github.com/NVIDIA/nim-deploy.git "$TEMP_DIR/nim-deploy"
+        # Drop any incomplete copy so cp does not nest the chart inside it.
+        rm -rf "$CHART_DIR"
+        cp -r "$TEMP_DIR/nim-deploy/helm/nim-llm" "$CHART_DIR"
+        echo "Download completed."
+      else
+        echo "nim-llm chart already exists. Skipping download."
+      fi
+    EOT
+  }
+}
+
+
+# Installs the nim-llm chart from the local copy under <module>/nim-llm.
+# Created only when var.enable_nvidia_nim is true.
+resource "helm_release" "nim_llm" {
+  count = var.enable_nvidia_nim ? 1 : 0
+
+  # The chart directory is produced by the download step, so order after it.
+  depends_on = [null_resource.download_nim_deploy]
+
+  name  = "nim-llm"
+  chart = "${path.module}/nim-llm"
+
+  # count is 0 or 1, so index 0 is the only instance of the namespace resource.
+  # That namespace is already managed by kubernetes_namespace.nim.
+  namespace        = kubernetes_namespace.nim[0].metadata[0].name
+  create_namespace = true
+
+  # Do not wait for the release's resources to become ready.
+  wait    = false
+  timeout = 360
+
+  # Chart values rendered from the template with the NGC key and the claim name.
+  values = [
+    templatefile("${path.module}/helm-values/nim-llm.yaml", {
+      ngc_api_key = var.ngc_api_key
+      pvc_name    = kubernetes_persistent_volume_claim_v1.efs_pvc[0].metadata[0].name
+    })
+  ]
+}