diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 48d04253a..aba839fcc 100755
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v4.4.0
+ rev: v4.5.0
hooks:
- id: trailing-whitespace
args: ['--markdown-linebreak-ext=md']
@@ -10,7 +10,7 @@ repos:
- id: detect-aws-credentials
args: ['--allow-missing-credentials']
- repo: https://github.com/antonbabenko/pre-commit-terraform
- rev: v1.81.0
+ rev: v1.83.5
hooks:
- id: terraform_fmt
- id: terraform_docs
diff --git a/README.md b/README.md
index 34e72367b..e4b06c80b 100755
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
![Data on EKS](website/static/img/doeks-logo-green.png)
# [Data on Amazon EKS (DoEKS)](https://awslabs.github.io/data-on-eks/)
+(pronounced "Do.eks")
+
[![plan-examples](https://github.com/awslabs/data-on-eks/actions/workflows/plan-examples.yml/badge.svg?branch=main)](https://github.com/awslabs/data-on-eks/actions/workflows/plan-examples.yml)
diff --git a/ai-ml/emr-spark-rapids/eks.tf b/ai-ml/emr-spark-rapids/eks.tf
index 02c60f12d..e1cf92e39 100644
--- a/ai-ml/emr-spark-rapids/eks.tf
+++ b/ai-ml/emr-spark-rapids/eks.tf
@@ -9,6 +9,7 @@ module "eks" {
cluster_name = local.name
cluster_version = var.eks_cluster_version
+ #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing.
cluster_endpoint_public_access = true # if true, Your cluster API server is accessible from the internet. You can, optionally, limit the CIDR blocks that can access the public endpoint.
vpc_id = module.vpc.vpc_id
diff --git a/ai-ml/jark-stack/terraform/README.md b/ai-ml/jark-stack/terraform/README.md
new file mode 100644
index 000000000..e7567f85a
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/README.md
@@ -0,0 +1,62 @@
+# JupyterHub, Argo, Ray, Kubernetes
+
+Docs coming soon...
+
+
+## Requirements
+
+| Name | Version |
+|------|---------|
+| [terraform](#requirement\_terraform) | >= 1.0.0 |
+| [aws](#requirement\_aws) | >= 3.72 |
+| [helm](#requirement\_helm) | >= 2.4.1 |
+| [http](#requirement\_http) | >= 3.3 |
+| [kubectl](#requirement\_kubectl) | >= 1.14 |
+| [kubernetes](#requirement\_kubernetes) | >= 2.10 |
+| [random](#requirement\_random) | >= 3.1 |
+
+## Providers
+
+| Name | Version |
+|------|---------|
+| [aws](#provider\_aws) | >= 3.72 |
+| [kubernetes](#provider\_kubernetes) | >= 2.10 |
+
+## Modules
+
+| Name | Source | Version |
+|------|--------|---------|
+| [data\_addons](#module\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.1 |
+| [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.20 |
+| [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 |
+| [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.2 |
+| [vpc](#module\_vpc) | terraform-aws-modules/vpc/aws | ~> 5.0 |
+
+## Resources
+
+| Name | Type |
+|------|------|
+| [kubernetes_annotations.disable_gp2](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/annotations) | resource |
+| [kubernetes_config_map_v1.notebook](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map_v1) | resource |
+| [kubernetes_namespace_v1.jupyterhub](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource |
+| [kubernetes_secret_v1.huggingface_token](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource |
+| [kubernetes_storage_class.default_gp3](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/storage_class) | resource |
+| [aws_eks_cluster_auth.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster_auth) | data source |
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|------|---------|:--------:|
+| [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.27"` | no |
+| [huggingface\_token](#input\_huggingface\_token) | Hugging Face Secret Token | `string` | `"DUMMY_TOKEN_REPLACE_ME"` | no |
+| [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"jark-stack"` | no |
+| [region](#input\_region) | region | `string` | `"us-west-2"` | no |
+| [secondary\_cidr\_blocks](#input\_secondary\_cidr\_blocks) | Secondary CIDR blocks to be attached to VPC | `list(string)` | <pre>[<br>  "100.64.0.0/16"<br>]</pre> | no |
+| [vpc\_cidr](#input\_vpc\_cidr) | VPC CIDR. This should be a valid private (RFC 1918) CIDR range | `string` | `"10.1.0.0/21"` | no |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| [configure\_kubectl](#output\_configure\_kubectl) | Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig |
+
diff --git a/ai-ml/jark-stack/terraform/addons.tf b/ai-ml/jark-stack/terraform/addons.tf
new file mode 100644
index 000000000..3f2a91a2e
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/addons.tf
@@ -0,0 +1,186 @@
+#---------------------------------------------------------------
+# GP3 Encrypted Storage Class
+#---------------------------------------------------------------
+resource "kubernetes_annotations" "disable_gp2" {
+  annotations = {
+    "storageclass.kubernetes.io/is-default-class" : "false" # demote gp2 so it is no longer the default class
+  }
+  api_version = "storage.k8s.io/v1"
+  kind        = "StorageClass"
+  metadata {
+    name = "gp2"
+  }
+  force = true # manage this annotation even if another field manager already owns it
+
+  depends_on = [module.eks] # whole module: `eks_cluster_id` is not an output of terraform-aws-modules/eks v19
+}
+
+resource "kubernetes_storage_class" "default_gp3" {
+ metadata {
+ name = "gp3"
+ annotations = {
+ "storageclass.kubernetes.io/is-default-class" : "true" # gp3 becomes the default StorageClass
+ }
+ }
+
+ storage_provisioner = "ebs.csi.aws.com" # volumes are provisioned by the EBS CSI driver
+ reclaim_policy = "Delete"
+ allow_volume_expansion = true
+ volume_binding_mode = "WaitForFirstConsumer"
+ parameters = {
+ fsType = "ext4"
+ encrypted = true
+ type = "gp3"
+ }
+
+ depends_on = [kubernetes_annotations.disable_gp2] # gp2 is demoted first so only one class is marked default
+}
+
+#---------------------------------------------------------------
+# IRSA for EBS CSI Driver
+#---------------------------------------------------------------
+module "ebs_csi_driver_irsa" {
+ source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
+ version = "~> 5.20"
+ role_name_prefix = format("%s-%s-", local.name, "ebs-csi-driver")
+ attach_ebs_csi_policy = true
+ oidc_providers = {
+ main = {
+ provider_arn = module.eks.oidc_provider_arn
+ namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"]
+ }
+ }
+ tags = local.tags
+}
+
+#---------------------------------------------------------------
+# EKS Blueprints Addons
+#---------------------------------------------------------------
+module "eks_blueprints_addons" {
+ source = "aws-ia/eks-blueprints-addons/aws"
+ version = "~> 1.2"
+
+ cluster_name = module.eks.cluster_name
+ cluster_endpoint = module.eks.cluster_endpoint
+ cluster_version = module.eks.cluster_version
+ oidc_provider_arn = module.eks.oidc_provider_arn
+
+ #---------------------------------------
+ # Amazon EKS Managed Add-ons
+ #---------------------------------------
+ eks_addons = {
+ aws-ebs-csi-driver = {
+ service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn
+ }
+ coredns = {
+ preserve = true
+ }
+ kube-proxy = {
+ preserve = true
+ }
+ # VPC CNI uses worker node IAM role policies
+ vpc-cni = {
+ preserve = true
+ }
+ }
+
+ #---------------------------------------
+ # AWS Load Balancer Controller Add-on
+ #---------------------------------------
+ enable_aws_load_balancer_controller = true
+ # turn off the mutating webhook for services because we are using
+ # service.beta.kubernetes.io/aws-load-balancer-type: external
+ aws_load_balancer_controller = {
+ set = [{
+ name = "enableServiceMutatorWebhook"
+ value = "false"
+ }]
+ }
+
+ #---------------------------------------
+ # Ingress Nginx Add-on
+ #---------------------------------------
+ enable_ingress_nginx = true
+ ingress_nginx = {
+ values = [templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})]
+ }
+
+ helm_releases = {
+ #---------------------------------------
+ # NVIDIA Device Plugin Add-on
+ #---------------------------------------
+ nvidia-device-plugin = {
+ description = "A Helm chart for NVIDIA Device Plugin"
+ namespace = "nvidia-device-plugin"
+ create_namespace = true
+ chart = "nvidia-device-plugin"
+ chart_version = "0.14.0"
+ repository = "https://nvidia.github.io/k8s-device-plugin"
+ values = [file("${path.module}/helm-values/nvidia-values.yaml")]
+ }
+ }
+}
+
+#---------------------------------------------------------------
+# Data on EKS Kubernetes Addons
+#---------------------------------------------------------------
+module "data_addons" {
+ source = "aws-ia/eks-data-addons/aws"
+ version = "~> 1.1" # ensure to update this to the latest/desired version
+
+ oidc_provider_arn = module.eks.oidc_provider_arn
+
+ #---------------------------------------------------------------
+ # JupyterHub Add-on
+ #---------------------------------------------------------------
+ enable_jupyterhub = true
+ jupyterhub_helm_config = {
+ namespace = kubernetes_namespace_v1.jupyterhub.id
+ create_namespace = false
+ values = [file("${path.module}/helm-values/jupyterhub-values.yaml")]
+ }
+
+ #---------------------------------------------------------------
+ # KubeRay Operator Add-on
+ #---------------------------------------------------------------
+ enable_kuberay_operator = true
+
+ depends_on = [
+ kubernetes_secret_v1.huggingface_token,
+ kubernetes_config_map_v1.notebook
+ ]
+}
+
+
+#---------------------------------------------------------------
+# Additional Resources
+#---------------------------------------------------------------
+
+resource "kubernetes_namespace_v1" "jupyterhub" {
+ metadata {
+ name = "jupyterhub"
+ }
+}
+
+
+resource "kubernetes_secret_v1" "huggingface_token" {
+ metadata {
+ name = "hf-token"
+ namespace = kubernetes_namespace_v1.jupyterhub.id
+ }
+
+ data = {
+ token = var.huggingface_token
+ }
+}
+
+resource "kubernetes_config_map_v1" "notebook" {
+ metadata {
+ name = "notebook"
+ namespace = kubernetes_namespace_v1.jupyterhub.id
+ }
+
+ data = {
+ "dogbooth.ipynb" = file("${path.module}/src/notebook/dogbooth.ipynb")
+ }
+}
diff --git a/ai-ml/jark-stack/terraform/cleanup.sh b/ai-ml/jark-stack/terraform/cleanup.sh
new file mode 100755
index 000000000..797c2de67
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/cleanup.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+read -p "Enter the region: " region
+export AWS_DEFAULT_REGION=$region
+
+echo "Destroying RayService..."
+
+# Delete the Ingress/SVC before removing the addons
+TMPFILE=$(mktemp)
+terraform -chdir=$SCRIPTDIR output -raw configure_kubectl > "$TMPFILE"
+# check if TMPFILE contains the string "No outputs found"
+if [[ ! $(cat $TMPFILE) == *"No outputs found"* ]]; then
+ echo "No outputs found, skipping kubectl delete"
+ source "$TMPFILE"
+ kubectl delete -f src/service/ray-service.yaml
+fi
+
+
+# List of Terraform modules to apply in sequence
+targets=(
+ "module.data_addons"
+ "module.eks_blueprints_addons"
+ "module.eks"
+ "module.vpc"
+)
+
+# Destroy modules in sequence
+for target in "${targets[@]}"
+do
+ echo "Destroying module $target..."
+ destroy_output=$(terraform destroy -target="$target" -var="region=$region" -auto-approve 2>&1 | tee /dev/tty)
+ if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete"* ]]; then
+ echo "SUCCESS: Terraform destroy of $target completed successfully"
+ else
+ echo "FAILED: Terraform destroy of $target failed"
+ exit 1
+ fi
+done
+
+echo "Destroying Load Balancers..."
+
+for arn in $(aws resourcegroupstaggingapi get-resources \
+ --resource-type-filters elasticloadbalancing:loadbalancer \
+ --tag-filters "Key=elbv2.k8s.aws/cluster,Values=jark-stack" \
+ --query 'ResourceTagMappingList[].ResourceARN' \
+ --output text); do \
+ aws elbv2 delete-load-balancer --load-balancer-arn "$arn"; \
+ done
+
+echo "Destroying Target Groups..."
+for arn in $(aws resourcegroupstaggingapi get-resources \
+ --resource-type-filters elasticloadbalancing:targetgroup \
+ --tag-filters "Key=elbv2.k8s.aws/cluster,Values=jark-stack" \
+ --query 'ResourceTagMappingList[].ResourceARN' \
+ --output text); do \
+ aws elbv2 delete-target-group --target-group-arn "$arn"; \
+ done
+
+echo "Destroying Security Groups..."
+for sg in $(aws ec2 describe-security-groups \
+ --filters "Name=tag:elbv2.k8s.aws/cluster,Values=jark-stack" \
+ --query 'SecurityGroups[].GroupId' --output text); do \
+ aws ec2 delete-security-group --group-id "$sg"; \
+ done
+
+## Final destroy to catch any remaining resources
+echo "Destroying remaining resources..."
+destroy_output=$(terraform destroy -var="region=$region"-auto-approve 2>&1 | tee /dev/tty)
+if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete"* ]]; then
+ echo "SUCCESS: Terraform destroy of all modules completed successfully"
+else
+ echo "FAILED: Terraform destroy of all modules failed"
+ exit 1
+fi
diff --git a/ai-ml/jark-stack/terraform/eks.tf b/ai-ml/jark-stack/terraform/eks.tf
new file mode 100644
index 000000000..04c7fd409
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/eks.tf
@@ -0,0 +1,151 @@
+#---------------------------------------------------------------
+# EKS Cluster
+#---------------------------------------------------------------
+module "eks" {
+ source = "terraform-aws-modules/eks/aws"
+ version = "~> 19.15"
+
+ cluster_name = local.name
+ cluster_version = var.eks_cluster_version
+
+ # if true, Your cluster API server is accessible from the internet.
+ # You can, optionally, limit the CIDR blocks that can access the public endpoint.
+ #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing.
+ cluster_endpoint_public_access = true
+
+ vpc_id = module.vpc.vpc_id
+ # Filtering only Secondary CIDR private subnets starting with "100.".
+ # Subnet IDs where the EKS Control Plane ENIs will be created
+ subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
+ substr(cidr_block, 0, 4) == "100." ? subnet_id : null])
+
+ manage_aws_auth_configmap = true
+ aws_auth_roles = [
+ # We need to add in the Karpenter node IAM role for nodes launched by Karpenter
+ {
+ rolearn = module.eks_blueprints_addons.karpenter.node_iam_role_arn
+ username = "system:node:{{EC2PrivateDNSName}}"
+ groups = [
+ "system:bootstrappers",
+ "system:nodes",
+ ]
+ }
+ ]
+ #---------------------------------------
+ # Note: These rules can be further restricted to the specific ports required by each add-on and your application
+ #---------------------------------------
+ # Extend cluster security group rules
+ cluster_security_group_additional_rules = {
+ ingress_nodes_ephemeral_ports_tcp = {
+ description = "Nodes on ephemeral ports"
+ protocol = "tcp"
+ from_port = 0
+ to_port = 65535
+ type = "ingress"
+ source_node_security_group = true
+ }
+ }
+
+ node_security_group_additional_rules = {
+ # Allows Control Plane Nodes to talk to Worker nodes on all ports.
+ # Added this to simplify the example and further avoid issues with Add-ons communication with Control plane.
+ # This can be restricted further to specific port based on the requirement for each Add-on
+ # e.g., coreDNS 53, metrics-server 4443.
+ # Update this according to your security requirements if needed
+ ingress_cluster_to_node_all_traffic = {
+ description = "Cluster API to Nodegroup all traffic"
+ protocol = "-1"
+ from_port = 0
+ to_port = 0
+ type = "ingress"
+ source_cluster_security_group = true
+ }
+ }
+
+ eks_managed_node_group_defaults = {
+ iam_role_additional_policies = {
+ # Not required, but used in the example to access the nodes to inspect mounted volumes
+ AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
+ }
+
+ ebs_optimized = true
+ # This block device is used only for root volume. Adjust volume according to your size.
+ # NOTE: Don't use this volume for ML workloads
+ block_device_mappings = {
+ xvda = {
+ device_name = "/dev/xvda"
+ ebs = {
+ volume_size = 100
+ volume_type = "gp3"
+ }
+ }
+ }
+ }
+
+ eks_managed_node_groups = {
+ # It's recommended to have a Managed Node group for hosting critical add-ons
+ # It's recommended to use Karpenter to place your workloads instead of using Managed Node groups
+ # You can leverage nodeSelector and Taints/tolerations to distribute workloads across Managed Node group or Karpenter nodes.
+ core_node_group = {
+ name = "core-node-group"
+ description = "EKS Core node group for hosting system add-ons"
+ # Filtering only Secondary CIDR private subnets starting with "100.".
+ # Subnet IDs where the nodes/node groups will be provisioned
+ subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
+ substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
+ )
+
+ # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2/recommended/image_id --region us-west-2
+ ami_type = "AL2_x86_64" # Use this for Graviton AL2_ARM_64
+ min_size = 2
+ max_size = 8
+ desired_size = 2
+
+ instance_types = ["m5.xlarge"]
+
+ labels = {
+ WorkerType = "ON_DEMAND"
+ NodeGroupType = "core"
+ }
+
+ tags = merge(local.tags, {
+ Name = "core-node-grp"
+ })
+ }
+
+ # GPU Nodegroup for JupyterHub Notebook and Ray Service
+ gpu1 = {
+ name = "gpu-node-grp"
+ description = "EKS Node Group to run GPU workloads"
+ # Filtering only Secondary CIDR private subnets starting with "100.".
+ # Subnet IDs where the nodes/node groups will be provisioned
+ subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
+ substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
+ )
+
+ ami_type = "AL2_x86_64_GPU"
+ min_size = 1
+ max_size = 1
+ desired_size = 1
+
+ instance_types = ["g5.12xlarge"]
+
+ labels = {
+ WorkerType = "ON_DEMAND"
+ NodeGroupType = "gpu"
+ }
+
+ taints = {
+ gpu = {
+ key = "nvidia.com/gpu"
+ effect = "NO_SCHEDULE"
+ operator = "EXISTS"
+ }
+ }
+
+ tags = merge(local.tags, {
+ Name = "gpu-node-grp"
+ })
+ }
+ }
+}
diff --git a/ai-ml/jark-stack/terraform/helm-values/ingress-nginx-values.yaml b/ai-ml/jark-stack/terraform/helm-values/ingress-nginx-values.yaml
new file mode 100644
index 000000000..c8b1a5d74
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/helm-values/ingress-nginx-values.yaml
@@ -0,0 +1,11 @@
+controller:
+ service:
+ externalTrafficPolicy: "Local"
+ annotations:
+ service.beta.kubernetes.io/aws-load-balancer-type: external
+ service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+ service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http
+ service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Private Load Balancer can only be accessed within the VPC
+ targetPorts:
+ http: http
+ https: http
diff --git a/ai-ml/jark-stack/terraform/helm-values/jupyterhub-values.yaml b/ai-ml/jark-stack/terraform/helm-values/jupyterhub-values.yaml
new file mode 100644
index 000000000..fcad06b62
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/helm-values/jupyterhub-values.yaml
@@ -0,0 +1,59 @@
+hub:
+ config:
+ Authenticator:
+ admin_users:
+ - admin1
+ allowed_users:
+ - user1
+ # testing only - do not do this for production
+ DummyAuthenticator:
+ password: never-do-this
+ JupyterHub:
+ authenticator_class: dummy
+proxy:
+ service:
+ annotations:
+ service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+ service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Private Load Balancer can only be accessed within the VPC
+ service.beta.kubernetes.io/aws-load-balancer-type: external
+ service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true'
+ service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4
+singleuser:
+ image:
+ name: public.ecr.aws/h3o5n2r0/gpu-jupyter
+ tag: v1.5_cuda-11.6_ubuntu-20.04_python-only
+ pullPolicy: Always
+ cmd: null
+ startTimeout: 600
+ memory:
+ guarantee: 24G
+ extraResource:
+ limits:
+ nvidia.com/gpu: "1"
+ extraEnv:
+ HUGGING_FACE_HUB_TOKEN:
+ valueFrom:
+ secretKeyRef:
+ name: hf-token
+ key: token
+ storage:
+ capacity: 100Gi
+ extraVolumes:
+ - name: shm-volume
+ emptyDir:
+ medium: Memory
+ - name: notebook
+ configMap:
+ name: notebook
+ extraVolumeMounts:
+ - name: shm-volume
+ mountPath: /dev/shm
+ - name: notebook
+ mountPath: /home/jovyan/dogbooth
+ extraTolerations:
+ - key: nvidia.com/gpu
+ operator: Exists
+ effect: NoSchedule
+scheduling:
+ userScheduler:
+ enabled: false
diff --git a/ai-ml/jark-stack/terraform/helm-values/nvidia-values.yaml b/ai-ml/jark-stack/terraform/helm-values/nvidia-values.yaml
new file mode 100644
index 000000000..9fa59599e
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/helm-values/nvidia-values.yaml
@@ -0,0 +1,10 @@
+gfd:
+ enabled: true
+nfd:
+ enabled: true
+ worker:
+ tolerations:
+ - key: nvidia.com/gpu
+ operator: Exists
+ effect: NoSchedule
+ - operator: "Exists"
diff --git a/ai-ml/jark-stack/terraform/install.sh b/ai-ml/jark-stack/terraform/install.sh
new file mode 100755
index 000000000..18f2a94d3
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/install.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+read -p "Enter the region: " region
+export AWS_DEFAULT_REGION=$region
+
+# List of Terraform modules to apply in sequence
+targets=(
+ "module.vpc"
+ "module.eks"
+)
+
+# Initialize Terraform
+terraform init -upgrade
+
+# Apply modules in sequence
+for target in "${targets[@]}"
+do
+ echo "Applying module $target..."
+ apply_output=$(terraform apply -target="$target" -var="region=$region" -auto-approve 2>&1 | tee /dev/tty)
+ if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then
+ echo "SUCCESS: Terraform apply of $target completed successfully"
+ else
+ echo "FAILED: Terraform apply of $target failed"
+ exit 1
+ fi
+done
+
+# Final apply to catch any remaining resources
+echo "Applying remaining resources..."
+apply_output=$(terraform apply -var="region=$region" -auto-approve 2>&1 | tee /dev/tty)
+if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then
+ echo "SUCCESS: Terraform apply of all modules completed successfully"
+else
+ echo "FAILED: Terraform apply of all modules failed"
+ exit 1
+fi
diff --git a/ai-ml/jark-stack/terraform/main.tf b/ai-ml/jark-stack/terraform/main.tf
new file mode 100644
index 000000000..bbbb966cd
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/main.tf
@@ -0,0 +1,38 @@
+provider "aws" {
+ region = local.region
+}
+
+provider "kubernetes" {
+ host = module.eks.cluster_endpoint
+ cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+ token = data.aws_eks_cluster_auth.this.token
+}
+
+provider "helm" {
+ kubernetes {
+ host = module.eks.cluster_endpoint
+ cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+ token = data.aws_eks_cluster_auth.this.token
+ }
+}
+provider "kubectl" {
+ apply_retry_count = 30
+ host = module.eks.cluster_endpoint
+ cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+ token = data.aws_eks_cluster_auth.this.token
+ load_config_file = false
+}
+
+data "aws_eks_cluster_auth" "this" {
+ name = module.eks.cluster_name
+}
+
+locals {
+ name = var.name
+ region = var.region
+ azs = ["${local.region}c", "${local.region}d"]
+ tags = {
+ Blueprint = local.name
+ GithubRepo = "github.com/awslabs/data-on-eks"
+ }
+}
diff --git a/ai-ml/jark-stack/terraform/outputs.tf b/ai-ml/jark-stack/terraform/outputs.tf
new file mode 100644
index 000000000..f6444daab
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/outputs.tf
@@ -0,0 +1,4 @@
+output "configure_kubectl" {
+ description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
+ value = "aws eks --region ${var.region} update-kubeconfig --name ${var.name}"
+}
diff --git a/ai-ml/jark-stack/terraform/src/app/Dockerfile b/ai-ml/jark-stack/terraform/src/app/Dockerfile
new file mode 100644
index 000000000..afa9fc5ee
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/src/app/Dockerfile
@@ -0,0 +1,24 @@
+FROM python:3.8-slim
+
+RUN groupadd --gid 1000 appuser \
+    && useradd --uid 1000 --gid 1000 -ms /bin/bash appuser
+
+RUN pip3 install --no-cache-dir --upgrade \
+    pip \
+    virtualenv
+
+USER appuser
+WORKDIR /home/appuser
+
+ENV VIRTUAL_ENV=/home/appuser/venv
+RUN virtualenv ${VIRTUAL_ENV}
+RUN . ${VIRTUAL_ENV}/bin/activate && \
+    pip install --no-cache-dir requests streamlit Pillow
+
+EXPOSE 8501
+
+# run.sh launches ${HOME}/streamlit_app.py, so the script is installed under that name
+COPY streamlit.py /home/appuser/streamlit_app.py
+COPY run.sh /home/appuser/
+# Run through bash: run.sh is committed with mode 100644, so it cannot be exec'd directly
+ENTRYPOINT ["/bin/bash", "/home/appuser/run.sh"]
diff --git a/ai-ml/jark-stack/terraform/src/app/run.sh b/ai-ml/jark-stack/terraform/src/app/run.sh
new file mode 100644
index 000000000..0c52b8acc
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/src/app/run.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+#
+# Container entrypoint: starts Streamlit in the background and, on EXIT/TERM,
+# forwards SIGTERM to it and waits for it to finish.
+
+COMMAND_PATH="${HOME}/streamlit_app.py"
+APP_PID=
+stopRunningProcess() {
+  # Based on https://linuxconfig.org/how-to-propagate-a-signal-to-child-processes-from-a-bash-script
+  # kill -0 sends no signal; it only reports whether the process still exists
+  if test ! "${APP_PID}" = '' && kill -0 "${APP_PID}" 2> /dev/null ; then
+    > /proc/1/fd/1 echo "Stopping ${COMMAND_PATH} which is running with process ID ${APP_PID}"
+
+    kill -TERM "${APP_PID}"
+    > /proc/1/fd/1 echo "Waiting for ${COMMAND_PATH} to process SIGTERM signal"
+
+    wait "${APP_PID}"
+    > /proc/1/fd/1 echo "All processes have stopped running"
+  else
+    > /proc/1/fd/1 echo "${COMMAND_PATH} was not started when the signal was sent or it has already been stopped"
+  fi
+}
+
+trap stopRunningProcess EXIT TERM
+
+source "${VIRTUAL_ENV}/bin/activate"
+
+streamlit run "${COMMAND_PATH}" &
+APP_PID=${!}
+
+wait "${APP_PID}"
diff --git a/ai-ml/jark-stack/terraform/src/app/streamlit.py b/ai-ml/jark-stack/terraform/src/app/streamlit.py
new file mode 100644
index 000000000..2448e114e
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/src/app/streamlit.py
@@ -0,0 +1,33 @@
+import streamlit as st
+import requests
+from urllib.parse import urlencode
+from PIL import Image
+import tempfile
+
+
+### Update Hostname before building image
+base_url=""
+
+st.title("Welcome to dogbooth! :dog:")
+st.header("_a place to create images of [v]dog in beautiful scenes._")
+
+
+prompt = st.chat_input("a photo of a [v]dog ...")
+if prompt:
+ query_params = {
+ "prompt": prompt
+ }
+ encoded_query = urlencode(query_params)
+ image_url = f"{base_url}?{encoded_query}"
+
+ with st.spinner("Wait for it..."):
+ response = requests.get(image_url, timeout=180)
+
+ if response.status_code == 200:
+ content_size = len(response.content)
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
+ f.write(response.content)
+ st.image(Image.open(f.name), caption=prompt)
+ st.balloons()
+ else:
+ st.error(f"Failed to download image. Status code: {response.status_code}")
diff --git a/ai-ml/jark-stack/terraform/src/app/streamlit.yaml b/ai-ml/jark-stack/terraform/src/app/streamlit.yaml
new file mode 100644
index 000000000..ae3fc0df5
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/src/app/streamlit.yaml
@@ -0,0 +1,83 @@
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: dogbooth-app
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: streamlit-deployment
+ namespace: dogbooth-app
+ labels:
+ app: streamlit
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: streamlit
+ template:
+ metadata:
+ labels:
+ app: streamlit
+ spec:
+ containers:
+ - name: streamlit
+ image: public.ecr.aws/h3o5n2r0/gen-ai-demo/dogbooth-app:0.0.2
+ imagePullPolicy: Always
+ ports:
+ - containerPort: 8501
+ livenessProbe:
+ httpGet:
+ path: /_stcore/health
+ port: 8501
+ scheme: HTTP
+ timeoutSeconds: 1
+ readinessProbe:
+ httpGet:
+ path: /_stcore/health
+ port: 8501
+ scheme: HTTP
+ timeoutSeconds: 1
+ resources:
+ limits:
+ cpu: 1
+ memory: 2Gi
+ requests:
+ cpu: 100m
+ memory: 745Mi
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: streamlit-service
+ namespace: dogbooth-app
+spec:
+ type: ClusterIP
+ selector:
+ app: streamlit
+ ports:
+ - name: streamlit-port
+ protocol: TCP
+ port: 8501
+ targetPort: 8501
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+ name: dogbooth-app
+ namespace: dogbooth-app
+ annotations:
+ nginx.ingress.kubernetes.io/rewrite-target: "/$1"
+spec:
+ ingressClassName: nginx
+ rules:
+ - http:
+ paths:
+ - path: /dogbooth/app/(.*)
+ pathType: ImplementationSpecific
+ backend:
+ service:
+ name: streamlit-service
+ port:
+ number: 8501
diff --git a/ai-ml/jark-stack/terraform/src/notebook/Dockerfile b/ai-ml/jark-stack/terraform/src/notebook/Dockerfile
new file mode 100644
index 000000000..da5c32dd7
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/src/notebook/Dockerfile
@@ -0,0 +1,13 @@
+FROM cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only
+
+USER root
+
+# -y: the image build has no terminal to answer conda's confirmation prompt
+RUN conda install -y -c anaconda libstdcxx-ng
+
+# Replace the system libstdc++ with the conda-provided one. The versioned file
+# name is resolved at build time because libstdcxx-ng is not pinned above.
+RUN lib="$(readlink -f /opt/conda/lib/libstdc++.so.6)" && \
+    cp "${lib}" /usr/lib/x86_64-linux-gnu/ && \
+    cd /usr/lib/x86_64-linux-gnu && \
+    ln -sf "$(basename "${lib}")" libstdc++.so.6
diff --git a/ai-ml/jark-stack/terraform/src/notebook/dogbooth.ipynb b/ai-ml/jark-stack/terraform/src/notebook/dogbooth.ipynb
new file mode 100644
index 000000000..0e54206b9
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/src/notebook/dogbooth.ipynb
@@ -0,0 +1,275 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Verify NVIDIA GPU is visible\n",
+ "!nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "os.chdir(\"/home/jovyan\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Clone the diffusers repo\n",
+ "!git clone https://github.com/huggingface/diffusers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Change the directory\n",
+ "os.chdir(\"diffusers\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Install requirements\n",
+ "! pip install -e .\n",
+ "! pip install xformers==0.0.16 diffusers[torch]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Fix for bitsandbytes https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md\n",
+ "! wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_install.sh\n",
+ "! bash cuda_install.sh 117 ~/local 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Install bitsandbytes for optimizations\n",
+ "! pip install bitsandbytes==0.41.0"
+ ]
+ },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Use the newly installed CUDA version for bitsandbytes\n",
+    "os.environ[\"BNB_CUDA_VERSION\"] = \"117\"\n",
+    "# LD_LIBRARY_PATH may be unset (os.getenv returns None), so join only the parts that exist\n",
+    "cuda_dir = \"/home/jovyan/local/cuda-11.7\"\n",
+    "os.environ[\"LD_LIBRARY_PATH\"] = os.pathsep.join(filter(None, [os.getenv(\"LD_LIBRARY_PATH\"), cuda_dir]))"
+   ]
+  },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Validate successful install of bitsandbytes\n",
+ "! python -m bitsandbytes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Install requirements for dreambooth\n",
+ "os.chdir(\"examples/dreambooth\")\n",
+ "! pip install -r requirements.txt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Setup default configuration for accelerate\n",
+ "! accelerate config default"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Login to huggingface associated with your account (please create one if it doesn't exist)\n",
+ "! huggingface-cli login --token $HUGGING_FACE_HUB_TOKEN"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Download sample dataset of the subject. See the sample images here https://huggingface.co/datasets/diffusers/dog-example\n",
+ "from huggingface_hub import snapshot_download\n",
+ "\n",
+ "local_dir = \"./dog\"\n",
+ "snapshot_download(\n",
+ " \"diffusers/dog-example\",\n",
+ " local_dir=local_dir, repo_type=\"dataset\",\n",
+ " ignore_patterns=\".gitattributes\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Export environment variables to provide input model, dataset directory and output directory for the tuned model\n",
+ "os.environ[\"MODEL_NAME\"] = \"stabilityai/stable-diffusion-2-1\"\n",
+ "os.environ[\"INSTANCE_DIR\"] = \"dog\"\n",
+ "os.environ[\"OUTPUT_DIR\"] = \"dogbooth\"\n",
+ "os.environ[\"RESOLUTION\"] = \"768\"\n",
+ "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"garbage_collection_threshold:0.6,max_split_size_mb:128\""
+ ]
+ },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Launch the training and push the output model to huggingface\n",
+    "# All inputs come from the environment variables exported in the previous cell\n",
+    "! accelerate launch train_dreambooth.py \\\n",
+    " --pretrained_model_name_or_path=$MODEL_NAME \\\n",
+    " --instance_data_dir=$INSTANCE_DIR \\\n",
+    " --output_dir=$OUTPUT_DIR \\\n",
+    " --instance_prompt=\"a photo of [v]dog\" \\\n",
+    " --resolution=$RESOLUTION \\\n",
+    " --train_batch_size=1 \\\n",
+    " --gradient_accumulation_steps=1 \\\n",
+    " --gradient_checkpointing \\\n",
+    " --learning_rate=1e-6 \\\n",
+    " --lr_scheduler=\"constant\" \\\n",
+    " --enable_xformers_memory_efficient_attention \\\n",
+    " --use_8bit_adam \\\n",
+    " --lr_warmup_steps=0 \\\n",
+    " --max_train_steps=800 \\\n",
+    " --push_to_hub"
+   ]
+  },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Run a sample inference\n",
+ "from diffusers import StableDiffusionPipeline\n",
+ "import torch\n",
+ "\n",
+ "model_id = \"./dogbooth\"\n",
+ "pipe = StableDiffusionPipeline.from_pretrained(model_id).to(\"cuda\")\n",
+ "\n",
+ "prompt = \"a photo of [v]dog on the moon\"\n",
+ "image = pipe(prompt, num_inference_steps=100, guidance_scale=7.5).images[0]\n",
+ "\n",
+ "image.save(\"dog-bucket.png\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/ai-ml/jark-stack/terraform/src/service/Dockerfile b/ai-ml/jark-stack/terraform/src/service/Dockerfile
new file mode 100644
index 000000000..1f348eaeb
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/src/service/Dockerfile
@@ -0,0 +1,8 @@
+FROM rayproject/ray-ml:2.6.0-gpu
+
+# --no-cache-dir keeps pip's download cache out of the image layer.
+# NOTE(review): transformers is installed from the repository's default branch and
+# diffusers is unpinned, so two builds of this image can differ; consider pinning both.
+RUN pip install --no-cache-dir -U \
+    git+https://github.com/huggingface/transformers diffusers
+
+WORKDIR /serve_app
+
+# Module imported by Ray Serve through the importPath "dogbooth:entrypoint" in ray-service.yaml.
+COPY dogbooth.py /serve_app/dogbooth.py
diff --git a/ai-ml/jark-stack/terraform/src/service/dogbooth.py b/ai-ml/jark-stack/terraform/src/service/dogbooth.py
new file mode 100644
index 000000000..f3b42b474
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/src/service/dogbooth.py
@@ -0,0 +1,58 @@
+from io import BytesIO
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import Response
+import torch
+import os
+from ray import serve
+
+
+app = FastAPI()
+
+
+@serve.deployment(num_replicas=1, route_prefix="/")
+@serve.ingress(app)
+class APIIngress:
+ def __init__(self, diffusion_model_handle) -> None:
+ self.handle = diffusion_model_handle
+
+ @app.get(
+ "/imagine",
+ responses={200: {"content": {"image/png": {}}}},
+ response_class=Response,
+ )
+ async def generate(self, prompt: str, img_size: int = 768):
+ assert len(prompt), "prompt parameter cannot be empty"
+
+ image_ref = await self.handle.generate.remote(prompt, img_size=img_size)
+ image = await image_ref
+ file_stream = BytesIO()
+ image.save(file_stream, "PNG")
+ return Response(content=file_stream.getvalue(), media_type="image/png")
+
+
+@serve.deployment(
+ ray_actor_options={"num_gpus": 1},
+ autoscaling_config={"min_replicas": 1, "max_replicas": 2},
+)
+class StableDiffusionV2:
+ def __init__(self):
+ from diffusers import EulerDiscreteScheduler, StableDiffusionPipeline
+
+ model_id = os.getenv('MODEL_ID')
+
+ scheduler = EulerDiscreteScheduler.from_pretrained(
+ model_id, subfolder="scheduler"
+ )
+ self.pipe = StableDiffusionPipeline.from_pretrained(
+ model_id, scheduler=scheduler
+ )
+ self.pipe = self.pipe.to("cuda")
+
+ def generate(self, prompt: str, img_size: int = 768):
+ assert len(prompt), "prompt parameter cannot be empty"
+
+ image = self.pipe(prompt, height=img_size, width=img_size).images[0]
+ return image
+
+
+# Application object: the HTTP ingress bound to the model deployment.
+# ray-service.yaml refers to it through importPath "dogbooth:entrypoint".
+entrypoint = APIIngress.bind(StableDiffusionV2.bind())
diff --git a/ai-ml/jark-stack/terraform/src/service/ray-service.yaml b/ai-ml/jark-stack/terraform/src/service/ray-service.yaml
new file mode 100644
index 000000000..a72a2c904
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/src/service/ray-service.yaml
@@ -0,0 +1,99 @@
+---
+# Namespace shared by the RayService and the Ingress defined later in this file.
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: dogbooth
+---
+# RayService that runs the Serve application from dogbooth.py on a Ray cluster
+# made of one GPU head pod and a GPU worker group sized between 1 and 5 replicas.
+apiVersion: ray.io/v1alpha1
+kind: RayService
+metadata:
+ name: dogbooth
+ namespace: dogbooth
+spec:
+ serviceUnhealthySecondThreshold: 600
+ deploymentUnhealthySecondThreshold: 600
+ serveConfig:
+ # <module>:<attribute> - the `entrypoint` object defined at the end of dogbooth.py
+ importPath: dogbooth:entrypoint
+ # MODEL_ID is read with os.getenv('MODEL_ID') in dogbooth.py to pick the model to load
+ runtimeEnv: |
+ env_vars: {"MODEL_ID": "askulkarni2/dogbooth"}
+ rayClusterConfig:
+ rayVersion: '2.6.0'
+ headGroupSpec:
+ rayStartParams:
+ dashboard-host: '0.0.0.0'
+ template:
+ spec:
+ containers:
+ - name: ray-head
+ image: public.ecr.aws/h3o5n2r0/dogbooth:0.0.1-gpu
+ resources:
+ limits:
+ cpu: 2
+ memory: 16Gi
+ nvidia.com/gpu: 1
+ requests:
+ cpu: 2
+ memory: 16Gi
+ nvidia.com/gpu: 1
+ # Ports 8265 (dashboard) and 8000 (serve) are the ones the Ingress in this file routes to
+ ports:
+ - containerPort: 6379
+ name: gcs-server
+ - containerPort: 8265
+ name: dashboard
+ - containerPort: 10001
+ name: client
+ - containerPort: 8000
+ name: serve
+ workerGroupSpecs:
+ - replicas: 1
+ minReplicas: 1
+ maxReplicas: 5
+ rayStartParams: {}
+ groupName: small-group
+ template:
+ spec:
+ containers:
+ - name: ray-worker
+ image: public.ecr.aws/h3o5n2r0/dogbooth:0.0.1-gpu
+ lifecycle:
+ # Run `ray stop` before the container is terminated
+ preStop:
+ exec:
+ command: ["/bin/sh","-c","ray stop"]
+ resources:
+ limits:
+ cpu: "2"
+ memory: "16Gi"
+ nvidia.com/gpu: 1
+ requests:
+ cpu: "2"
+ memory: "16Gi"
+ nvidia.com/gpu: 1
+---
+# Routes two URL prefixes to the Ray head service through the nginx ingress class.
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+ name: dogbooth
+ namespace: dogbooth
+ annotations:
+ # The text captured by (.*) in each path replaces $1, dropping the matched prefix
+ nginx.ingress.kubernetes.io/rewrite-target: "/$1"
+spec:
+ ingressClassName: nginx
+ rules:
+ - http:
+ paths:
+ # Ray Dashboard: /dogbooth/<rest> -> head service port 8265
+ # NOTE(review): this pattern also matches /dogbooth/serve/...; routing relies on the
+ # controller preferring the longer /dogbooth/serve/(.*) path - confirm.
+ - path: /dogbooth/(.*)
+ pathType: ImplementationSpecific
+ backend:
+ service:
+ name: dogbooth-head-svc
+ port:
+ number: 8265
+ # Ray Serve: /dogbooth/serve/<rest> -> head service port 8000
+ - path: /dogbooth/serve/(.*)
+ pathType: ImplementationSpecific
+ backend:
+ service:
+ name: dogbooth-head-svc
+ port:
+ number: 8000
diff --git a/ai-ml/jark-stack/terraform/variables.tf b/ai-ml/jark-stack/terraform/variables.tf
new file mode 100644
index 000000000..b95ca832d
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/variables.tf
@@ -0,0 +1,40 @@
+variable "name" {
+  description = "Name of the VPC and EKS Cluster"
+  type        = string
+  default     = "jark-stack"
+}
+
+# NOTE: Trainium and Inferentia are only available in us-west-2 and us-east-1 regions
+variable "region" {
+  description = "region"
+  type        = string
+  default     = "us-west-2"
+}
+
+variable "eks_cluster_version" {
+  description = "EKS Cluster version"
+  type        = string
+  default     = "1.27"
+}
+
+# VPC with 2046 IPs (10.1.0.0/21) and 2 AZs
+variable "vpc_cidr" {
+  description = "VPC CIDR. This should be a valid private (RFC 1918) CIDR range"
+  type        = string
+  default     = "10.1.0.0/21"
+}
+
+# RFC6598 range 100.64.0.0/10
+# Note: you can only add a /16 range to the VPC. You can add multiple /16 ranges if required
+variable "secondary_cidr_blocks" {
+  description = "Secondary CIDR blocks to be attached to VPC"
+  type        = list(string)
+  default     = ["100.64.0.0/16"]
+}
+
+# Marked sensitive so Terraform redacts the value in its output; replace the placeholder default.
+variable "huggingface_token" {
+  description = "Hugging Face Secret Token"
+  type        = string
+  sensitive   = true
+  default     = "DUMMY_TOKEN_REPLACE_ME"
+}
diff --git a/ai-ml/jark-stack/terraform/versions.tf b/ai-ml/jark-stack/terraform/versions.tf
new file mode 100644
index 000000000..bb085ae7a
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/versions.tf
@@ -0,0 +1,37 @@
+# Minimum Terraform version and minimum provider versions for this blueprint.
+terraform {
+ required_version = ">= 1.0.0"
+
+ required_providers {
+ aws = {
+ source = "hashicorp/aws"
+ version = ">= 3.72"
+ }
+ kubernetes = {
+ source = "hashicorp/kubernetes"
+ version = ">= 2.10"
+ }
+ helm = {
+ source = "hashicorp/helm"
+ version = ">= 2.4.1"
+ }
+ kubectl = {
+ source = "gavinbunney/kubectl"
+ version = ">= 1.14"
+ }
+ random = {
+ source = "hashicorp/random"
+ version = ">= 3.1"
+ }
+ http = {
+ source = "hashicorp/http"
+ version = ">= 3.3"
+ }
+ }
+
+ # ## Used for end-to-end testing on project; update to suit your needs
+ # NOTE(review): the key below still names "trainium-inferentia" although this file
+ # belongs to jark-stack; confirm the state key before enabling this backend.
+ # backend "s3" {
+ # bucket = "doeks-github-actions-e2e-test-state"
+ # region = "us-west-2"
+ # key = "e2e/trainium-inferentia/terraform.tfstate"
+ # }
+}
diff --git a/ai-ml/jark-stack/terraform/vpc.tf b/ai-ml/jark-stack/terraform/vpc.tf
new file mode 100644
index 000000000..59c3da89c
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/vpc.tf
@@ -0,0 +1,53 @@
+locals {
+  # Routable private subnets (Private NAT Gateway -> Transit Gateway -> second VPC for overlapping CIDRs).
+  # One subnet per entry in local.azs, each 3 bits narrower than the VPC CIDR:
+  # var.vpc_cidr = "10.1.0.0/21" => ["10.1.0.0/24", "10.1.1.0/24"]
+  private_subnets = [
+    for i, az in local.azs : cidrsubnet(var.vpc_cidr, 3, i)
+  ]
+
+  # Routable public subnets for the NAT Gateway and Internet Gateway.
+  # Each is 5 bits narrower than the VPC CIDR, numbered from network 8 upwards:
+  # var.vpc_cidr = "10.1.0.0/21" => ["10.1.2.0/26", "10.1.2.64/26"]
+  public_subnets = [
+    for i, az in local.azs : cidrsubnet(var.vpc_cidr, 5, i + 8)
+  ]
+
+  # EKS data-plane subnets (control plane ENIs, nodes and pods), taken from the
+  # first secondary CIDR block and split one bit further:
+  # var.secondary_cidr_blocks = ["100.64.0.0/16"] => ["100.64.0.0/17", "100.64.128.0/17"]
+  secondary_ip_range_private_subnets = [
+    for i, az in local.azs : cidrsubnet(var.secondary_cidr_blocks[0], 1, i)
+  ]
+}
+
+#---------------------------------------------------------------
+# VPC
+#---------------------------------------------------------------
+# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts.
+# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements
+
+# VPC with the primary CIDR, one secondary CIDR, public subnets and a single shared NAT gateway.
+module "vpc" {
+ source = "terraform-aws-modules/vpc/aws"
+ version = "~> 5.0"
+
+ name = local.name
+ cidr = var.vpc_cidr
+ azs = local.azs
+
+ # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods
+ secondary_cidr_blocks = var.secondary_cidr_blocks
+
+ # Private subnets, in the order they are concatenated:
+ # 1/ Private subnets with RFC1918 private IPv4 address range for Private NAT + NLB + Airflow + EC2 Jumphost etc.
+ # 2/ EKS Data Plane secondary CIDR subnets for EKS Control Plane ENI + Nodes + Pods
+ private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets)
+
+ # ------------------------------
+ # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments
+ # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW
+ public_subnets = local.public_subnets
+ enable_nat_gateway = true
+ single_nat_gateway = true
+ #-------------------------------
+
+ public_subnet_tags = {
+ "kubernetes.io/role/elb" = 1
+ }
+
+ private_subnet_tags = {
+ "kubernetes.io/role/internal-elb" = 1
+ # Tags subnets for Karpenter auto-discovery
+ "karpenter.sh/discovery" = local.name
+ }
+
+ tags = local.tags
+}
diff --git a/ai-ml/jupyterhub/addons.tf b/ai-ml/jupyterhub/addons.tf
index 1ca3f0f22..277cf7889 100755
--- a/ai-ml/jupyterhub/addons.tf
+++ b/ai-ml/jupyterhub/addons.tf
@@ -1,7 +1,3 @@
-data "aws_eks_cluster_auth" "this" {
- name = module.eks.cluster_name
-}
-
# Use this data source to get the ARN of a certificate in AWS Certificate Manager (ACM)
data "aws_acm_certificate" "issued" {
count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
@@ -16,6 +12,7 @@ data "aws_ecrpublic_authorization_token" "token" {
locals {
cognito_custom_domain = var.cognito_custom_domain
}
+
#---------------------------------------------------------------
# IRSA for EBS CSI Driver
#---------------------------------------------------------------
@@ -69,7 +66,7 @@ module "eks_blueprints_addons" {
enable_cluster_proportional_autoscaler = true
cluster_proportional_autoscaler = {
timeout = "300"
- values = [templatefile("${path.module}/helm-values/coredns-autoscaler-values.yaml", {
+ values = [templatefile("${path.module}/helm/coredns-autoscaler/values.yaml", {
target = "deployment/coredns"
})]
description = "Cluster Proportional Autoscaler for CoreDNS Service"
@@ -81,7 +78,7 @@ module "eks_blueprints_addons" {
enable_metrics_server = true
metrics_server = {
timeout = "300"
- values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})]
+ values = [templatefile("${path.module}/helm/metrics-server/values.yaml", {})]
}
#---------------------------------------
@@ -91,7 +88,7 @@ module "eks_blueprints_addons" {
cluster_autoscaler = {
timeout = "300"
create_role = true
- values = [templatefile("${path.module}/helm-values/cluster-autoscaler-values.yaml", {
+ values = [templatefile("${path.module}/helm/cluster-autoscaler/values.yaml", {
aws_region = var.region,
eks_cluster_id = module.eks.cluster_name
})]
@@ -109,15 +106,158 @@ module "eks_blueprints_addons" {
}
#---------------------------------------
- # CloudWatch metrics for EKS
+ # AWS Load Balancer Controller
#---------------------------------------
- enable_aws_cloudwatch_metrics = true
- aws_cloudwatch_metrics = {
- timeout = "300"
- values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})]
+ enable_aws_load_balancer_controller = true
+
+ #---------------------------------------
+ # Prometheus and Grafana stack
+ #---------------------------------------
+ #---------------------------------------------------------------
+ # Install Monitoring Stack with Prometheus and Grafana
+ # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack`
+ # 2- Grafana Admin user: admin
+ # 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id --region $AWS_REGION --query "SecretString" --output text`
+ #---------------------------------------------------------------
+ enable_kube_prometheus_stack = true
+ kube_prometheus_stack = {
+ values = [templatefile("${path.module}/helm/kube-prometheus-stack/values.yaml", {})]
+ chart_version = "48.1.1"
+ set_sensitive = [
+ {
+ name = "grafana.adminPassword"
+ value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string
+ }
+ ],
+ }
+ #---------------------------------------
+ # AWS for FluentBit
+ #---------------------------------------
+ enable_aws_for_fluentbit = true
+ aws_for_fluentbit_cw_log_group = {
+ use_name_prefix = false
+ name = "/${local.name}/aws-fluentbit-logs" # Add-on creates this log group
+ retention_in_days = 30
+ }
+ aws_for_fluentbit = {
+ values = [templatefile("${path.module}/helm/aws-for-fluentbit/values.yaml", {
+ region = local.region,
+ cloudwatch_log_group = "/${local.name}/aws-fluentbit-logs"
+ cluster_name = module.eks.cluster_name
+ })]
}
- enable_aws_load_balancer_controller = true
+ #---------------------------------------
+ # Additional Helm Charts
+ #---------------------------------------
+ helm_releases = {
+ storageclass = {
+ name = "storageclass"
+ description = "A Helm chart for storage configurations"
+ chart = "${path.module}/helm/storageclass"
+ }
+ karpenter-resources-cpu = {
+ name = "karpenter-resources-cpu"
+ description = "A Helm chart for karpenter CPU based resources"
+ chart = "${path.module}/helm/karpenter-resources"
+ values = [
+ <<-EOT
+ clusterName: ${module.eks.cluster_name}
+ karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
+ EOT
+ ]
+ }
+ karpenter-resources-ts = {
+ name = "karpenter-resources-ts"
+ description = "A Helm chart for karpenter GPU based resources - compatible with GPU time slicing"
+ chart = "${path.module}/helm/karpenter-resources"
+ values = [
+ <<-EOT
+ name: gpu-ts
+ clusterName: ${module.eks.cluster_name}
+ karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
+ instanceSizes: ["xlarge", "2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"]
+ instanceFamilies: ["g5"]
+ taints:
+ - key: hub.jupyter.org/dedicated
+ value: "user"
+ effect: "NoSchedule"
+ - key: nvidia.com/gpu
+ effect: "NoSchedule"
+ amiFamily: Ubuntu
+ EOT
+ ]
+ }
+ karpenter-resources-mig = {
+ name = "karpenter-resources-gpu"
+ description = "A Helm chart for karpenter GPU based resources - compatible with P4d instances"
+ chart = "${path.module}/helm/karpenter-resources"
+ values = [
+ <<-EOT
+ name: gpu
+ clusterName: ${module.eks.cluster_name}
+ karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
+ instanceSizes: ["24xlarge"]
+ instanceFamilies: ["p4d"]
+ taints:
+ - key: hub.jupyter.org/dedicated
+ value: "user"
+ effect: "NoSchedule"
+ - key: nvidia.com/gpu
+ effect: "NoSchedule"
+ amiFamily: Ubuntu
+ EOT
+ ]
+ }
+ karpenter-resources-inf = {
+ name = "karpenter-resources-inf"
+ description = "A Helm chart for karpenter Inferentia based resources"
+ chart = "${path.module}/helm/karpenter-resources"
+ values = [
+ <<-EOT
+ name: inferentia
+ clusterName: ${module.eks.cluster_name}
+ karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
+ instanceSizes: ["8xlarge", "24xlarge"]
+ instanceFamilies: ["inf2"]
+ taints:
+ - key: aws.amazon.com/neuroncore
+ value: "true"
+ effect: "NoSchedule"
+ - key: aws.amazon.com/neuron
+ value: "true"
+ effect: "NoSchedule"
+ - key: hub.jupyter.org/dedicated
+ value: "user"
+ effect: "NoSchedule"
+ EOT
+ ]
+ }
+ karpenter-resources-trn = {
+ name = "karpenter-resources-trn"
+ description = "A Helm chart for karpenter Trainium based resources"
+ chart = "${path.module}/helm/karpenter-resources"
+ values = [
+ <<-EOT
+ name: trainium
+ clusterName: ${module.eks.cluster_name}
+ karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
+ instanceSizes: ["32xlarge"]
+ instanceFamilies: ["trn1"]
+ taints:
+ - key: aws.amazon.com/neuroncore
+ value: "true"
+ effect: "NoSchedule"
+ - key: aws.amazon.com/neuron
+ value: "true"
+ effect: "NoSchedule"
+ - key: hub.jupyter.org/dedicated
+ value: "user"
+ effect: "NoSchedule"
+ EOT
+ ]
+ }
+ }
tags = local.tags
}
@@ -131,19 +271,25 @@ module "eks_data_addons" {
oidc_provider_arn = module.eks.oidc_provider_arn
+ #---------------------------------------------------------------
+ # Enable Neuron Device Plugin
+ #---------------------------------------------------------------
+ enable_aws_neuron_device_plugin = true
+
#---------------------------------------------------------------
# Enable GPU operator
#---------------------------------------------------------------
- enable_nvidia_gpu_operator = var.jupyter_notebook_support == "gpu" ? true : false
+ enable_nvidia_gpu_operator = true
nvidia_gpu_operator_helm_config = {
- values = [templatefile("${path.module}/helm-values/nvidia-values.yaml", {})]
+ values = [templatefile("${path.module}/helm/nvidia-gpu-operator/values.yaml", {})]
}
+
#---------------------------------------------------------------
# JupyterHub Add-on
#---------------------------------------------------------------
enable_jupyterhub = true
jupyterhub_helm_config = {
- values = [templatefile("${path.module}/helm-values/jupyterhub-values-${var.jupyter_hub_auth_mechanism}-${var.jupyter_notebook_support}.yaml", {
+ values = [templatefile("${path.module}/helm/jupyterhub/jupyterhub-values-${var.jupyter_hub_auth_mechanism}.yaml", {
ssl_cert_arn = try(data.aws_acm_certificate.issued[0].arn, "")
jupyterdomain = try("https://${var.jupyterhub_domain}/hub/oauth_callback", "")
authorize_url = try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/authorize", "")
@@ -151,7 +297,45 @@ module "eks_data_addons" {
userdata_url = try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/userInfo", "")
client_id = try(aws_cognito_user_pool_client.user_pool_client[0].id, "")
client_secret = try(aws_cognito_user_pool_client.user_pool_client[0].client_secret, "")
+ user_pool_id = try(aws_cognito_user_pool.pool[0].id, "")
+ identity_pool_id = try(aws_cognito_identity_pool.identity_pool[0].id, "")
jupyter_single_user_sa_name = kubernetes_service_account_v1.jupyterhub_single_user_sa.metadata[0].name
+ region = var.region
})]
}
+
+ #---------------------------------------------------------------
+ # Kubecost Add-on
+ #---------------------------------------------------------------
+ enable_kubecost = true
+ kubecost_helm_config = {
+ values = [templatefile("${path.module}/helm/kubecost/values.yaml", {})]
+ repository_username = data.aws_ecrpublic_authorization_token.token.user_name
+ repository_password = data.aws_ecrpublic_authorization_token.token.password
+ }
+}
+
+#---------------------------------------------------------------
+# Grafana Admin credentials resources
+#---------------------------------------------------------------
+# Reads the Grafana admin password back from Secrets Manager; this value is passed to
+# grafana.adminPassword in the kube-prometheus-stack settings.
+data "aws_secretsmanager_secret_version" "admin_password_version" {
+ secret_id = aws_secretsmanager_secret.grafana.id
+ # The secret version must exist before it can be read
+ depends_on = [aws_secretsmanager_secret_version.grafana]
+}
+
+# 16-character generated password; special characters are limited to "@" and "_"
+resource "random_password" "grafana" {
+ length = 16
+ special = true
+ override_special = "@_"
+}
+
+# Secret that stores the generated password
+#tfsec:ignore:aws-ssm-secret-use-customer-key
+resource "aws_secretsmanager_secret" "grafana" {
+ name_prefix = "${local.name}-grafana-"
+ recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy
+}
+
+# Stores the generated password as the secret's value
+resource "aws_secretsmanager_secret_version" "grafana" {
+ secret_id = aws_secretsmanager_secret.grafana.id
+ secret_string = random_password.grafana.result
}
diff --git a/ai-ml/jupyterhub/cleanup.sh b/ai-ml/jupyterhub/cleanup.sh
index 4412881ce..8438ddf84 100755
--- a/ai-ml/jupyterhub/cleanup.sh
+++ b/ai-ml/jupyterhub/cleanup.sh
@@ -2,13 +2,11 @@
set -o errexit
set -o pipefail
-read -p "Enter domain name with wildcard and ensure ACM certificate is created for this domain name, e.g. *.example.com :" acm_certificate_domain
-read -p "Enter sub-domain name for jupyterhub to be hosted, e.g. eks.example.com : " jupyterhub_domain
-
targets=(
"module.eks_data_addons"
"module.eks_blueprints_addons"
"module.eks"
+ "module.vpc"
)
#-------------------------------------------
@@ -32,7 +30,7 @@ done
#-------------------------------------------
for target in "${targets[@]}"
do
- destroy_output=$(terraform destroy -target="$target" -var="acm_certificate_domain=$acm_certificate_domain" -var="jupyterhub_domain=$jupyterhub_domain" -auto-approve | tee /dev/tty)
+ destroy_output=$(terraform destroy -target="$target" -auto-approve | tee /dev/tty)
if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then
echo "SUCCESS: Terraform destroy of $target completed successfully"
else
@@ -44,7 +42,7 @@ done
#-------------------------------------------
# Terraform destroy full
#-------------------------------------------
-destroy_output=$(terraform destroy -target="$target" -var="acm_certificate_domain=$acm_certificate_domain" -var="jupyterhub_domain=$jupyterhub_domain" -auto-approve | tee /dev/tty)
+# No -target here: after the loop "$target" still holds the last list entry, so passing it
+# would limit this final pass to that one module instead of destroying everything that is left.
+destroy_output=$(terraform destroy -auto-approve | tee /dev/tty)
if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then
echo "SUCCESS: Terraform destroy of all targets completed successfully"
else
diff --git a/ai-ml/jupyterhub/cognito.tf b/ai-ml/jupyterhub/cognito.tf
new file mode 100644
index 000000000..57338986b
--- /dev/null
+++ b/ai-ml/jupyterhub/cognito.tf
@@ -0,0 +1,224 @@
+#---------------------------------------------------------------
+# Lambda function for pre token generation
+#----------------------------------------------------------------
+
+# Trust policy document that allows the Lambda and Cognito user-pool service principals
+# to assume a role; used as the assume-role policy of aws_iam_role.iam_for_lambda.
+data "aws_iam_policy_document" "assume_role" {
+ statement {
+ effect = "Allow"
+ principals {
+ type = "Service"
+ identifiers = ["lambda.amazonaws.com", "cognito-idp.amazonaws.com"]
+ }
+ actions = ["sts:AssumeRole"]
+ }
+}
+
+# Looks up the AWS-managed AWSLambdaBasicExecutionRole policy so it can be attached to the Lambda role.
+data "aws_iam_policy" "lambda_execution_policy" {
+ arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
+}
+
+# Execution role for the pre-token-generation Lambda; created only when auth is "cognito".
+# NOTE(review): the role name is fixed, so two deployments in one account would collide - confirm this is acceptable.
+resource "aws_iam_role" "iam_for_lambda" {
+ count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+ name = "iam_for_lambda"
+ assume_role_policy = data.aws_iam_policy_document.assume_role.json
+}
+
+# Attaches the managed basic-execution policy to the Lambda role.
+resource "aws_iam_role_policy_attachment" "lambda_policy_attachment" {
+ count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+ role = aws_iam_role.iam_for_lambda[0].name
+ policy_arn = data.aws_iam_policy.lambda_execution_policy.arn
+}
+
+# Zips an inline index.mjs whose handler sets the claim department = "engineering" on every token.
+# NOTE(review): /tmp/lambda.zip is an absolute, host-specific path; confirm it suits every
+# environment that runs Terraform.
+data "archive_file" "lambda" {
+ type = "zip"
+ output_path = "/tmp/lambda.zip"
+ source {
+ filename = "index.mjs"
+ content = <<-EOF
+ export const handler = async (event) => {
+ event.response = {
+ claimsOverrideDetails: {
+ claimsToAddOrOverride: {
+ department: "engineering",
+ },
+ },
+ };
+
+ return event;
+ };
+
+ EOF
+ }
+}
+
+# Pre-token-generation trigger for the Cognito user pool; deploys the handler zipped by
+# data.archive_file.lambda. Created only when auth is "cognito".
+resource "aws_lambda_function" "pretoken_trigger" {
+  count            = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  function_name    = "pretoken-trigger-function"
+  filename         = data.archive_file.lambda.output_path
+  source_code_hash = data.archive_file.lambda.output_base64sha256
+
+  runtime = "nodejs18.x"
+  handler = "index.handler"
+
+  role = aws_iam_role.iam_for_lambda[0].arn
+}
+
+# A user pool invokes its triggers through the function's resource-based policy.
+# That statement is not created automatically when the trigger is configured outside the
+# Cognito console, so grant it explicitly, scoped to this user pool.
+resource "aws_lambda_permission" "allow_cognito_pretoken" {
+  count         = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  statement_id  = "AllowExecutionFromCognito"
+  action        = "lambda:InvokeFunction"
+  function_name = aws_lambda_function.pretoken_trigger[0].function_name
+  principal     = "cognito-idp.amazonaws.com"
+  source_arn    = aws_cognito_user_pool.pool[0].arn
+}
+
+#---------------------------------------------------------------
+# Cognito user pool, domain and client creation.
+# These can be used for
+# auth integration later.
+#----------------------------------------------------------------
+# User pool in which users sign in with their email address; the pre-token-generation
+# Lambda is attached as a trigger.
+resource "aws_cognito_user_pool" "pool" {
+ count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+ name = "jupyterhub-userpool"
+
+ username_attributes = ["email"]
+ auto_verified_attributes = ["email"]
+
+ # NOTE(review): a 6-character minimum is short; confirm it is intended outside of demos
+ password_policy {
+ minimum_length = 6
+ }
+
+ lambda_config {
+ pre_token_generation = aws_lambda_function.pretoken_trigger[0].arn
+ }
+}
+
+# Domain for the user pool, named by local.cognito_custom_domain; the same local is used
+# to build the Cognito OAuth URLs passed to the JupyterHub Helm values.
+resource "aws_cognito_user_pool_domain" "domain" {
+ count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+ domain = local.cognito_custom_domain
+ user_pool_id = aws_cognito_user_pool.pool[0].id
+}
+
+# OAuth client JupyterHub uses against the user pool: authorization-code flow,
+# openid + email scopes, a generated client secret and one-day access tokens.
+resource "aws_cognito_user_pool_client" "user_pool_client" {
+ count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+ name = "jupyter-client"
+ access_token_validity = 1
+ token_validity_units {
+ access_token = "days"
+ }
+ # Same URL that is passed to the JupyterHub Helm values as the OAuth callback
+ callback_urls = ["https://${var.jupyterhub_domain}/hub/oauth_callback"]
+ user_pool_id = aws_cognito_user_pool.pool[0].id
+ allowed_oauth_flows_user_pool_client = true
+ allowed_oauth_flows = ["code"]
+ allowed_oauth_scopes = ["openid", "email"]
+ generate_secret = true
+ supported_identity_providers = [
+ "COGNITO"
+ ]
+
+ depends_on = [aws_cognito_user_pool_domain.domain]
+}
+
+#---------------------------------------------------------------
+# Cognito identity pool creation.
+#----------------------------------------------------------------
+# Identity pool that accepts only authenticated identities coming from the user pool client above.
+resource "aws_cognito_identity_pool" "identity_pool" {
+ count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+ identity_pool_name = "jupyterhub-identity-pool"
+ allow_unauthenticated_identities = false
+ cognito_identity_providers {
+ client_id = aws_cognito_user_pool_client.user_pool_client[0].id
+ provider_name = aws_cognito_user_pool.pool[0].endpoint
+ server_side_token_check = true
+ }
+
+ depends_on = [aws_cognito_user_pool_client.user_pool_client]
+}
+
+# Test bucket holding the per-department keys created below; the name gets a generated suffix.
+resource "aws_s3_bucket" "jupyterhub_bucket" {
+ count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+ bucket_prefix = "jupyterhub-test-bucket-"
+}
+
+# Object with no content whose key ends in "/", so "engineering/" exists as a prefix in the bucket.
+resource "aws_s3_object" "engineering_object" {
+ count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+ bucket = aws_s3_bucket.jupyterhub_bucket[0].id
+ key = "engineering/"
+}
+
+# Object with no content whose key ends in "/", so "legal/" exists as a prefix in the bucket.
+resource "aws_s3_object" "legal_object" {
+ count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+ bucket = aws_s3_bucket.jupyterhub_bucket[0].id
+ key = "legal/"
+}
+
+#---------------------------------------------------------------
+# IAM role for a team member from the engineering department
+# In theory there would be other departments such as "legal"
+#----------------------------------------------------------------
+# Role assumed through Cognito web identity federation by signed-in users.
+resource "aws_iam_role" "cognito_authenticated_engineering_role" {
+ count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+
+ name = "EngineeringTeamRole"
+
+ assume_role_policy = jsonencode({
+ Version = "2012-10-17",
+ Statement = [
+ {
+ # sts:TagSession is allowed alongside AssumeRoleWithWebIdentity; the S3 policy on this
+ # role reads the principal tag "department"
+ Action = ["sts:AssumeRoleWithWebIdentity", "sts:TagSession"],
+ Effect = "Allow",
+ Principal = {
+ Federated = "cognito-identity.amazonaws.com"
+ },
+ Condition = {
+ # Only tokens issued for this identity pool
+ StringEquals = {
+ "cognito-identity.amazonaws.com:aud" = aws_cognito_identity_pool.identity_pool[0].id
+ },
+ # Only authenticated identities
+ "ForAnyValue:StringLike" : {
+ "cognito-identity.amazonaws.com:amr" : "authenticated"
+ }
+ }
+ }
+ ]
+ })
+}
+
+# Inline policy on the engineering role: S3 list actions are allowed only when the requested
+# prefix equals the caller's "department" principal tag.
+# Written with jsonencode, like the role's trust policy in this file, so Terraform produces
+# the JSON instead of a hand-maintained heredoc.
+resource "aws_iam_role_policy" "s3_cognito_engineering_policy" {
+  count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  name  = "s3_cognito_engineering_policy"
+  role  = aws_iam_role.cognito_authenticated_engineering_role[0].id
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect   = "Allow"
+        Action   = ["s3:List*"]
+        Resource = "*"
+        Condition = {
+          StringEquals = {
+            # "$$" escapes Terraform interpolation so IAM receives the literal policy
+            # variable ${aws:PrincipalTag/department}
+            "s3:prefix" = "$${aws:PrincipalTag/department}"
+          }
+        }
+      }
+    ]
+  })
+}
+
+# Maps the "department" claim (added to tokens by the pre-token-generation Lambda) to the
+# principal tag "department", which the engineering S3 policy reads as aws:PrincipalTag/department.
+resource "aws_cognito_identity_pool_provider_principal_tag" "example" {
+ count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+ identity_pool_id = aws_cognito_identity_pool.identity_pool[0].id
+ identity_provider_name = aws_cognito_user_pool.pool[0].endpoint
+ use_defaults = false
+ principal_tags = {
+ department = "department"
+ }
+}
+
+# Attaches AmazonS3ReadOnlyAccess to the engineering role.
+# aws_iam_role_policy_attachment is used instead of aws_iam_policy_attachment because the
+# latter manages a policy's attachments exclusively: applied to an AWS-managed policy it
+# can detach that policy from every other user, group and role in the account.
+# NOTE(review): this managed policy grants read access to all buckets, which is broader than
+# the prefix-scoped inline policy on the same role; confirm both are intended.
+resource "aws_iam_role_policy_attachment" "s3_readonly_policy_attachment" {
+  count      = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  role       = aws_iam_role.cognito_authenticated_engineering_role[0].name
+  policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
+}
+
+# Makes the engineering role the role that authenticated identities of the pool receive.
+resource "aws_cognito_identity_pool_roles_attachment" "identity_pool_roles" {
+ count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+ identity_pool_id = aws_cognito_identity_pool.identity_pool[0].id
+ roles = {
+ authenticated = aws_iam_role.cognito_authenticated_engineering_role[0].arn
+ }
+}
diff --git a/ai-ml/jupyterhub/examples/create_image.sh b/ai-ml/jupyterhub/examples/create_image.sh
new file mode 100755
index 000000000..ae33fcb7e
--- /dev/null
+++ b/ai-ml/jupyterhub/examples/create_image.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Set the AWS region and the name of the ECR repository
+
+REGION=us-west-2
+ECR_REPO_NAME=jupyterhub-pytorch-neuron-pytorch
+DOCKER_FILE=docker/jupyterhub-pytorch-neuron-pytorch.Dockerfile
+
+# Check if the ECR repository exists
+if aws ecr describe-repositories --repository-names "$ECR_REPO_NAME" --region "$REGION" >/dev/null 2>&1; then
+ echo "ECR repository '$ECR_REPO_NAME' already exists."
+
+ # Get the ECR_REPO_URI for the existing repository
+ ECR_REPO_URI=$(aws ecr describe-repositories --repository-name "$ECR_REPO_NAME" --query 'repositories[0].repositoryUri' --region "$REGION" --output text)
+ echo "Repository URL: $ECR_REPO_URI"
+else
+ # Create a new ECR repository with the specified name and region
+ aws ecr create-repository --repository-name "$ECR_REPO_NAME" --region "$REGION"
+
+ # Retrieve the URL of the newly created ECR repository
+ ECR_REPO_URI=$(aws ecr describe-repositories --repository-name "$ECR_REPO_NAME" --query 'repositories[0].repositoryUri' --region "$REGION" --output text)
+ echo "Repository URL: $ECR_REPO_URI"
+fi
+
+# Log in to Amazon ECR using docker
+echo -e "Logging in to Amazon ECR..."
+aws ecr get-login-password --region "$REGION" | docker login --username AWS --password-stdin "$ECR_REPO_URI"
+
+# Build the docker image using the provided jupyterhub-pytorch-neuron.Dockerfile and tag it with the ECR repository URI
+echo -e "Building, tagging and pushing docker image... $ECR_REPO_URI:latest"
+# docker build -f docker/jupyterhub-pytorch-neuron.Dockerfile-jupterhub-inferentia-pytorch -t "$ECR_REPO_URI:latest" .
+docker buildx build --push --tag "$ECR_REPO_URI:latest" -o type=image --platform=linux/amd64 -f $DOCKER_FILE .
+
+# Wait for 5 seconds
+sleep 5
+echo -e "Sleeping for 5 seconds..."
diff --git a/ai-ml/jupyterhub/examples/docker/jupyterhub-pytorch-neuron.Dockerfile b/ai-ml/jupyterhub/examples/docker/jupyterhub-pytorch-neuron.Dockerfile
new file mode 100644
index 000000000..687e7a52f
--- /dev/null
+++ b/ai-ml/jupyterhub/examples/docker/jupyterhub-pytorch-neuron.Dockerfile
@@ -0,0 +1,34 @@
+# JupyterHub notebook image with the AWS Neuron SDK for PyTorch, built on the Jupyter base notebook with Python 3.10
+FROM jupyter/base-notebook:python-3.10
+
+# Maintainer label
+LABEL maintainer="DoEKS"
+
+# Suppress debconf prompts during the build only: the valid frontend name is "noninteractive",
+# and ARG (unlike ENV) does not persist into the environment of the final image.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Switch to root to add the Neuron apt repository and install system packages
+USER root
+
+# Install gnupg and the other prerequisites (wget fetches the repository key), then the Neuron system packages.
+# Version specs are quoted so the shell cannot glob-expand "2.*"; apt lists are removed to keep the layer small.
+RUN apt-get update -y && \
+    apt-get install -y gnupg git g++ wget && \
+    . /etc/os-release && \
+    echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/neuron.list && \
+    wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - && \
+    apt-get update -y && \
+    apt-get install -y "aws-neuronx-collectives=2.*" "aws-neuronx-runtime-lib=2.*" "aws-neuronx-tools=2.*" && \
+    rm -rf /var/lib/apt/lists/*
+
+# Switch back to jovyan user for Python package installations
+USER jovyan
+
+# Point pip at the Neuron repository, install the Python packages without keeping a pip cache, then register the Jupyter kernel
+RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com && \
+    pip install --no-cache-dir transformers-neuronx sentencepiece transformers wget awscli ipywidgets "neuronx-cc==2.*" torch-neuronx torchvision ipykernel environment_kernels && \
+    python -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
+
+# Add Neuron path to PATH
+ENV PATH=/opt/aws/neuron/bin:$PATH
diff --git a/ai-ml/jupyterhub/examples/docker/jupyterhub-tensorflow-neuron.Dockerfile b/ai-ml/jupyterhub/examples/docker/jupyterhub-tensorflow-neuron.Dockerfile
new file mode 100644
index 000000000..6f167444d
--- /dev/null
+++ b/ai-ml/jupyterhub/examples/docker/jupyterhub-tensorflow-neuron.Dockerfile
@@ -0,0 +1,34 @@
+# JupyterHub notebook image with the AWS Neuron SDK for TensorFlow, built on the Jupyter base notebook with Python 3.10
+FROM jupyter/base-notebook:python-3.10
+
+# Maintainer label
+LABEL maintainer="DoEKS"
+
+# Suppress debconf prompts during the build only: the valid frontend name is "noninteractive",
+# and ARG (unlike ENV) does not persist into the environment of the final image.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Switch to root to add the Neuron apt repository and install system packages
+USER root
+
+# Install gnupg and the other prerequisites (wget fetches the repository key), then the Neuron system packages.
+# Version specs are quoted so the shell cannot glob-expand "2.*"; apt lists are removed to keep the layer small.
+RUN apt-get update -y && \
+    apt-get install -y gnupg git g++ wget && \
+    . /etc/os-release && \
+    echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/neuron.list && \
+    wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - && \
+    apt-get update -y && \
+    apt-get install -y "aws-neuronx-collectives=2.*" "aws-neuronx-runtime-lib=2.*" "aws-neuronx-tools=2.*" && \
+    rm -rf /var/lib/apt/lists/*
+
+# Switch back to jovyan user for Python package installations
+USER jovyan
+
+# Point pip at the Neuron repository, install the Python packages without keeping a pip cache, then register the Jupyter kernel
+RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com && \
+    pip install --no-cache-dir transformers-neuronx sentencepiece transformers wget awscli ipywidgets "neuronx-cc==2.*" tensorflow-neuronx ipykernel environment_kernels && \
+    python -m ipykernel install --user --name aws_neuron_venv_tensorflow --display-name "Python (tensorflow-neuronx)"
+
+# Add Neuron path to PATH
+ENV PATH=/opt/aws/neuron/bin:$PATH
diff --git a/ai-ml/jupyterhub/examples/notebook-examples/gpu-timeslice-test-tensorflow.ipynb b/ai-ml/jupyterhub/examples/notebook-examples/gpu-timeslice-test-tensorflow.ipynb
new file mode 100644
index 000000000..75b7e55e3
--- /dev/null
+++ b/ai-ml/jupyterhub/examples/notebook-examples/gpu-timeslice-test-tensorflow.ipynb
@@ -0,0 +1,333 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "851f73c7-068a-4e44-861b-511a3d2caf16",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Collecting tensorrt\n",
+ " Downloading tensorrt-8.6.1.post1.tar.gz (18 kB)\n",
+ " Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+ "\u001b[?25hBuilding wheels for collected packages: tensorrt\n",
+ " Building wheel for tensorrt (setup.py) ... \u001b[?25ldone\n",
+ "\u001b[?25h Created wheel for tensorrt: filename=tensorrt-8.6.1.post1-py2.py3-none-any.whl size=17281 sha256=055cb554c81337084ecbeece43281f85702addd8902e31b0004f43a2fb65f518\n",
+ " Stored in directory: /home/jovyan/.cache/pip/wheels/f4/c8/0e/b79b08e45752491b9acfdbd69e8a609e8b2ed7640dda5a3e59\n",
+ "Successfully built tensorrt\n",
+ "Installing collected packages: tensorrt\n",
+ "Successfully installed tensorrt-8.6.1.post1\n",
+ "Requirement already satisfied: matplotlib in /opt/conda/lib/python3.10/site-packages (3.7.1)\n",
+ "Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (3.0.9)\n",
+ "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (23.0)\n",
+ "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (1.0.7)\n",
+ "Requirement already satisfied: pillow>=6.2.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (9.4.0)\n",
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (1.4.4)\n",
+ "Requirement already satisfied: numpy>=1.20 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (1.23.5)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (4.39.3)\n",
+ "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n"
+ ]
+ }
+ ],
+ "source": [
+ "! python3 -m pip install --upgrade tensorrt\n",
+ "! pip3 install matplotlib"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "5c0d4d12-6c12-4cdc-bb79-6ad2a808e7a1",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "