feat: Jupyterhub blueprint upgrade (awslabs#554)

Signed-off-by: Vara Bonthu <[email protected]>
ovaleanu · Aug 10, 2024 · c2091dd
1 parent 6021a28
commit c2091dd
Show file tree

Hide file tree

Showing 25 changed files with 563 additions and 594 deletions.
diff --git a/ai-ml/jupyterhub/addons.tf b/ai-ml/jupyterhub/addons.tf
diff --git a/ai-ml/jupyterhub/examples/test-pods/timeslicing-test.yaml b/ai-ml/jupyterhub/examples/test-pods/timeslicing-test.yaml
@@ -15,7 +15,7 @@ spec:
         app: time-slicing-verification
     spec:
       nodeSelector:
-        provisioner: gpu-ts
+        NodePool: gpu-ts
       tolerations:
         - key: nvidia.com/gpu
           operator: Exists

diff --git a/ai-ml/jupyterhub/examples/test-pods/verify-gpu-access.yaml b/ai-ml/jupyterhub/examples/test-pods/verify-gpu-access.yaml
@@ -14,7 +14,7 @@ spec:
         app: verify-gpu
     spec:
       nodeSelector:
-        karpenter.sh/provisioner-name: gpu   # Force schedule a node with time slicing support
+        NodePool: gpu-mig   # Force scheduling onto a node from the gpu-mig NodePool
       tolerations:   # To tolerate the taint on the nodes
       - key: "nvidia.com/gpu"
         operator: "Exists"

diff --git a/ai-ml/jupyterhub/helm/coredns-autoscaler/values.yaml b/ai-ml/jupyterhub/helm/coredns-autoscaler/values.yaml
diff --git a/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-cognito.yaml b/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-cognito.yaml
@@ -123,6 +123,7 @@ singleuser:
       kubespawner_override:
         node_selector:
           NodePool: trainium
+          hub.jupyter.org/node-purpose: user
         tolerations:
           - key: aws.amazon.com/neuroncore
             operator: Exists
@@ -186,6 +187,7 @@ singleuser:
         image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only
         node_selector:
           NodePool: gpu-ts # TIME-SLICING: Use this config with time-slicing mode
+          hub.jupyter.org/node-purpose: user
         tolerations:
           - key: "nvidia.com/gpu"
             operator: "Exists"
@@ -210,6 +212,7 @@ singleuser:
         node_selector:
           provisioner: cluster-autoscaler
           node.kubernetes.io/instance-type: p4d.24xlarge
+          hub.jupyter.org/node-purpose: user
         tolerations:
           - key: "nvidia.com/gpu"
             operator: "Exists"
@@ -232,8 +235,8 @@ singleuser:
       kubespawner_override:
         image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only
         node_selector:
-          node.kubernetes.io/instance-type: p4d.24xlarge
-          NodePool: gpu
+          NodePool: gpu-mig
+          hub.jupyter.org/node-purpose: user
         tolerations:
           - key: "nvidia.com/gpu"
             operator: "Exists"

diff --git a/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-dummy.yaml b/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-dummy.yaml
@@ -67,6 +67,7 @@ singleuser:
       kubespawner_override:
         node_selector:
           NodePool: trainium
+          hub.jupyter.org/node-purpose: user
         tolerations:
           - key: aws.amazon.com/neuroncore
             operator: Exists
@@ -127,9 +128,10 @@ singleuser:
       description: "GPU Time-Slicing with Single GPU VMs (G5 2x, 4x, 8x, 16x) | nvidia.com/gpu: 1 | Karpenter AutoScaling"
       kubespawner_override:
         # namespace: data-team-a
-        image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only
+        image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only
         node_selector:
           NodePool: gpu-ts # TIME-SLICING: Use this config with time-slicing mode
+          hub.jupyter.org/node-purpose: user
         tolerations:
           - key: "nvidia.com/gpu"
             operator: "Exists"
@@ -150,10 +152,11 @@ singleuser:
     - display_name: Data Science (GPU + MIG on P4d.24xlarge)
       description: "GPU MIG with P4d instances | nvidia.com/mig-1g.5gb: 1 | Cluster Autoscaler"
       kubespawner_override:
-        image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only
+        image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only
         node_selector:
           provisioner: cluster-autoscaler
           node.kubernetes.io/instance-type: p4d.24xlarge
+          hub.jupyter.org/node-purpose: user
         tolerations:
           - key: "nvidia.com/gpu"
             operator: "Exists"
@@ -174,10 +177,10 @@ singleuser:
     - display_name: Data Science (GPU - P4d.24xlarge)
       description: "GPU with P4d instances | Karpenter Autoscaler"
       kubespawner_override:
-        image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only
+        image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only
         node_selector:
-          node.kubernetes.io/instance-type: p4d.24xlarge
-          NodePool: gpu
+          NodePool: gpu-mig
+          hub.jupyter.org/node-purpose: user
         tolerations:
           - key: "nvidia.com/gpu"
             operator: "Exists"

diff --git a/ai-ml/jupyterhub/helm/karpenter-resources/Chart.yaml b/ai-ml/jupyterhub/helm/karpenter-resources/Chart.yaml
diff --git a/ai-ml/jupyterhub/helm/karpenter-resources/templates/node-class.yaml b/ai-ml/jupyterhub/helm/karpenter-resources/templates/node-class.yaml
diff --git a/ai-ml/jupyterhub/helm/karpenter-resources/templates/node-pool.yaml b/ai-ml/jupyterhub/helm/karpenter-resources/templates/node-pool.yaml
diff --git a/ai-ml/jupyterhub/helm/karpenter-resources/values.yaml b/ai-ml/jupyterhub/helm/karpenter-resources/values.yaml