
Commit

Works on local environment; still need to add NLB
natmhnty committed Jul 22, 2024
1 parent fe4d541 commit 99ea627
Showing 15 changed files with 194 additions and 75 deletions.
9 changes: 6 additions & 3 deletions manifests/modules/aiml/chatbot/.workshop/cleanup.sh
@@ -2,13 +2,16 @@

set -e

+# Delete the graviton managed node group
+delete-nodegroup graviton

logmessage "Deleting AIML resources..."

kubectl delete namespace aiml --ignore-not-found

+# Delete the llama2 namespace
+kubectl delete namespace llama2 --ignore-not-found

+# Delete the llama2 ingress
+kubectl delete ingress -n llama2 llama2 --ignore-not-found

logmessage "Deleting Karpenter NodePool and EC2NodeClass..."

delete-all-if-crd-exists nodepools.karpenter.sh
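The delete-nodegroup and delete-all-if-crd-exists commands are eks-workshop helper functions. Assuming the node group helper wraps eksctl (an assumption; the helper's definition is not part of this commit), the manual equivalent would be roughly:

# Hypothetical manual equivalent of the delete-nodegroup helper
eksctl delete nodegroup --cluster $EKS_CLUSTER_NAME --name graviton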
36 changes: 6 additions & 30 deletions manifests/modules/aiml/chatbot/.workshop/terraform/main.tf
@@ -12,7 +12,7 @@ provider "aws" {
alias = "virginia"
}

data "aws_region" "current" {}
#data "aws_region" "current" {}

data "aws_ecrpublic_authorization_token" "token" {
provider = aws.virginia
@@ -22,7 +22,9 @@ module "eks_blueprints_addons" {
source = "aws-ia/eks-blueprints-addons/aws"
version = "1.16.3"

-enable_karpenter = true
+enable_karpenter                    = true
+enable_aws_load_balancer_controller = true
+create_kubernetes_resources         = false

karpenter_enable_spot_termination = true
karpenter_enable_instance_profile_creation = true
@@ -35,6 +37,7 @@
cluster_endpoint = var.addon_context.aws_eks_cluster_endpoint
cluster_version = var.eks_cluster_version
oidc_provider_arn = var.addon_context.eks_oidc_provider_arn

}

data "aws_subnets" "private" {
@@ -56,33 +59,6 @@ resource "aws_s3_bucket" "chatbot" {
tags = var.tags
}

-#resource "aws_iam_role" "graviton_node" {
-#  name = "${var.addon_context.eks_cluster_id}-graviton-node"
-
-#  assume_role_policy = jsonencode({
-#    Version = "2012-10-17"
-#    Statement = [
-#      {
-#        Action = "sts:AssumeRole"
-#        Effect = "Allow"
-#        Sid    = ""
-#        Principal = {
-#          Service = "ec2.amazonaws.com"
-#        }
-#      },
-#    ]
-#  })
-
-#  managed_policy_arns = [
-#    "arn:${var.addon_context.aws_partition_id}:iam::aws:policy/AmazonEKS_CNI_Policy",
-#    "arn:${var.addon_context.aws_partition_id}:iam::aws:policy/AmazonEKSWorkerNodePolicy",
-#    "arn:${var.addon_context.aws_partition_id}:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly",
-#    "arn:${var.addon_context.aws_partition_id}:iam::aws:policy/AmazonSSMManagedInstanceCore"
-#  ]
-
-#  tags = var.tags
-#}

module "iam_assumable_role_chatbot" {
source = "terraform-aws-modules/iam/aws//modules/iam-assumable-role-with-oidc"
version = "5.39.1"
@@ -98,7 +74,7 @@
resource "aws_iam_policy" "chatbot" {
name = "${var.addon_context.eks_cluster_id}-chatbot"
path = "/"
-description = "IAM policy for the inferenct workload"
+description = "IAM policy for the chatbot workload"

policy = <<EOF
{
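Because create_kubernetes_resources = false, the blueprints module only provisions the IAM role for the AWS Load Balancer Controller; the chart itself must be installed separately. A minimal sketch of that step, assuming the standard eks-charts Helm repository and that EKS_CLUSTER_NAME, LBC_CHART_VERSION, and LBC_ROLE_ARN are exported (the actual install step is not shown in this commit):

helm repo add eks https://aws.github.io/eks-charts
helm upgrade --install aws-load-balancer-controller eks/aws-load-balancer-controller \
  --namespace kube-system \
  --version "$LBC_CHART_VERSION" \
  --set clusterName="$EKS_CLUSTER_NAME" \
  --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="$LBC_ROLE_ARN"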
19 changes: 5 additions & 14 deletions manifests/modules/aiml/chatbot/.workshop/terraform/outputs.tf
@@ -1,19 +1,10 @@
output "environment_variables" {
description = "Environment variables to be added to the IDE shell"
value = {
AIML_NEURON_ROLE_ARN = module.iam_assumable_role_chatbot.iam_role_arn
AIML_NEURON_BUCKET_NAME = resource.aws_s3_bucket.chatbot.id
-AIML_DL_IMAGE           = "763104351884.dkr.ecr.${data.aws_region.current.name}.amazonaws.com/pytorch-chatbot-neuron:1.13.1-neuron-py310-sdk2.12.0-ubuntu20.04"
-AIML_SUBNETS            = "${data.aws_subnets.private.ids[0]},${data.aws_subnets.private.ids[1]},${data.aws_subnets.private.ids[2]}"
-KARPENTER_NODE_ROLE     = module.eks_blueprints_addons.karpenter.node_iam_role_name
-KARPENTER_ARN           = module.eks_blueprints_addons.karpenter.node_iam_role_arn
+AIML_SUBNETS        = "${data.aws_subnets.private.ids[0]},${data.aws_subnets.private.ids[1]},${data.aws_subnets.private.ids[2]}"
+KARPENTER_NODE_ROLE = module.eks_blueprints_addons.karpenter.node_iam_role_name
+KARPENTER_ARN       = module.eks_blueprints_addons.karpenter.node_iam_role_arn
+LBC_CHART_VERSION   = var.load_balancer_controller_chart_version
+LBC_ROLE_ARN        = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn
}
}

-#output "subnet_details" {
-#  value = merge({
-#    GRAVITON_NODE_ROLE = aws_iam_role.graviton_node.arn
-#  }, {
-#    for index, id in data.aws_subnets.private.ids : "PRIMARY_SUBNET_${index + 1}" => id
-#  })
-#}
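These outputs are surfaced as shell variables in the workshop IDE. The export mechanism is not shown in this commit; a hypothetical manual equivalent using terraform and jq:

# Hypothetical: export every key of the environment_variables output into the current shell
eval "$(terraform output -json environment_variables \
  | jq -r 'to_entries[] | "export \(.key)=\"\(.value)\""')"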
8 changes: 8 additions & 0 deletions manifests/modules/aiml/chatbot/.workshop/terraform/vars.tf
@@ -33,3 +33,11 @@ variable "resources_precreated" {
description = "Have expensive resources been created already"
type = bool
}

+# tflint-ignore: terraform_unused_declarations
+variable "load_balancer_controller_chart_version" {
+  description = "The chart version of aws-load-balancer-controller to use"
+  type        = string
+  # renovate-helm: depName=aws-load-balancer-controller
+  default     = "1.8.1"
+}
5 changes: 0 additions & 5 deletions manifests/modules/aiml/chatbot/base/kustomization.yaml

This file was deleted.

4 changes: 0 additions & 4 deletions manifests/modules/aiml/chatbot/base/namespace.yaml

This file was deleted.

7 changes: 0 additions & 7 deletions manifests/modules/aiml/chatbot/base/serviceaccount.yaml

This file was deleted.

59 changes: 59 additions & 0 deletions manifests/modules/aiml/chatbot/k8s-neuron-device-plugin-rbac.yaml
@@ -0,0 +1,59 @@
# rbac.yaml
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: neuron-device-plugin
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - create
      - patch
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - update
      - patch
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - nodes/status
    verbs:
      - patch
      - update
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: neuron-device-plugin
  namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: neuron-device-plugin
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: neuron-device-plugin
subjects:
  - kind: ServiceAccount
    name: neuron-device-plugin
    namespace: kube-system
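The ClusterRole grants the plugin read access to nodes and pods, patch/update on node status, and event creation. After applying, the grants can be sanity-checked with standard kubectl impersonation:

kubectl auth can-i patch nodes/status \
  --as=system:serviceaccount:kube-system:neuron-device-plugin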
95 changes: 95 additions & 0 deletions manifests/modules/aiml/chatbot/k8s-neuron-device-plugin.yaml
@@ -0,0 +1,95 @@
# https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: neuron-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: neuron-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      # Uncomment the annotation below if k8s version is 1.13 or lower
      # annotations:
      #   scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        name: neuron-device-plugin-ds
    spec:
      serviceAccount: neuron-device-plugin
      tolerations:
        - key: CriticalAddonsOnly
          operator: Exists
        - key: aws.amazon.com/neuron
          operator: Exists
          effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              # Uncomment following matchExpressions if using k8s 1.16 or lower
              #- matchExpressions:
              #    - key: "beta.kubernetes.io/instance-type"
              #      operator: In
              #      values:
              #        - inf1.xlarge
              #        - inf1.2xlarge
              #        - inf1.6xlarge
              #        - inf1.24xlarge
              #        - inf2.xlarge
              #        - inf2.8xlarge
              #        - inf2.24xlarge
              #        - inf2.48xlarge
              #        - trn1.2xlarge
              #        - trn1.32xlarge
              #        - trn1n.32xlarge
              - matchExpressions:
                  - key: "node.kubernetes.io/instance-type"
                    operator: In
                    values:
                      - inf1.xlarge
                      - inf1.2xlarge
                      - inf1.6xlarge
                      - inf1.24xlarge
                      - inf2.xlarge
                      - inf2.8xlarge
                      - inf2.24xlarge
                      - inf2.48xlarge
                      - trn1.2xlarge
                      - trn1.32xlarge
                      - trn1n.32xlarge
      containers:
        # Find all neuron-device-plugin images at https://gallery.ecr.aws/neuron/neuron-device-plugin
        - image: public.ecr.aws/neuron/neuron-device-plugin:2.19.16.0
          imagePullPolicy: Always
          name: neuron-device-plugin
          env:
            - name: KUBECONFIG
              value: /etc/kubernetes/kubelet.conf
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: infa-map
              mountPath: /run
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: infa-map
          hostPath:
            path: /run
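Once the DaemonSet is running on an Inferentia/Trainium node, the plugin advertises the aws.amazon.com/neuron extended resource, which pods request like CPU or memory. A minimal smoke-test sketch (the pod name and image are hypothetical, not part of this commit):

kubectl apply -f manifests/modules/aiml/chatbot/k8s-neuron-device-plugin-rbac.yaml
kubectl apply -f manifests/modules/aiml/chatbot/k8s-neuron-device-plugin.yaml

# Hypothetical pod that claims one Neuron device
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: Pod
metadata:
  name: neuron-smoke-test
spec:
  tolerations:
    - key: aws.amazon.com/neuron
      operator: Exists
      effect: NoSchedule
  containers:
    - name: app
      image: public.ecr.aws/docker/library/busybox:latest
      command: ["sleep", "3600"]
      resources:
        limits:
          aws.amazon.com/neuron: 1
EOF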
10 changes: 5 additions & 5 deletions manifests/modules/aiml/chatbot/nodepool/nodepool-inf2.yaml
@@ -104,9 +104,9 @@ spec:
# limits:
# cpu: "10000"
disruption:
-consolidationPolicy: WhenUnderutilized
+#consolidationPolicy: WhenUnderutilized
consolidateAfter: 300s
-#consolidationPolicy: WhenEmpty
+consolidationPolicy: WhenEmpty
expireAfter: 720h # 30 * 24h = 720h

---
@@ -123,13 +123,13 @@ spec:
encrypted: true
volumeSize: 500Gi #originally 100Gi
volumeType: gp3
-role: karpenter-eks-workshop-20240719154052842800000003
+role: ${KARPENTER_NODE_ROLE}
securityGroupSelectorTerms:
- tags:
-karpenter.sh/discovery: eks-workshop
+karpenter.sh/discovery: ${EKS_CLUSTER_NAME}
subnetSelectorTerms:
- tags:
-karpenter.sh/discovery: eks-workshop
+karpenter.sh/discovery: ${EKS_CLUSTER_NAME}
tags:
app.kubernetes.io/created-by: eks-workshop

10 changes: 5 additions & 5 deletions manifests/modules/aiml/chatbot/nodepool/nodepool-x86.yaml
@@ -82,9 +82,9 @@ spec:
limits:
cpu: "10000"
disruption:
-consolidationPolicy: WhenUnderutilized
+#consolidationPolicy: WhenUnderutilized
consolidateAfter: 300s
-#consolidationPolicy: WhenEmpty
+consolidationPolicy: WhenEmpty
expireAfter: 720h # 30 * 24h = 720h

---
@@ -102,12 +102,12 @@
volumeSize: 200Gi #originally 100Gi
volumeType: gp3
detailedMonitoring: true
-role: karpenter-eks-workshop-20240719154052842800000003
+role: ${KARPENTER_NODE_ROLE}
securityGroupSelectorTerms:
- tags:
-karpenter.sh/discovery: eks-workshop
+karpenter.sh/discovery: ${EKS_CLUSTER_NAME}
subnetSelectorTerms:
- tags:
-karpenter.sh/discovery: eks-workshop
+karpenter.sh/discovery: ${EKS_CLUSTER_NAME}
tags:
app.kubernetes.io/created-by: eks-workshop
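Both NodePool/EC2NodeClass manifests now reference environment variables instead of the hard-coded role and discovery tags from a single cluster. A sketch of applying them, assuming KARPENTER_NODE_ROLE and EKS_CLUSTER_NAME are exported (the workshop tooling normally performs this substitution itself):

envsubst < manifests/modules/aiml/chatbot/nodepool/nodepool-inf2.yaml | kubectl apply -f -
envsubst < manifests/modules/aiml/chatbot/nodepool/nodepool-x86.yaml | kubectl apply -f -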
File renamed without changes.
@@ -138,9 +138,12 @@ metadata:
name: llama2
namespace: llama2
annotations:
-nginx.ingress.kubernetes.io/rewrite-target: "/$1"
+#nginx.ingress.kubernetes.io/rewrite-target: "/$1"
+alb.ingress.kubernetes.io/scheme: internet-facing
+alb.ingress.kubernetes.io/target-type: ip
+alb.ingress.kubernetes.io/healthcheck-path: /actuator/health/liveness
spec:
-ingressClassName: nginx
+ingressClassName: alb
rules:
- http:
paths:
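With ingressClassName: alb, the AWS Load Balancer Controller provisions an internet-facing ALB for this Ingress in place of the previous NGINX setup. Once the controller reconciles the resource, the load balancer's DNS name can be read from the Ingress status:

kubectl get ingress -n llama2 llama2 \
  -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'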
