diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 48d04253a..aba839fcc 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: trailing-whitespace args: ['--markdown-linebreak-ext=md'] @@ -10,7 +10,7 @@ repos: - id: detect-aws-credentials args: ['--allow-missing-credentials'] - repo: https://github.com/antonbabenko/pre-commit-terraform - rev: v1.81.0 + rev: v1.83.5 hooks: - id: terraform_fmt - id: terraform_docs diff --git a/README.md b/README.md index 34e72367b..e4b06c80b 100755 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ ![Data on EKS](website/static/img/doeks-logo-green.png) # [Data on Amazon EKS (DoEKS)](https://awslabs.github.io/data-on-eks/) +(pronounced Do.eks) + [![plan-examples](https://github.com/awslabs/data-on-eks/actions/workflows/plan-examples.yml/badge.svg?branch=main)](https://github.com/awslabs/data-on-eks/actions/workflows/plan-examples.yml) diff --git a/ai-ml/emr-spark-rapids/eks.tf b/ai-ml/emr-spark-rapids/eks.tf index 02c60f12d..e1cf92e39 100644 --- a/ai-ml/emr-spark-rapids/eks.tf +++ b/ai-ml/emr-spark-rapids/eks.tf @@ -9,6 +9,7 @@ module "eks" { cluster_name = local.name cluster_version = var.eks_cluster_version + #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing. cluster_endpoint_public_access = true # if true, Your cluster API server is accessible from the internet. You can, optionally, limit the CIDR blocks that can access the public endpoint. vpc_id = module.vpc.vpc_id diff --git a/ai-ml/jark-stack/terraform/README.md b/ai-ml/jark-stack/terraform/README.md new file mode 100644 index 000000000..e7567f85a --- /dev/null +++ b/ai-ml/jark-stack/terraform/README.md @@ -0,0 +1,62 @@ +# JupyterHub, Argo, Ray, Kubernetes + +Docs coming soon...
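In the meantime, a minimal quick-start sketch based on the helper scripts added in this directory (region and cluster name assume the defaults in `variables.tf`; adjust as needed):

```sh
# Provision the VPC, EKS cluster, and add-ons (the script prompts for an AWS region)
cd ai-ml/jark-stack/terraform
./install.sh

# Configure kubectl from the Terraform output (defaults: region us-west-2, cluster name jark-stack)
aws eks --region us-west-2 update-kubeconfig --name jark-stack
kubectl get nodes

# Tear everything down when finished
./cleanup.sh
```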
+ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.0.0 | +| [aws](#requirement\_aws) | >= 3.72 | +| [helm](#requirement\_helm) | >= 2.4.1 | +| [http](#requirement\_http) | >= 3.3 | +| [kubectl](#requirement\_kubectl) | >= 1.14 | +| [kubernetes](#requirement\_kubernetes) | >= 2.10 | +| [random](#requirement\_random) | >= 3.1 | + +## Providers + +| Name | Version | +|------|---------| +| [aws](#provider\_aws) | >= 3.72 | +| [kubernetes](#provider\_kubernetes) | >= 2.10 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [data\_addons](#module\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.1 | +| [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.20 | +| [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 | +| [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.2 | +| [vpc](#module\_vpc) | terraform-aws-modules/vpc/aws | ~> 5.0 | + +## Resources + +| Name | Type | +|------|------| +| [kubernetes_annotations.disable_gp2](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/annotations) | resource | +| [kubernetes_config_map_v1.notebook](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map_v1) | resource | +| [kubernetes_namespace_v1.jupyterhub](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource | +| [kubernetes_secret_v1.huggingface_token](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource | +| [kubernetes_storage_class.default_gp3](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/storage_class) | resource | +| [aws_eks_cluster_auth.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster_auth) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.27"` | no | +| [huggingface\_token](#input\_huggingface\_token) | Hugging Face Secret Token | `string` | `"DUMMY_TOKEN_REPLACE_ME"` | no | +| [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"jark-stack"` | no | +| [region](#input\_region) | region | `string` | `"us-west-2"` | no | +| [secondary\_cidr\_blocks](#input\_secondary\_cidr\_blocks) | Secondary CIDR blocks to be attached to VPC | `list(string)` |
<pre>[<br>  "100.64.0.0/16"<br>]</pre>
| no | +| [vpc\_cidr](#input\_vpc\_cidr) | VPC CIDR. This should be a valid private (RFC 1918) CIDR range | `string` | `"10.1.0.0/21"` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [configure\_kubectl](#output\_configure\_kubectl) | Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig | + diff --git a/ai-ml/jark-stack/terraform/addons.tf b/ai-ml/jark-stack/terraform/addons.tf new file mode 100644 index 000000000..3f2a91a2e --- /dev/null +++ b/ai-ml/jark-stack/terraform/addons.tf @@ -0,0 +1,186 @@ +#--------------------------------------------------------------- +# GP3 Encrypted Storage Class +#--------------------------------------------------------------- +resource "kubernetes_annotations" "disable_gp2" { + annotations = { + "storageclass.kubernetes.io/is-default-class" : "false" + } + api_version = "storage.k8s.io/v1" + kind = "StorageClass" + metadata { + name = "gp2" + } + force = true + + depends_on = [module.eks.eks_cluster_id] +} + +resource "kubernetes_storage_class" "default_gp3" { + metadata { + name = "gp3" + annotations = { + "storageclass.kubernetes.io/is-default-class" : "true" + } + } + + storage_provisioner = "ebs.csi.aws.com" + reclaim_policy = "Delete" + allow_volume_expansion = true + volume_binding_mode = "WaitForFirstConsumer" + parameters = { + fsType = "ext4" + encrypted = true + type = "gp3" + } + + depends_on = [kubernetes_annotations.disable_gp2] +} + +#--------------------------------------------------------------- +# IRSA for EBS CSI Driver +#--------------------------------------------------------------- +module "ebs_csi_driver_irsa" { + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.20" + role_name_prefix = format("%s-%s-", local.name, "ebs-csi-driver") + attach_ebs_csi_policy = true + oidc_providers = { + main = { + provider_arn = module.eks.oidc_provider_arn + namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] + } + } + tags = local.tags +} + +#--------------------------------------------------------------- +# EKS Blueprints Addons +#--------------------------------------------------------------- +module "eks_blueprints_addons" { + source = "aws-ia/eks-blueprints-addons/aws" + version = "~> 1.2" + + cluster_name = module.eks.cluster_name + cluster_endpoint = module.eks.cluster_endpoint + cluster_version = module.eks.cluster_version + oidc_provider_arn = module.eks.oidc_provider_arn + + #--------------------------------------- + # Amazon EKS Managed Add-ons + #--------------------------------------- + eks_addons = { + aws-ebs-csi-driver = { + service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn + } + coredns = { + preserve = true + } + kube-proxy = { + preserve = true + } + # VPC CNI uses worker node IAM role policies + vpc-cni = { + preserve = true + } + } + + #--------------------------------------- + # AWS Load Balancer Controller Add-on + #--------------------------------------- + enable_aws_load_balancer_controller = true + # turn off the mutating webhook for services because we are using + # service.beta.kubernetes.io/aws-load-balancer-type: external + aws_load_balancer_controller = { + set = [{ + name = "enableServiceMutatorWebhook" + value = "false" + }] + } + + #--------------------------------------- + # Ingress Nginx Add-on + #--------------------------------------- + enable_ingress_nginx = true + ingress_nginx = { + values = 
[templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})] + } + + helm_releases = { + #--------------------------------------- + # NVIDIA Device Plugin Add-on + #--------------------------------------- + nvidia-device-plugin = { + description = "A Helm chart for NVIDIA Device Plugin" + namespace = "nvidia-device-plugin" + create_namespace = true + chart = "nvidia-device-plugin" + chart_version = "0.14.0" + repository = "https://nvidia.github.io/k8s-device-plugin" + values = [file("${path.module}/helm-values/nvidia-values.yaml")] + } + } +} + +#--------------------------------------------------------------- +# Data on EKS Kubernetes Addons +#--------------------------------------------------------------- +module "data_addons" { + source = "aws-ia/eks-data-addons/aws" + version = "~> 1.1" # ensure to update this to the latest/desired version + + oidc_provider_arn = module.eks.oidc_provider_arn + + #--------------------------------------------------------------- + # JupyterHub Add-on + #--------------------------------------------------------------- + enable_jupyterhub = true + jupyterhub_helm_config = { + namespace = kubernetes_namespace_v1.jupyterhub.id + create_namespace = false + values = [file("${path.module}/helm-values/jupyterhub-values.yaml")] + } + + #--------------------------------------------------------------- + # KubeRay Operator Add-on + #--------------------------------------------------------------- + enable_kuberay_operator = true + + depends_on = [ + kubernetes_secret_v1.huggingface_token, + kubernetes_config_map_v1.notebook + ] +} + + +#--------------------------------------------------------------- +# Additional Resources +#--------------------------------------------------------------- + +resource "kubernetes_namespace_v1" "jupyterhub" { + metadata { + name = "jupyterhub" + } +} + + +resource "kubernetes_secret_v1" "huggingface_token" { + metadata { + name = "hf-token" + namespace = kubernetes_namespace_v1.jupyterhub.id + } + + data = { + token = var.huggingface_token + } +} + +resource "kubernetes_config_map_v1" "notebook" { + metadata { + name = "notebook" + namespace = kubernetes_namespace_v1.jupyterhub.id + } + + data = { + "dogbooth.ipynb" = file("${path.module}/src/notebook/dogbooth.ipynb") + } +} diff --git a/ai-ml/jark-stack/terraform/cleanup.sh b/ai-ml/jark-stack/terraform/cleanup.sh new file mode 100755 index 000000000..797c2de67 --- /dev/null +++ b/ai-ml/jark-stack/terraform/cleanup.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +read -p "Enter the region: " region +export AWS_DEFAULT_REGION=$region + +echo "Destroying RayService..." + +# Delete the Ingress/SVC before removing the addons +TMPFILE=$(mktemp) +terraform -chdir=$SCRIPTDIR output -raw configure_kubectl > "$TMPFILE" +# check if TMPFILE contains the string "No outputs found" +if [[ ! $(cat $TMPFILE) == *"No outputs found"* ]]; then + echo "No outputs found, skipping kubectl delete" + source "$TMPFILE" + kubectl delete -f src/service/ray-service.yaml +fi + + +# List of Terraform modules to apply in sequence +targets=( + "module.data_addons" + "module.eks_blueprints_addons" + "module.eks" + "module.vpc" +) + +# Destroy modules in sequence +for target in "${targets[@]}" +do + echo "Destroying module $target..." 
+ destroy_output=$(terraform destroy -target="$target" -var="region=$region" -auto-approve 2>&1 | tee /dev/tty) + if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete"* ]]; then + echo "SUCCESS: Terraform destroy of $target completed successfully" + else + echo "FAILED: Terraform destroy of $target failed" + exit 1 + fi +done + +echo "Destroying Load Balancers..." + +for arn in $(aws resourcegroupstaggingapi get-resources \ + --resource-type-filters elasticloadbalancing:loadbalancer \ + --tag-filters "Key=elbv2.k8s.aws/cluster,Values=jark-stack" \ + --query 'ResourceTagMappingList[].ResourceARN' \ + --output text); do \ + aws elbv2 delete-load-balancer --load-balancer-arn "$arn"; \ + done + +echo "Destroying Target Groups..." +for arn in $(aws resourcegroupstaggingapi get-resources \ + --resource-type-filters elasticloadbalancing:targetgroup \ + --tag-filters "Key=elbv2.k8s.aws/cluster,Values=jark-stack" \ + --query 'ResourceTagMappingList[].ResourceARN' \ + --output text); do \ + aws elbv2 delete-target-group --target-group-arn "$arn"; \ + done + +echo "Destroying Security Groups..." +for sg in $(aws ec2 describe-security-groups \ + --filters "Name=tag:elbv2.k8s.aws/cluster,Values=jark-stack" \ + --query 'SecurityGroups[].GroupId' --output text); do \ + aws ec2 delete-security-group --group-id "$sg"; \ + done + +## Final destroy to catch any remaining resources +echo "Destroying remaining resources..." +destroy_output=$(terraform destroy -var="region=$region"-auto-approve 2>&1 | tee /dev/tty) +if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete"* ]]; then + echo "SUCCESS: Terraform destroy of all modules completed successfully" +else + echo "FAILED: Terraform destroy of all modules failed" + exit 1 +fi diff --git a/ai-ml/jark-stack/terraform/eks.tf b/ai-ml/jark-stack/terraform/eks.tf new file mode 100644 index 000000000..04c7fd409 --- /dev/null +++ b/ai-ml/jark-stack/terraform/eks.tf @@ -0,0 +1,151 @@ +#--------------------------------------------------------------- +# EKS Cluster +#--------------------------------------------------------------- +module "eks" { + source = "terraform-aws-modules/eks/aws" + version = "~> 19.15" + + cluster_name = local.name + cluster_version = var.eks_cluster_version + + # if true, Your cluster API server is accessible from the internet. + # You can, optionally, limit the CIDR blocks that can access the public endpoint. + #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing. + cluster_endpoint_public_access = true + + vpc_id = module.vpc.vpc_id + # Filtering only Secondary CIDR private subnets starting with "100.". + # Subnet IDs where the EKS Control Plane ENIs will be created + subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : + substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null]) + + manage_aws_auth_configmap = true + aws_auth_roles = [ + # We need to add in the Karpenter node IAM role for nodes launched by Karpenter + { + rolearn = module.eks_blueprints_addons.karpenter.node_iam_role_arn + username = "system:node:{{EC2PrivateDNSName}}" + groups = [ + "system:bootstrappers", + "system:nodes", + ] + } + ] + #--------------------------------------- + # Note: This can further restricted to specific required for each Add-on and your application + #--------------------------------------- + # Extend cluster security group rules + cluster_security_group_additional_rules = { + ingress_nodes_ephemeral_ports_tcp = { + description = "Nodes on ephemeral ports" + protocol = "tcp" + from_port = 0 + to_port = 65535 + type = "ingress" + source_node_security_group = true + } + } + + node_security_group_additional_rules = { + # Allows Control Plane Nodes to talk to Worker nodes on all ports. + # Added this to simplify the example and further avoid issues with Add-ons communication with Control plane. + # This can be restricted further to specific port based on the requirement for each Add-on + # e.g., coreDNS 53, metrics-server 4443. + # Update this according to your security requirements if needed + ingress_cluster_to_node_all_traffic = { + description = "Cluster API to Nodegroup all traffic" + protocol = "-1" + from_port = 0 + to_port = 0 + type = "ingress" + source_cluster_security_group = true + } + } + + eks_managed_node_group_defaults = { + iam_role_additional_policies = { + # Not required, but used in the example to access the nodes to inspect mounted volumes + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + + ebs_optimized = true + # This block device is used only for root volume. Adjust volume according to your size. + # NOTE: Don't use this volume for ML workloads + block_device_mappings = { + xvda = { + device_name = "/dev/xvda" + ebs = { + volume_size = 100 + volume_type = "gp3" + } + } + } + } + + eks_managed_node_groups = { + # It's recommended to have a Managed Node group for hosting critical add-ons + # It's recommeded to use Karpenter to place your workloads instead of using Managed Node groups + # You can leverage nodeSelector and Taints/tolerations to distribute workloads across Managed Node group or Karpenter nodes. + core_node_group = { + name = "core-node-group" + description = "EKS Core node group for hosting system add-ons" + # Filtering only Secondary CIDR private subnets starting with "100.". + # Subnet IDs where the nodes/node groups will be provisioned + subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : + substr(cidr_block, 0, 4) == "100." ? subnet_id : null] + ) + + # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2/recommended/image_id --region us-west-2 + ami_type = "AL2_x86_64" # Use this for Graviton AL2_ARM_64 + min_size = 2 + max_size = 8 + desired_size = 2 + + instance_types = ["m5.xlarge"] + + labels = { + WorkerType = "ON_DEMAND" + NodeGroupType = "core" + } + + tags = merge(local.tags, { + Name = "core-node-grp" + }) + } + + # GPU Nodegroup for JupyterHub Notebook and Ray Service + gpu1 = { + name = "gpu-node-grp" + description = "EKS Node Group to run GPU workloads" + # Filtering only Secondary CIDR private subnets starting with "100.". 
+ # Subnet IDs where the nodes/node groups will be provisioned + subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : + substr(cidr_block, 0, 4) == "100." ? subnet_id : null] + ) + + ami_type = "AL2_x86_64_GPU" + min_size = 1 + max_size = 1 + desired_size = 1 + + instance_types = ["g5.12xlarge"] + + labels = { + WorkerType = "ON_DEMAND" + NodeGroupType = "gpu" + } + + taints = { + gpu = { + key = "nvidia.com/gpu" + effect = "NO_SCHEDULE" + operator = "EXISTS" + } + } + + tags = merge(local.tags, { + Name = "gpu-node-grp" + }) + } + } +} diff --git a/ai-ml/jark-stack/terraform/helm-values/ingress-nginx-values.yaml b/ai-ml/jark-stack/terraform/helm-values/ingress-nginx-values.yaml new file mode 100644 index 000000000..c8b1a5d74 --- /dev/null +++ b/ai-ml/jark-stack/terraform/helm-values/ingress-nginx-values.yaml @@ -0,0 +1,11 @@ +controller: + service: + externalTrafficPolicy: "Local" + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Private Load Balancer can only be accessed within the VPC + targetPorts: + http: http + https: http diff --git a/ai-ml/jark-stack/terraform/helm-values/jupyterhub-values.yaml b/ai-ml/jark-stack/terraform/helm-values/jupyterhub-values.yaml new file mode 100644 index 000000000..fcad06b62 --- /dev/null +++ b/ai-ml/jark-stack/terraform/helm-values/jupyterhub-values.yaml @@ -0,0 +1,59 @@ +hub: + config: + Authenticator: + admin_users: + - admin1 + allowed_users: + - user1 + # testing only - do not do this for production + DummyAuthenticator: + password: never-do-this + JupyterHub: + authenticator_class: dummy +proxy: + service: + annotations: + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Private Load Balancer can only be accessed within the VPC + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true' + service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4 +singleuser: + image: + name: public.ecr.aws/h3o5n2r0/gpu-jupyter + tag: v1.5_cuda-11.6_ubuntu-20.04_python-only + pullPolicy: Always + cmd: null + startTimeout: 600 + memory: + guarantee: 24G + extraResource: + limits: + nvidia.com/gpu: "1" + extraEnv: + HUGGING_FACE_HUB_TOKEN: + valueFrom: + secretKeyRef: + name: hf-token + key: token + storage: + capacity: 100Gi + extraVolumes: + - name: shm-volume + emptyDir: + medium: Memory + - name: notebook + configMap: + name: notebook + extraVolumeMounts: + - name: shm-volume + mountPath: /dev/shm + - name: notebook + mountPath: /home/jovyan/dogbooth + extraTolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule +scheduling: + userScheduler: + enabled: false diff --git a/ai-ml/jark-stack/terraform/helm-values/nvidia-values.yaml b/ai-ml/jark-stack/terraform/helm-values/nvidia-values.yaml new file mode 100644 index 000000000..9fa59599e --- /dev/null +++ b/ai-ml/jark-stack/terraform/helm-values/nvidia-values.yaml @@ -0,0 +1,10 @@ +gfd: + enabled: true +nfd: + enabled: true + worker: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - operator: "Exists" diff --git a/ai-ml/jark-stack/terraform/install.sh 
b/ai-ml/jark-stack/terraform/install.sh new file mode 100755 index 000000000..18f2a94d3 --- /dev/null +++ b/ai-ml/jark-stack/terraform/install.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +read -p "Enter the region: " region +export AWS_DEFAULT_REGION=$region + +# List of Terraform modules to apply in sequence +targets=( + "module.vpc" + "module.eks" +) + +# Initialize Terraform +terraform init -upgrade + +# Apply modules in sequence +for target in "${targets[@]}" +do + echo "Applying module $target..." + apply_output=$(terraform apply -target="$target" -var="region=$region" -auto-approve 2>&1 | tee /dev/tty) + if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then + echo "SUCCESS: Terraform apply of $target completed successfully" + else + echo "FAILED: Terraform apply of $target failed" + exit 1 + fi +done + +# Final apply to catch any remaining resources +echo "Applying remaining resources..." +apply_output=$(terraform apply -var="region=$region" -auto-approve 2>&1 | tee /dev/tty) +if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then + echo "SUCCESS: Terraform apply of all modules completed successfully" +else + echo "FAILED: Terraform apply of all modules failed" + exit 1 +fi diff --git a/ai-ml/jark-stack/terraform/main.tf b/ai-ml/jark-stack/terraform/main.tf new file mode 100644 index 000000000..bbbb966cd --- /dev/null +++ b/ai-ml/jark-stack/terraform/main.tf @@ -0,0 +1,38 @@ +provider "aws" { + region = local.region +} + +provider "kubernetes" { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + token = data.aws_eks_cluster_auth.this.token +} + +provider "helm" { + kubernetes { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + token = data.aws_eks_cluster_auth.this.token + } +} +provider "kubectl" { + apply_retry_count = 30 + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + token = data.aws_eks_cluster_auth.this.token + load_config_file = false +} + +data "aws_eks_cluster_auth" "this" { + name = module.eks.cluster_name +} + +locals { + name = var.name + region = var.region + azs = ["${local.region}c", "${local.region}d"] + tags = { + Blueprint = local.name + GithubRepo = "github.com/awslabs/data-on-eks" + } +} diff --git a/ai-ml/jark-stack/terraform/outputs.tf b/ai-ml/jark-stack/terraform/outputs.tf new file mode 100644 index 000000000..f6444daab --- /dev/null +++ b/ai-ml/jark-stack/terraform/outputs.tf @@ -0,0 +1,4 @@ +output "configure_kubectl" { + description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" + value = "aws eks --region ${var.region} update-kubeconfig --name ${var.name}" +} diff --git a/ai-ml/jark-stack/terraform/src/app/Dockerfile b/ai-ml/jark-stack/terraform/src/app/Dockerfile new file mode 100644 index 000000000..afa9fc5ee --- /dev/null +++ b/ai-ml/jark-stack/terraform/src/app/Dockerfile @@ -0,0 +1,24 @@ +FROM python:3.8-slim + +RUN groupadd --gid 1000 appuser \ + && useradd --uid 1000 --gid 1000 -ms /bin/bash appuser + +RUN pip3 install --no-cache-dir --upgrade \ + pip \ + virtualenv + +RUN apt-get update && apt-get install -y + +USER appuser +WORKDIR /home/appuser + +ENV VIRTUAL_ENV=/home/appuser/venv +RUN virtualenv ${VIRTUAL_ENV} +RUN . 
${VIRTUAL_ENV}/bin/activate && \ + pip install requests streamlit Pillow + +EXPOSE 8501 + +COPY streamlit.py /home/appuser/ +COPY run.sh /home/appuser +ENTRYPOINT ["./run.sh"] diff --git a/ai-ml/jark-stack/terraform/src/app/run.sh b/ai-ml/jark-stack/terraform/src/app/run.sh new file mode 100644 index 000000000..0c52b8acc --- /dev/null +++ b/ai-ml/jark-stack/terraform/src/app/run.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +APP_PID= +stopRunningProcess() { + # Based on https://linuxconfig.org/how-to-propagate-a-signal-to-child-processes-from-a-bash-script + if test ! "${APP_PID}" = '' && ps -p ${APP_PID} > /dev/null ; then + > /proc/1/fd/1 echo "Stopping ${COMMAND_PATH} which is running with process ID ${APP_PID}" + + kill -TERM ${APP_PID} + > /proc/1/fd/1 echo "Waiting for ${COMMAND_PATH} to process SIGTERM signal" + + wait ${APP_PID} + > /proc/1/fd/1 echo "All processes have stopped running" + else + > /proc/1/fd/1 echo "${COMMAND_PATH} was not started when the signal was sent or it has already been stopped" + fi +} + +trap stopRunningProcess EXIT TERM + +source ${VIRTUAL_ENV}/bin/activate + +streamlit run ${HOME}/streamlit_app.py & +APP_ID=${!} + +wait ${APP_ID} diff --git a/ai-ml/jark-stack/terraform/src/app/streamlit.py b/ai-ml/jark-stack/terraform/src/app/streamlit.py new file mode 100644 index 000000000..2448e114e --- /dev/null +++ b/ai-ml/jark-stack/terraform/src/app/streamlit.py @@ -0,0 +1,33 @@ +import streamlit as st +import requests +from urllib.parse import urlencode +from PIL import Image +import tempfile + + +### Update Hostname before building image +base_url="" + +st.title("Welcome to dogbooth! :dog:") +st.header("_a place to create images of [v]dog in beautiful scenes._") + + +prompt = st.chat_input("a photo of a [v]dog ...") +if prompt: + query_params = { + "prompt": prompt + } + encoded_query = urlencode(query_params) + image_url = f"{base_url}?{encoded_query}" + + with st.spinner("Wait for it..."): + response = requests.get(image_url, timeout=180) + + if response.status_code == 200: + content_size = len(response.content) + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + f.write(response.content) + st.image(Image.open(f.name), caption=prompt) + st.balloons() + else: + st.error(f"Failed to download image. 
Status code: {response.status_code}") diff --git a/ai-ml/jark-stack/terraform/src/app/streamlit.yaml b/ai-ml/jark-stack/terraform/src/app/streamlit.yaml new file mode 100644 index 000000000..ae3fc0df5 --- /dev/null +++ b/ai-ml/jark-stack/terraform/src/app/streamlit.yaml @@ -0,0 +1,83 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: dogbooth-app +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: streamlit-deployment + namespace: dogbooth-app + labels: + app: streamlit +spec: + replicas: 1 + selector: + matchLabels: + app: streamlit + template: + metadata: + labels: + app: streamlit + spec: + containers: + - name: streamlit + image: public.ecr.aws/h3o5n2r0/gen-ai-demo/dogbooth-app:0.0.2 + imagePullPolicy: Always + ports: + - containerPort: 8501 + livenessProbe: + httpGet: + path: /_stcore/health + port: 8501 + scheme: HTTP + timeoutSeconds: 1 + readinessProbe: + httpGet: + path: /_stcore/health + port: 8501 + scheme: HTTP + timeoutSeconds: 1 + resources: + limits: + cpu: 1 + memory: 2Gi + requests: + cpu: 100m + memory: 745Mi +--- +apiVersion: v1 +kind: Service +metadata: + name: streamlit-service + namespace: dogbooth-app +spec: + type: ClusterIP + selector: + app: streamlit + ports: + - name: streamlit-port + protocol: TCP + port: 8501 + targetPort: 8501 +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: dogbooth-app + namespace: dogbooth-app + annotations: + nginx.ingress.kubernetes.io/rewrite-target: "/$1" +spec: + ingressClassName: nginx + rules: + - http: + paths: + - path: /dogbooth/app/(.*) + pathType: ImplementationSpecific + backend: + service: + name: streamlit-service + port: + number: 8501 diff --git a/ai-ml/jark-stack/terraform/src/notebook/Dockerfile b/ai-ml/jark-stack/terraform/src/notebook/Dockerfile new file mode 100644 index 000000000..da5c32dd7 --- /dev/null +++ b/ai-ml/jark-stack/terraform/src/notebook/Dockerfile @@ -0,0 +1,10 @@ +FROM cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only + +USER root + +RUN conda install -c anaconda libstdcxx-ng + +RUN cp /opt/conda/lib/libstdc++.so.6.0.31 /usr/lib/x86_64-linux-gnu/ && \ + cd /usr/lib/x86_64-linux-gnu && \ + rm -f libstdc++.so.6 && \ + ln -s libstdc++.so.6.0.31 libstdc++.so.6 diff --git a/ai-ml/jark-stack/terraform/src/notebook/dogbooth.ipynb b/ai-ml/jark-stack/terraform/src/notebook/dogbooth.ipynb new file mode 100644 index 000000000..0e54206b9 --- /dev/null +++ b/ai-ml/jark-stack/terraform/src/notebook/dogbooth.ipynb @@ -0,0 +1,275 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# Verify NVIDIA GPU is visible\n", + "!nvidia-smi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "import os\n", + "os.chdir(\"/home/jovyan\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# Clone the diffusers repo\n", + "!git clone https://github.com/huggingface/diffusers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# Change the directory\n", + "os.chdir(\"diffusers\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + 
"source": [ + "# Install requirements\n", + "! pip install -e .\n", + "! pip install xformers==0.0.16 diffusers[torch]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# Fix for bitsandbytes https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md\n", + "! wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_install.sh\n", + "! bash cuda_install.sh 117 ~/local 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# Install bitsandbytes for optimizations\n", + "! pip install bitsandbytes==0.41.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# Use the newly installed CUDA version for bitsandbytes\n", + "os.environ[\"BNB_CUDA_VERSION\"] = \"117\"\n", + "os.environ[\"LD_LIBRARY_PATH\"] = os.getenv(\"LD_LIBRARY_PATH\") + \":/home/jovyan/local/cuda-11.7\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# Validate successful install of bitsandbytes\n", + "! python -m bitsandbytes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# Install requirements for dreambooth\n", + "os.chdir(\"examples/dreambooth\")\n", + "! pip install -r requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# Setup default configuration for accelerate\n", + "! accelerate config default" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# Login to huggingface associated with your account (please create one if it doesn't exist)\n", + "! huggingface-cli login --token $HUGGING_FACE_HUB_TOKEN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# Download sample dataset of the subject. 
See the sample images here https://huggingface.co/datasets/diffusers/dog-example\n", + "from huggingface_hub import snapshot_download\n", + "\n", + "local_dir = \"./dog\"\n", + "snapshot_download(\n", + " \"diffusers/dog-example\",\n", + " local_dir=local_dir, repo_type=\"dataset\",\n", + " ignore_patterns=\".gitattributes\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# Export environment variables to provide input model, dataset directory and output directory for the tuned model\n", + "os.environ[\"MODEL_NAME\"] = \"stabilityai/stable-diffusion-2-1\"\n", + "os.environ[\"INSTANCE_DIR\"] = \"dog\"\n", + "os.environ[\"OUTPUT_DIR\"] = \"dogbooth\"\n", + "os.environ[\"RESOLUTION\"] = \"768\"\n", + "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"garbage_collection_threshold:0.6,max_split_size_mb:128\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# Launch the training and push the output model to huggingface\n", + "! accelerate launch train_dreambooth.py \\\n", + " --pretrained_model_name_or_path=$MODEL_NAME \\\n", + " --instance_data_dir=$INSTANCE_DIR \\\n", + " --output_dir=$OUTPUT_DIR \\\n", + " --instance_prompt=\"a photo of [v]dog\" \\\n", + " --resolution=768 \\\n", + " --train_batch_size=1 \\\n", + " --gradient_accumulation_steps=1 \\\n", + " --gradient_checkpointing \\\n", + " --learning_rate=1e-6 \\\n", + " --lr_scheduler=\"constant\" \\\n", + " --enable_xformers_memory_efficient_attention \\\n", + " --use_8bit_adam \\\n", + " --lr_warmup_steps=0 \\\n", + " --max_train_steps=800 \\\n", + " --push_to_hub" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# Run a sample inference\n", + "from diffusers import StableDiffusionPipeline\n", + "import torch\n", + "\n", + "model_id = \"./dogbooth\"\n", + "pipe = StableDiffusionPipeline.from_pretrained(model_id).to(\"cuda\")\n", + "\n", + "prompt = \"a photo of [v]dog on the moon\"\n", + "image = pipe(prompt, num_inference_steps=100, guidance_scale=7.5).images[0]\n", + "\n", + "image.save(\"dog-bucket.png\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/ai-ml/jark-stack/terraform/src/service/Dockerfile b/ai-ml/jark-stack/terraform/src/service/Dockerfile new file mode 100644 index 000000000..1f348eaeb --- /dev/null +++ b/ai-ml/jark-stack/terraform/src/service/Dockerfile @@ -0,0 +1,8 @@ +FROM rayproject/ray-ml:2.6.0-gpu + +RUN pip install -U \ + git+https://github.com/huggingface/transformers diffusers + +WORKDIR /serve_app + +COPY dogbooth.py /serve_app/dogbooth.py diff --git a/ai-ml/jark-stack/terraform/src/service/dogbooth.py b/ai-ml/jark-stack/terraform/src/service/dogbooth.py new file mode 100644 index 000000000..f3b42b474 --- /dev/null +++ b/ai-ml/jark-stack/terraform/src/service/dogbooth.py @@ -0,0 +1,58 @@ +from io import BytesIO +from fastapi import FastAPI +from fastapi.responses import Response +import torch +import os +from ray import serve + + +app = FastAPI() + + +@serve.deployment(num_replicas=1, route_prefix="/") +@serve.ingress(app) +class APIIngress: + def __init__(self, diffusion_model_handle) -> None: + self.handle = diffusion_model_handle + + @app.get( + 
"/imagine", + responses={200: {"content": {"image/png": {}}}}, + response_class=Response, + ) + async def generate(self, prompt: str, img_size: int = 768): + assert len(prompt), "prompt parameter cannot be empty" + + image_ref = await self.handle.generate.remote(prompt, img_size=img_size) + image = await image_ref + file_stream = BytesIO() + image.save(file_stream, "PNG") + return Response(content=file_stream.getvalue(), media_type="image/png") + + +@serve.deployment( + ray_actor_options={"num_gpus": 1}, + autoscaling_config={"min_replicas": 1, "max_replicas": 2}, +) +class StableDiffusionV2: + def __init__(self): + from diffusers import EulerDiscreteScheduler, StableDiffusionPipeline + + model_id = os.getenv('MODEL_ID') + + scheduler = EulerDiscreteScheduler.from_pretrained( + model_id, subfolder="scheduler" + ) + self.pipe = StableDiffusionPipeline.from_pretrained( + model_id, scheduler=scheduler + ) + self.pipe = self.pipe.to("cuda") + + def generate(self, prompt: str, img_size: int = 768): + assert len(prompt), "prompt parameter cannot be empty" + + image = self.pipe(prompt, height=img_size, width=img_size).images[0] + return image + + +entrypoint = APIIngress.bind(StableDiffusionV2.bind()) diff --git a/ai-ml/jark-stack/terraform/src/service/ray-service.yaml b/ai-ml/jark-stack/terraform/src/service/ray-service.yaml new file mode 100644 index 000000000..a72a2c904 --- /dev/null +++ b/ai-ml/jark-stack/terraform/src/service/ray-service.yaml @@ -0,0 +1,99 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: dogbooth +--- +apiVersion: ray.io/v1alpha1 +kind: RayService +metadata: + name: dogbooth + namespace: dogbooth +spec: + serviceUnhealthySecondThreshold: 600 + deploymentUnhealthySecondThreshold: 600 + serveConfig: + importPath: dogbooth:entrypoint + runtimeEnv: | + env_vars: {"MODEL_ID": "askulkarni2/dogbooth"} + rayClusterConfig: + rayVersion: '2.6.0' + headGroupSpec: + rayStartParams: + dashboard-host: '0.0.0.0' + template: + spec: + containers: + - name: ray-head + image: public.ecr.aws/h3o5n2r0/dogbooth:0.0.1-gpu + resources: + limits: + cpu: 2 + memory: 16Gi + nvidia.com/gpu: 1 + requests: + cpu: 2 + memory: 16Gi + nvidia.com/gpu: 1 + ports: + - containerPort: 6379 + name: gcs-server + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + workerGroupSpecs: + - replicas: 1 + minReplicas: 1 + maxReplicas: 5 + rayStartParams: {} + groupName: small-group + template: + spec: + containers: + - name: ray-worker + image: public.ecr.aws/h3o5n2r0/dogbooth:0.0.1-gpu + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + resources: + limits: + cpu: "2" + memory: "16Gi" + nvidia.com/gpu: 1 + requests: + cpu: "2" + memory: "16Gi" + nvidia.com/gpu: 1 +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: dogbooth + namespace: dogbooth + annotations: + nginx.ingress.kubernetes.io/rewrite-target: "/$1" +spec: + ingressClassName: nginx + rules: + - http: + paths: + # Ray Dashboard + - path: /dogbooth/(.*) + pathType: ImplementationSpecific + backend: + service: + name: dogbooth-head-svc + port: + number: 8265 + # Ray Serve + - path: /dogbooth/serve/(.*) + pathType: ImplementationSpecific + backend: + service: + name: dogbooth-head-svc + port: + number: 8000 diff --git a/ai-ml/jark-stack/terraform/variables.tf b/ai-ml/jark-stack/terraform/variables.tf new file mode 100644 index 000000000..b95ca832d --- /dev/null +++ b/ai-ml/jark-stack/terraform/variables.tf @@ -0,0 +1,40 @@ +variable "name" { + 
description = "Name of the VPC and EKS Cluster" + default = "jark-stack" + type = string +} + +# NOTE: Trainium and Inferentia are only available in us-west-2 and us-east-1 regions +variable "region" { + description = "region" + default = "us-west-2" + type = string +} + +variable "eks_cluster_version" { + description = "EKS Cluster version" + default = "1.27" + type = string +} + +# VPC with 2046 IPs (10.1.0.0/21) and 2 AZs +variable "vpc_cidr" { + description = "VPC CIDR. This should be a valid private (RFC 1918) CIDR range" + default = "10.1.0.0/21" + type = string +} + +# RFC6598 range 100.64.0.0/10 +# Note you can only /16 range to VPC. You can add multiples of /16 if required +variable "secondary_cidr_blocks" { + description = "Secondary CIDR blocks to be attached to VPC" + default = ["100.64.0.0/16"] + type = list(string) +} + +variable "huggingface_token" { + description = "Hugging Face Secret Token" + type = string + default = "DUMMY_TOKEN_REPLACE_ME" + sensitive = true +} diff --git a/ai-ml/jark-stack/terraform/versions.tf b/ai-ml/jark-stack/terraform/versions.tf new file mode 100644 index 000000000..bb085ae7a --- /dev/null +++ b/ai-ml/jark-stack/terraform/versions.tf @@ -0,0 +1,37 @@ +terraform { + required_version = ">= 1.0.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 3.72" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 2.10" + } + helm = { + source = "hashicorp/helm" + version = ">= 2.4.1" + } + kubectl = { + source = "gavinbunney/kubectl" + version = ">= 1.14" + } + random = { + source = "hashicorp/random" + version = ">= 3.1" + } + http = { + source = "hashicorp/http" + version = ">= 3.3" + } + } + + # ## Used for end-to-end testing on project; update to suit your needs + # backend "s3" { + # bucket = "doeks-github-actions-e2e-test-state" + # region = "us-west-2" + # key = "e2e/trainium-inferentia/terraform.tfstate" + # } +} diff --git a/ai-ml/jark-stack/terraform/vpc.tf b/ai-ml/jark-stack/terraform/vpc.tf new file mode 100644 index 000000000..59c3da89c --- /dev/null +++ b/ai-ml/jark-stack/terraform/vpc.tf @@ -0,0 +1,53 @@ +locals { + # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ + private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] + # Routable Public subnets with NAT Gateway and Internet Gateway + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ + public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] + # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods + # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ + secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] +} + +#--------------------------------------------------------------- +# VPC +#--------------------------------------------------------------- +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. 
+# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + version = "~> 5.0" + + name = local.name + cidr = var.vpc_cidr + azs = local.azs + + # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods + secondary_cidr_blocks = var.secondary_cidr_blocks + + # 1/ EKS Data Plane secondary CIDR blocks for two subnets across two AZs for EKS Control Plane ENI + Nodes + Pods + # 2/ Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB + Airflow + EC2 Jumphost etc. + private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets) + + # ------------------------------ + # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments + # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW + public_subnets = local.public_subnets + enable_nat_gateway = true + single_nat_gateway = true + #------------------------------- + + public_subnet_tags = { + "kubernetes.io/role/elb" = 1 + } + + private_subnet_tags = { + "kubernetes.io/role/internal-elb" = 1 + # Tags subnets for Karpenter auto-discovery + "karpenter.sh/discovery" = local.name + } + + tags = local.tags +} diff --git a/ai-ml/jupyterhub/addons.tf b/ai-ml/jupyterhub/addons.tf index 1ca3f0f22..277cf7889 100755 --- a/ai-ml/jupyterhub/addons.tf +++ b/ai-ml/jupyterhub/addons.tf @@ -1,7 +1,3 @@ -data "aws_eks_cluster_auth" "this" { - name = module.eks.cluster_name -} - # Use this data source to get the ARN of a certificate in AWS Certificate Manager (ACM) data "aws_acm_certificate" "issued" { count = var.jupyter_hub_auth_mechanism == "cognito" ? 
1 : 0 @@ -16,6 +12,7 @@ data "aws_ecrpublic_authorization_token" "token" { locals { cognito_custom_domain = var.cognito_custom_domain } + #--------------------------------------------------------------- # IRSA for EBS CSI Driver #--------------------------------------------------------------- @@ -69,7 +66,7 @@ module "eks_blueprints_addons" { enable_cluster_proportional_autoscaler = true cluster_proportional_autoscaler = { timeout = "300" - values = [templatefile("${path.module}/helm-values/coredns-autoscaler-values.yaml", { + values = [templatefile("${path.module}/helm/coredns-autoscaler/values.yaml", { target = "deployment/coredns" })] description = "Cluster Proportional Autoscaler for CoreDNS Service" @@ -81,7 +78,7 @@ module "eks_blueprints_addons" { enable_metrics_server = true metrics_server = { timeout = "300" - values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})] + values = [templatefile("${path.module}/helm/metrics-server/values.yaml", {})] } #--------------------------------------- @@ -91,7 +88,7 @@ module "eks_blueprints_addons" { cluster_autoscaler = { timeout = "300" create_role = true - values = [templatefile("${path.module}/helm-values/cluster-autoscaler-values.yaml", { + values = [templatefile("${path.module}/helm/cluster-autoscaler/values.yaml", { aws_region = var.region, eks_cluster_id = module.eks.cluster_name })] @@ -109,15 +106,158 @@ module "eks_blueprints_addons" { } #--------------------------------------- - # CloudWatch metrics for EKS + # AWS Load Balancer Controller #--------------------------------------- - enable_aws_cloudwatch_metrics = true - aws_cloudwatch_metrics = { - timeout = "300" - values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})] + enable_aws_load_balancer_controller = true + + #--------------------------------------- + # Prometheus and Grafana stack + #--------------------------------------- + #--------------------------------------------------------------- + # Install Monitoring Stack with Prometheus and Grafana + # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack` + # 2- Grafana Admin user: admin + # 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id --region $AWS_REGION --query "SecretString" --output text` + #--------------------------------------------------------------- + enable_kube_prometheus_stack = true + kube_prometheus_stack = { + values = [templatefile("${path.module}/helm/kube-prometheus-stack/values.yaml", {})] + chart_version = "48.1.1" + set_sensitive = [ + { + name = "grafana.adminPassword" + value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string + } + ], + } + #--------------------------------------- + # AWS for FluentBit + #--------------------------------------- + enable_aws_for_fluentbit = true + aws_for_fluentbit_cw_log_group = { + use_name_prefix = false + name = "/${local.name}/aws-fluentbit-logs" # Add-on creates this log group + retention_in_days = 30 + } + aws_for_fluentbit = { + values = [templatefile("${path.module}/helm/aws-for-fluentbit/values.yaml", { + region = local.region, + cloudwatch_log_group = "/${local.name}/aws-fluentbit-logs" + cluster_name = module.eks.cluster_name + })] } - enable_aws_load_balancer_controller = true + #--------------------------------------- + # Additional Helm Charts + #--------------------------------------- + helm_releases = { + storageclass = { + name = "storageclass" + description = "A Helm 
chart for storage configurations" + chart = "${path.module}/helm/storageclass" + } + karpenter-resources-cpu = { + name = "karpenter-resources-cpu" + description = "A Helm chart for karpenter CPU based resources" + chart = "${path.module}/helm/karpenter-resources" + values = [ + <<-EOT + clusterName: ${module.eks.cluster_name} + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + EOT + ] + } + karpenter-resources-ts = { + name = "karpenter-resources-ts" + description = "A Helm chart for karpenter GPU based resources - compatible with GPU time slicing" + chart = "${path.module}/helm/karpenter-resources" + values = [ + <<-EOT + name: gpu-ts + clusterName: ${module.eks.cluster_name} + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + instanceSizes: ["xlarge", "2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"] + instanceFamilies: ["g5"] + taints: + - key: hub.jupyter.org/dedicated + value: "user" + effect: "NoSchedule" + - key: nvidia.com/gpu + effect: "NoSchedule" + amiFamily: Ubuntu + EOT + ] + } + karpenter-resources-mig = { + name = "karpenter-resources-gpu" + description = "A Helm chart for karpenter GPU based resources - compatible with P4d instances" + chart = "${path.module}/helm/karpenter-resources" + values = [ + <<-EOT + name: gpu + clusterName: ${module.eks.cluster_name} + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + instanceSizes: ["24xlarge"] + instanceFamilies: ["p4d"] + taints: + - key: hub.jupyter.org/dedicated + value: "user" + effect: "NoSchedule" + - key: nvidia.com/gpu + effect: "NoSchedule" + amiFamily: Ubuntu + EOT + ] + } + karpenter-resources-inf = { + name = "karpenter-resources-inf" + description = "A Helm chart for karpenter Inferentia based resources" + chart = "${path.module}/helm/karpenter-resources" + values = [ + <<-EOT + name: inferentia + clusterName: ${module.eks.cluster_name} + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + instanceSizes: ["8xlarge", "24xlarge"] + instanceFamilies: ["inf2"] + taints: + - key: aws.amazon.com/neuroncore + value: "true" + effect: "NoSchedule" + - key: aws.amazon.com/neuron + value: "true" + effect: "NoSchedule" + - key: hub.jupyter.org/dedicated + value: "user" + effect: "NoSchedule" + EOT + ] + } + karpenter-resources-trn = { + name = "karpenter-resources-trn" + description = "A Helm chart for karpenter Trainium based resources" + chart = "${path.module}/helm/karpenter-resources" + values = [ + <<-EOT + name: trainium + clusterName: ${module.eks.cluster_name} + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + instanceSizes: ["32xlarge"] + instanceFamilies: ["trn1"] + taints: + - key: aws.amazon.com/neuroncore + value: "true" + effect: "NoSchedule" + - key: aws.amazon.com/neuron + value: "true" + effect: "NoSchedule" + - key: hub.jupyter.org/dedicated + value: "user" + effect: "NoSchedule" + EOT + ] + } + } tags = local.tags } @@ -131,19 +271,25 @@ module "eks_data_addons" { oidc_provider_arn = module.eks.oidc_provider_arn + #--------------------------------------------------------------- + # Enable Neuron Device Plugin + #--------------------------------------------------------------- + enable_aws_neuron_device_plugin = true + #--------------------------------------------------------------- # Enable GPU operator #--------------------------------------------------------------- - enable_nvidia_gpu_operator = 
var.jupyter_notebook_support == "gpu" ? true : false + enable_nvidia_gpu_operator = true nvidia_gpu_operator_helm_config = { - values = [templatefile("${path.module}/helm-values/nvidia-values.yaml", {})] + values = [templatefile("${path.module}/helm/nvidia-gpu-operator/values.yaml", {})] } + #--------------------------------------------------------------- # JupyterHub Add-on #--------------------------------------------------------------- enable_jupyterhub = true jupyterhub_helm_config = { - values = [templatefile("${path.module}/helm-values/jupyterhub-values-${var.jupyter_hub_auth_mechanism}-${var.jupyter_notebook_support}.yaml", { + values = [templatefile("${path.module}/helm/jupyterhub/jupyterhub-values-${var.jupyter_hub_auth_mechanism}.yaml", { ssl_cert_arn = try(data.aws_acm_certificate.issued[0].arn, "") jupyterdomain = try("https://${var.jupyterhub_domain}/hub/oauth_callback", "") authorize_url = try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/authorize", "") @@ -151,7 +297,45 @@ module "eks_data_addons" { userdata_url = try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/userInfo", "") client_id = try(aws_cognito_user_pool_client.user_pool_client[0].id, "") client_secret = try(aws_cognito_user_pool_client.user_pool_client[0].client_secret, "") + user_pool_id = try(aws_cognito_user_pool.pool[0].id, "") + identity_pool_id = try(aws_cognito_identity_pool.identity_pool[0].id, "") jupyter_single_user_sa_name = kubernetes_service_account_v1.jupyterhub_single_user_sa.metadata[0].name + region = var.region })] } + + #--------------------------------------------------------------- + # Kubecost Add-on + #--------------------------------------------------------------- + enable_kubecost = true + kubecost_helm_config = { + values = [templatefile("${path.module}/helm/kubecost/values.yaml", {})] + repository_username = data.aws_ecrpublic_authorization_token.token.user_name + repository_password = data.aws_ecrpublic_authorization_token.token.password + } +} + +#--------------------------------------------------------------- +# Grafana Admin credentials resources +#--------------------------------------------------------------- +data "aws_secretsmanager_secret_version" "admin_password_version" { + secret_id = aws_secretsmanager_secret.grafana.id + depends_on = [aws_secretsmanager_secret_version.grafana] +} + +resource "random_password" "grafana" { + length = 16 + special = true + override_special = "@_" +} + +#tfsec:ignore:aws-ssm-secret-use-customer-key +resource "aws_secretsmanager_secret" "grafana" { + name_prefix = "${local.name}-grafana-" + recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy +} + +resource "aws_secretsmanager_secret_version" "grafana" { + secret_id = aws_secretsmanager_secret.grafana.id + secret_string = random_password.grafana.result } diff --git a/ai-ml/jupyterhub/cleanup.sh b/ai-ml/jupyterhub/cleanup.sh index 4412881ce..8438ddf84 100755 --- a/ai-ml/jupyterhub/cleanup.sh +++ b/ai-ml/jupyterhub/cleanup.sh @@ -2,13 +2,11 @@ set -o errexit set -o pipefail -read -p "Enter domain name with wildcard and ensure ACM certificate is created for this domain name, e.g. *.example.com :" acm_certificate_domain -read -p "Enter sub-domain name for jupyterhub to be hosted, e.g. 
eks.example.com : " jupyterhub_domain - targets=( "module.eks_data_addons" "module.eks_blueprints_addons" "module.eks" + "module.vpc" ) #------------------------------------------- @@ -32,7 +30,7 @@ done #------------------------------------------- for target in "${targets[@]}" do - destroy_output=$(terraform destroy -target="$target" -var="acm_certificate_domain=$acm_certificate_domain" -var="jupyterhub_domain=$jupyterhub_domain" -auto-approve | tee /dev/tty) + destroy_output=$(terraform destroy -target="$target" -auto-approve | tee /dev/tty) if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then echo "SUCCESS: Terraform destroy of $target completed successfully" else @@ -44,7 +42,7 @@ done #------------------------------------------- # Terraform destroy full #------------------------------------------- -destroy_output=$(terraform destroy -target="$target" -var="acm_certificate_domain=$acm_certificate_domain" -var="jupyterhub_domain=$jupyterhub_domain" -auto-approve | tee /dev/tty) +destroy_output=$(terraform destroy -target="$target" -auto-approve | tee /dev/tty) if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then echo "SUCCESS: Terraform destroy of all targets completed successfully" else diff --git a/ai-ml/jupyterhub/cognito.tf b/ai-ml/jupyterhub/cognito.tf new file mode 100644 index 000000000..57338986b --- /dev/null +++ b/ai-ml/jupyterhub/cognito.tf @@ -0,0 +1,224 @@ +#--------------------------------------------------------------- +# Lambda function for pre token generation +#---------------------------------------------------------------- + +data "aws_iam_policy_document" "assume_role" { + statement { + effect = "Allow" + principals { + type = "Service" + identifiers = ["lambda.amazonaws.com", "cognito-idp.amazonaws.com"] + } + actions = ["sts:AssumeRole"] + } +} + +data "aws_iam_policy" "lambda_execution_policy" { + arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" +} + +resource "aws_iam_role" "iam_for_lambda" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + name = "iam_for_lambda" + assume_role_policy = data.aws_iam_policy_document.assume_role.json +} + +resource "aws_iam_role_policy_attachment" "lambda_policy_attachment" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + role = aws_iam_role.iam_for_lambda[0].name + policy_arn = data.aws_iam_policy.lambda_execution_policy.arn +} + +data "archive_file" "lambda" { + type = "zip" + output_path = "/tmp/lambda.zip" + source { + filename = "index.mjs" + content = <<-EOF + export const handler = async (event) => { + event.response = { + claimsOverrideDetails: { + claimsToAddOrOverride: { + department: "engineering", + }, + }, + }; + + return event; + }; + + EOF + } +} + +resource "aws_lambda_function" "pretoken_trigger" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + function_name = "pretoken-trigger-function" + filename = data.archive_file.lambda.output_path + source_code_hash = data.archive_file.lambda.output_base64sha256 + + runtime = "nodejs18.x" + handler = "index.handler" + + role = aws_iam_role.iam_for_lambda[0].arn +} + +#--------------------------------------------------------------- +# Cognito pool, domain and client creation. +# This can be used +# Auth integration later. +#---------------------------------------------------------------- +resource "aws_cognito_user_pool" "pool" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 
1 : 0 + name = "jupyterhub-userpool" + + username_attributes = ["email"] + auto_verified_attributes = ["email"] + + password_policy { + minimum_length = 6 + } + + lambda_config { + pre_token_generation = aws_lambda_function.pretoken_trigger[0].arn + } +} + +resource "aws_cognito_user_pool_domain" "domain" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + domain = local.cognito_custom_domain + user_pool_id = aws_cognito_user_pool.pool[0].id +} + +resource "aws_cognito_user_pool_client" "user_pool_client" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + name = "jupyter-client" + access_token_validity = 1 + token_validity_units { + access_token = "days" + } + callback_urls = ["https://${var.jupyterhub_domain}/hub/oauth_callback"] + user_pool_id = aws_cognito_user_pool.pool[0].id + allowed_oauth_flows_user_pool_client = true + allowed_oauth_flows = ["code"] + allowed_oauth_scopes = ["openid", "email"] + generate_secret = true + supported_identity_providers = [ + "COGNITO" + ] + + depends_on = [aws_cognito_user_pool_domain.domain] +} + +#--------------------------------------------------------------- +# Cognito identity pool creation. +#---------------------------------------------------------------- +resource "aws_cognito_identity_pool" "identity_pool" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + identity_pool_name = "jupyterhub-identity-pool" + allow_unauthenticated_identities = false + cognito_identity_providers { + client_id = aws_cognito_user_pool_client.user_pool_client[0].id + provider_name = aws_cognito_user_pool.pool[0].endpoint + server_side_token_check = true + } + + depends_on = [aws_cognito_user_pool_client.user_pool_client] +} + +resource "aws_s3_bucket" "jupyterhub_bucket" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + bucket_prefix = "jupyterhub-test-bucket-" +} + +resource "aws_s3_object" "engineering_object" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + bucket = aws_s3_bucket.jupyterhub_bucket[0].id + key = "engineering/" +} + +resource "aws_s3_object" "legal_object" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + bucket = aws_s3_bucket.jupyterhub_bucket[0].id + key = "legal/" +} + +#--------------------------------------------------------------- +# IAM role for a team member from the engineering department +# In theory there would be other departments such as "legal" +#---------------------------------------------------------------- +resource "aws_iam_role" "cognito_authenticated_engineering_role" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + + name = "EngineeringTeamRole" + + assume_role_policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Action = ["sts:AssumeRoleWithWebIdentity", "sts:TagSession"], + Effect = "Allow", + Principal = { + Federated = "cognito-identity.amazonaws.com" + }, + Condition = { + StringEquals = { + "cognito-identity.amazonaws.com:aud" = aws_cognito_identity_pool.identity_pool[0].id + }, + "ForAnyValue:StringLike" : { + "cognito-identity.amazonaws.com:amr" : "authenticated" + } + } + } + ] + }) +} + +resource "aws_iam_role_policy" "s3_cognito_engineering_policy" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 
1 : 0 + name = "s3_cognito_engineering_policy" + role = aws_iam_role.cognito_authenticated_engineering_role[0].id + + policy = <<-EOF +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": ["s3:List*"], + "Resource": "*", + "Condition": { + "StringEquals": { + "s3:prefix": "$${aws:PrincipalTag/department}" + } + } + } + ] +} +EOF +} + +resource "aws_cognito_identity_pool_provider_principal_tag" "example" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + identity_pool_id = aws_cognito_identity_pool.identity_pool[0].id + identity_provider_name = aws_cognito_user_pool.pool[0].endpoint + use_defaults = false + principal_tags = { + department = "department" + } +} + +resource "aws_iam_policy_attachment" "s3_readonly_policy_attachment" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + name = "S3ReadOnlyAccessAttachment" + policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" + roles = [aws_iam_role.cognito_authenticated_engineering_role[0].name] +} + +resource "aws_cognito_identity_pool_roles_attachment" "identity_pool_roles" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + identity_pool_id = aws_cognito_identity_pool.identity_pool[0].id + roles = { + authenticated = aws_iam_role.cognito_authenticated_engineering_role[0].arn + } +} diff --git a/ai-ml/jupyterhub/examples/create_image.sh b/ai-ml/jupyterhub/examples/create_image.sh new file mode 100755 index 000000000..ae33fcb7e --- /dev/null +++ b/ai-ml/jupyterhub/examples/create_image.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Set the AWS region and the name of the ECR repository + +REGION=us-west-2 +ECR_REPO_NAME=jupyterhub-pytorch-neuron-pytorch +DOCKER_FILE=docker/jupyterhub-pytorch-neuron-pytorch.Dockerfile + +# Check if the ECR repository exists +if aws ecr describe-repositories --repository-names "$ECR_REPO_NAME" --region "$REGION" >/dev/null 2>&1; then + echo "ECR repository '$ECR_REPO_NAME' already exists." + + # Get the ECR_REPO_URI for the existing repository + ECR_REPO_URI=$(aws ecr describe-repositories --repository-name "$ECR_REPO_NAME" --query 'repositories[0].repositoryUri' --region "$REGION" --output text) + echo "Repository URL: $ECR_REPO_URI" +else + # Create a new ECR repository with the specified name and region + aws ecr create-repository --repository-name "$ECR_REPO_NAME" --region "$REGION" + + # Retrieve the URL of the newly created ECR repository + ECR_REPO_URI=$(aws ecr describe-repositories --repository-name "$ECR_REPO_NAME" --query 'repositories[0].repositoryUri' --region "$REGION" --output text) + echo "Repository URL: $ECR_REPO_URI" +fi + +# Log in to Amazon ECR using docker +echo -e "Logging in to Amazon ECR..." +aws ecr get-login-password --region "$REGION" | docker login --username AWS --password-stdin "$ECR_REPO_URI" + +# Build the docker image using the provided jupyterhub-pytorch-neuron.Dockerfile and tag it with the ECR repository URI +echo -e "Building, tagging and pushing docker image... $ECR_REPO_URI:latest" +# docker build -f docker/jupyterhub-pytorch-neuron.Dockerfile-jupterhub-inferentia-pytorch -t "$ECR_REPO_URI:latest" . +docker buildx build --push --tag "$ECR_REPO_URI:latest" -o type=image --platform=linux/amd64 -f $DOCKER_FILE . + +# Wait for 5 seconds +sleep 5 +echo -e "Sleeping for 5 seconds..." 
diff --git a/ai-ml/jupyterhub/examples/docker/jupyterhub-pytorch-neuron.Dockerfile b/ai-ml/jupyterhub/examples/docker/jupyterhub-pytorch-neuron.Dockerfile new file mode 100644 index 000000000..687e7a52f --- /dev/null +++ b/ai-ml/jupyterhub/examples/docker/jupyterhub-pytorch-neuron.Dockerfile @@ -0,0 +1,34 @@ +# Use the Jupyter base notebook with Python 3.10 as the base image +FROM jupyter/base-notebook:python-3.10 + +# Maintainer label +LABEL maintainer="DoEKS" + +# Set environment variables to non-interactive (this prevents some prompts) +ENV DEBIAN_FRONTEND=non-interactive + +# Switch to root to add Neuron repo and install necessary packages +USER root + +# Install gnupg and other required packages +RUN apt-get update -y && \ + apt-get install -y gnupg git g++ + +RUN \ + . /etc/os-release && \ + echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/neuron.list && \ + wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - && \ + apt-get update -y && \ + apt-get install aws-neuronx-collectives=2.* aws-neuronx-runtime-lib=2.* aws-neuronx-tools=2.* -y + +# Switch back to jovyan user for Python package installations +USER jovyan + +# Set pip repository pointing to the Neuron repository and install required Python packages +RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com && \ + pip install transformers-neuronx sentencepiece transformers wget awscli ipywidgets neuronx-cc==2.* torch-neuronx torchvision ipykernel environment_kernels && \ + # Install new Jupyter Notebook kernel + python -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)" + +# Add Neuron path to PATH +ENV PATH /opt/aws/neuron/bin:$PATH diff --git a/ai-ml/jupyterhub/examples/docker/jupyterhub-tensorflow-neuron.Dockerfile b/ai-ml/jupyterhub/examples/docker/jupyterhub-tensorflow-neuron.Dockerfile new file mode 100644 index 000000000..6f167444d --- /dev/null +++ b/ai-ml/jupyterhub/examples/docker/jupyterhub-tensorflow-neuron.Dockerfile @@ -0,0 +1,34 @@ +# Use the Jupyter base notebook with Python 3.10 as the base image +FROM jupyter/base-notebook:python-3.10 + +# Maintainer label +LABEL maintainer="DoEKS" + +# Set environment variables to non-interactive (this prevents some prompts) +ENV DEBIAN_FRONTEND=non-interactive + +# Switch to root to add Neuron repo and install necessary packages +USER root + +# Install gnupg and other required packages +RUN apt-get update -y && \ + apt-get install -y gnupg git g++ + +RUN \ + . 
/etc/os-release && \ + echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/neuron.list && \ + wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - && \ + apt-get update -y && \ + apt-get install aws-neuronx-collectives=2.* aws-neuronx-runtime-lib=2.* aws-neuronx-tools=2.* -y + +# Switch back to jovyan user for Python package installations +USER jovyan + +# Set pip repository pointing to the Neuron repository and install required Python packages +RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com && \ + pip install transformers-neuronx sentencepiece transformers wget awscli ipywidgets neuronx-cc==2.* tensorflow-neuronx ipykernel environment_kernels && \ + # Install new Jupyter Notebook kernel + python -m ipykernel install --user --name aws_neuron_venv_tensorflow --display-name "Python (tensorflow-neuronx)" + +# Add Neuron path to PATH +ENV PATH /opt/aws/neuron/bin:$PATH diff --git a/ai-ml/jupyterhub/examples/notebook-examples/gpu-timeslice-test-tensorflow.ipynb b/ai-ml/jupyterhub/examples/notebook-examples/gpu-timeslice-test-tensorflow.ipynb new file mode 100644 index 000000000..75b7e55e3 --- /dev/null +++ b/ai-ml/jupyterhub/examples/notebook-examples/gpu-timeslice-test-tensorflow.ipynb @@ -0,0 +1,333 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "851f73c7-068a-4e44-861b-511a3d2caf16", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting tensorrt\n", + " Downloading tensorrt-8.6.1.post1.tar.gz (18 kB)\n", + " Preparing metadata (setup.py) ... \u001b[?25ldone\n", + "\u001b[?25hBuilding wheels for collected packages: tensorrt\n", + " Building wheel for tensorrt (setup.py) ... 
\u001b[?25ldone\n", + "\u001b[?25h Created wheel for tensorrt: filename=tensorrt-8.6.1.post1-py2.py3-none-any.whl size=17281 sha256=055cb554c81337084ecbeece43281f85702addd8902e31b0004f43a2fb65f518\n", + " Stored in directory: /home/jovyan/.cache/pip/wheels/f4/c8/0e/b79b08e45752491b9acfdbd69e8a609e8b2ed7640dda5a3e59\n", + "Successfully built tensorrt\n", + "Installing collected packages: tensorrt\n", + "Successfully installed tensorrt-8.6.1.post1\n", + "Requirement already satisfied: matplotlib in /opt/conda/lib/python3.10/site-packages (3.7.1)\n", + "Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (3.0.9)\n", + "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (23.0)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (1.0.7)\n", + "Requirement already satisfied: pillow>=6.2.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (9.4.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (1.4.4)\n", + "Requirement already satisfied: numpy>=1.20 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (1.23.5)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib) (4.39.3)\n", + "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n" + ] + } + ], + "source": [ + "! python3 -m pip install --upgrade tensorrt\n", + "! 
pip3 install matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5c0d4d12-6c12-4cdc-bb79-6ad2a808e7a1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIhCAYAAAB5deq6AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC8DklEQVR4nOzdd3hU1dbH8e9k0iskQAo19N6LIFJUkCJSVBBEFCzYrhUUyxWxYb+814ZXsQAqohQBqSpNpUrviHQSEkoSIH3mvH8cEghJIAkzmUny+zxPnsycnDl7nZmTZNbsvde2GIZhICIiIiIiIvnycHUAIiIiIiIi7k6Jk4iIiIiIyBUocRIREREREbkCJU4iIiIiIiJXoMRJRERERETkCpQ4iYiIiIiIXIESJxERERERkStQ4iQiIiIiInIFSpxERERERESuQImTSBny1VdfYbFYsFgsLFu2LNfPDcOgdu3aWCwWunTpUqQ2Pv74Y7766qtCPWbZsmX5xlQUxXGeL7/8MhaLJce2/M79wIEDWCyWQj8vWbp06ZIrTovFwssvv1yk4xXEG2+8wezZs3Ntd/RrVRj33HNP9utqsVjw8fGhXr16jB07ltTUVIe2deDAAXr37k1oaCgWi4UnnnjCoceXC5KSknj99ddp3bo1wcHB+Pj4UKNGDUaMGMGGDRuy97v499piseDp6UmVKlUYPnw4R48ezbXf+vXr82zv5ptvpkaNGs4+rasyb948+vbtS1RUFN7e3gQFBdGiRQvGjh3LoUOHcuzbpUuXHM+Ln58fzZo1Y8KECdjt9hz7NW7cOM/2Tpw44fS/KSKlgaerAxCR4hcUFMSkSZNyvRlfvnw5+/btIygoqMjH/vjjj6lQoQL33HNPgR/TsmVLVq1aRcOGDYvcbl6ceZ55ye/cIyMjWbVqFbVq1XJYW6tWraJKlSoOO96l3njjDW677Tb69euXY7uzXquC8vPz47fffgPg9OnTfPfdd7zyyivs2rWL77//3mHtPPnkk6xZs4YvvviCiIgIIiMjHXZsuWDfvn10796duLg4HnzwQcaNG0dgYCAHDhxg+vTptGrVioSEBEJCQrIf8+WXX1K/fn1SUlJYsWIF48ePZ/ny5WzdupWAgAAXns3Vs9vtDB8+nMmTJ9OzZ0/Gjx9PjRo1SElJYd26dXz55Zd88cUXHD58OMfjatasyTfffANAXFwcEydO5MknnyQmJoa33nrLFaciUiopcRIpgwYNGsQ333zDRx99RHBwcPb2SZMm0b59e5KSkooljoyMDCwWC8HBwVxzzTUOP767nKePj4/Dz88Zz1dBOOu1KigPD48c7ffs2TP7Tfb7779P5cqVi3xswzBITU3Fz8+Pbdu20bZt21yJY1HZbDYyMzPx8fFxyPFKA5vNRv/+/Tlx4gSrVq3K0RvSuXNn7r77bhYsWICXl1eOxzVu3JjWrVsD0LVrV2w2G6+++iqzZ8/mzjvvLNZzcLS33nqLyZMnM378eMaMGZPjZz169OC5557j008/zfU4Pz+/XL8X9evX58MPP+S1117L9RyKSNFoqJ5IGTR48GAAvvvuu+xtiYmJzJgxgxEjRuT5mHHjxtGuXTtCQ0MJDg6mZcuWTJo0CcMwsvepUaMG27dvZ/ny5dnDRrKGxGQN8ZoyZQpPP/00lStXxsfHh7///jvX8K8TJ05QtWpVOnToQEZGRvbxd+zYQUBAAHfddZdTzjO/YWgFGWp3uXPP6/FZQ/02btzIgAEDCA4OJiQkhKFDhxIfH3/Fc8trWM3Ro0d54IEHqFq1Kt7e3kRFRXHbbbdx/PhxAFJTU3n66adp3rw5ISEhhIaG0r59e3766adcxz537hxff/119rlk9drl9xzNmTOH9u3b4+/vT1BQEN26dWPVqlU59sk65+3btzN48GBCQkIIDw9nxIgRJCYmXvGc85P1hvHgwYOAOfRr1KhRREdH4+3tTeXKlXniiSc4d+5crvN89NFHmThxIg0aNMDHxyf7nP/++28WLFiQff4HDhwA4NChQwwdOpRKlSrh4+NDgwYNeO+993IMicp6vd9++21ee+01oqOj8fHxYenSpdnPwZYtW7j99tuzX4ennnqKzMxMdu/eTY8ePQgKCqJGjRq8/fbbOWIu6Gt48flNmTKFBg0a4O/vT7NmzZg3b16ufXft2sXgwYMJDw/Hx8eHatWqMWzYMNLS0rL3iY2NZeTIkVSpUgVvb2+io6MZN24cmZmZRXrdZs+ezdatW3nuuefyHULWs2dP/P39L3ucS19/R+jXrx/Vq1fP8bpmadeuHS1btsy+/8MPP9CuXTtCQkLw9/enZs2a+f4dvZz09HTefvttGjdunCtpyuLp6ckjjzxyxWN5eXnRqlUrkpOTC/T3REQKRomTSBkUHBzMbbfdxhdffJG97bvvvsPDw4NBgwbl+ZgDBw4wcuRIpk+fzsyZMxkwYAD/+te/ePXVV7P3mTVrFjVr1qRFixasWrWKVatWMWvWrBzHee655zh06BATJ05k7ty5VKpUKVdbFSpUYNq0aaxbt45nn30WgOTkZG6//XaqVavGxIkTnXaeRVWQc89L//79qV27Nj/++CMvv/wys2fP5qabbsqRMBbE0aNHadOmDbNmzeKpp55iwYIFTJgwgZCQEE6fPg1AWloap06dYtSoUcyePZvvvvuOjh07MmDAACZPnpx9rFWrVuHn50evXr2yz+Xjjz/Ot+1vv/2Wvn37EhwczHfffcekSZM4ffo0Xbp04ffff8+1/6233krdunWZMWMGY8aM4dtvv+XJJ58s1Ple7O+//wagYsWKJCcn07lzZ77++msee+wxFixYwLPPPstXX33FLbfckiPRB/PN+yeffMJLL73EokWLaN++PatWrSIiIoJrr702+/wjIyOJj4+nQ4cOLF68mFdffZU5c+Zw4403MmrUKB599NFccf33v//lt99+491332XBggXUr18/+2cDBw6kWbNmzJgxg/vvv5///Oc/PPnkk/Tr14/evXsza9Ysrr/+ep599llmzpyZ/biCvoZZfv75Zz788ENeeeUVZsyYQWhoKP379+eff/7J3mfz5s20adOG1atX88orr7BgwQLGjx9PWloa6enpgJk0tW3blkWLFvHSSy+xYMEC7r33XsaPH8/999+fo82suWhZyWZ+Fi9eDHDVvXoXv/6OMmLECA4dOpQ9LDTLrl27WLt2LcOHDwfM35VBgwZRs2ZNpk2bxs8//8xLL71UpGRy/fr1JCQk0KdPH4ecw759+/D09KR8+fIOOZ6IAIaIlBlffvmlARjr1q0zli5dagDGtm3bDMMwjDZt
2hj33HOPYRiG0ahRI6Nz5875HsdmsxkZGRnGK6+8YoSFhRl2uz37Z/k9Nqu9Tp065fuzpUuX5tj+1ltvGYAxa9Ys4+677zb8/PyMLVu2OO0884tj//79BmB8+eWX2dvGjh1rXPonNL9zv9zjn3zyyRz7fvPNNwZgTJ06NXtb586dcx0XMMaOHZt9f8SIEYaXl5exY8eOfJ6V3DIzM42MjAzj3nvvNVq0aJHjZwEBAcbdd9+d6zGXPkc2m82IiooymjRpYthstuz9zpw5Y1SqVMno0KFDrnN+++23cxzz4YcfNnx9fXNcR3m5++67jYCAACMjI8PIyMgw4uPjjf/7v/8zLBaL0aZNG8MwDGP8+PGGh4eHsW7duhyP/fHHHw3AmD9/fvY2wAgJCTFOnTqVq63q1asbvXv3zrFtzJgxBmCsWbMmx/aHHnrIsFgsxu7duw3DuPB616pVy0hPT8+xb9Zz8N577+XY3rx5cwMwZs6cmb0tIyPDqFixojFgwIB8n5PLvYaAER4ebiQlJWVvi42NNTw8PIzx48dnb7v++uuNcuXKGXFxcfm2M3LkSCMwMNA4ePBgju3vvvuuARjbt2/P3jZixAjDarUaBw4cyPd4hmEYPXr0MAAjNTX1svtlyfq9Xr16tZGRkWGcOXPGmDdvnlGxYkUjKCjIiI2NzbHfpddAlt69exvVq1e/bFsZGRlGeHi4MWTIkBzbn3nmGcPb29s4ceKEYRgXzj8hIaFA53A506ZNMwBj4sSJecZz8dfFOnfubDRq1Cj7Z8eOHcu+Vm+//fZc++UlPj4+198UEcmtTPc4rVixgj59+hAVFYXFYsmzgpQjZQ3RuPgrIiLCqW2K5Kdz587UqlWLL774gq1bt7Ju3brLDi/57bffuPHGGwkJCcFqteLl5cVLL73EyZMniYuLK3C7t956a4H3HT16NL1792bw4MF8/fXXfPDBBzRp0qTAj4fCn2dxu3ROxsCBA/H09GTp0qWFOs6CBQvo2rUrDRo0uOx+P/zwA9deey2BgYF4enri5eXFpEmT2LlzZ6FjB9i9ezfHjh3jrrvuwsPjwr+UwMBAbr31VlavXk1ycnKOx9xyyy057jdt2pTU1NQCXUfnzp3Dy8sLLy8vKlasyBNPPEHPnj2ze/fmzZtH48aNad68OZmZmdlfN910U55DDK+//voCfyL/22+/0bBhQ9q2bZtj+z333INhGLl6J2655ZZ855bcfPPNOe43aNAAi8VCz549s7d5enpSu3btXEPQCvMadu3aNUcRlPDwcCpVqpR9zOTkZJYvX87AgQMv22Mzb948unbtSlRUVI7nNSve5cuXZ+87adIkMjMzqV69er7HuxrXXHMNXl5eBAUFcfPNNxMREcGCBQsIDw93WBuenp4MHTqUmTNnZg8jtdlsTJkyhb59+xIWFgZAmzZtAPP3dvr06Tmq+zlKQkJC9jWf9XVpxcDt27dn/ywqKor33nuPO++8k88++8zh8YiUZWU6cTp37hzNmjXjww8/LLY2GzVqRExMTPbX1q1bi61tkYtZLBaGDx/O1KlTmThxInXr1uW6667Lc9+1a9fSvXt3AD777DP++OMP1q1bxwsvvABASkpKgdstTHUyi8XCPffcQ2pqKhEREQWe23TpMQp6nq5w6Ycnnp6ehIWFcfLkyUIdJz4+/opV9mbOnMnAgQOpXLkyU6dOZdWqVdmJZFHLeWfFmdfrGhUVhd1uzx4qmCXrTWeWrIIJBbmO/Pz8WLduHevWrWPLli0kJCTw888/ZxeFOH78OFu2bMn1RjMoKAjDMDhx4kSO4xXmejx58mS+55n184IeOzQ0NMd9b29v/P398fX1zbX94temsK/hpc81mM931nN9+vRpbDbbFa+d48ePM3fu3FzPa6NGjQByPa8FUa1aNQD2799fqMdNnjyZdevWsXHjRo4dO8aWLVu49tprs3/u6WnWvbLZbHk+PjMzs0DFErKe02nTpgGwaNEiYmJisofpAXTq1InZs2eTmZnJsGHDqFKlCo0bN84xr7Kgsp6PSxPloKCg7Gt+7NixeT62Vq1arFu3jvXr17Nt2zYSEhKYOnVqjmqEnp6el31OABWRELmCMl1Vr2fPnjk+3btUeno6L774It988w0JCQk0btyYt956q8jrvoD5h0u9TOIu7rnnHl566SUmTpzI66+/nu9+06ZNw8vLi3nz5uV4Y1eUXtpL1z66nJiYGB555BGaN2/O9u3bGTVqFP/9738L3WZBzzPr3C6eEA9Fe1NYULGxsTkqwWVmZnLy5Mk83/BeTsWKFTly5Mhl95k6dSrR0dF8//33OV6HS8+3MLLijImJyfWzY8eO4eHh4dA5Fh4eHtkV1fJSoUIF/Pz8csxru/TnFyvM9RgWFpbveV7tsQvK0a9haGgoVqv1itdOhQoVaNq0ab6/P1nJY2HcdNNN/O9//2P27Nn5FkPIS4MGDS57DWT1POXX+3P06NEC9U5l9S5++eWXjBw5ki+//JKoqKjsD5Gy9O3bl759+5KWlsbq1asZP348Q4YMoUaNGrRv377A59WqVSvKly/P3LlzeeONN7K3W63W7PPdtm1bno/19fW97HMC5vOybt06DMPIdW1mPVeO7LUTKY3KdI/TlQwfPpw//viDadOmZVdA6tGjB3v37i3yMffu3UtUVBTR0dHccccdOSboihS3ypUrM3r0aPr06cPdd9+d735Zi01ardbsbSkpKUyZMiXXvhd/mn01bDYbgwcPxmKxZE9W/+CDD3JMlC+ogp5nVhW8LVu25Ng+Z86cArVTlHPPWnsly/Tp08nMzCz0BzQ9e/Zk6dKl7N69O999LBYL3t7eOd40xcbG5lmRraDnUq9ePSpXrsy3336bo/DCuXPnmDFjRnalveJy8803s2/fPsLCwmjdunWur6tZ+PSGG25gx44dORZlBbMHxGKx0LVr16uM/soK8xoWhJ+fH507d+aHH3647AcEN998M9u2baNWrVp5Pq9FSZz69u1LkyZNGD9+fL4JwaJFi3IN9bySa665hsDAwDzX9dqxYwfbt2/nxhtvLNCxhg8fzpo1a/j999+ZO3cud999d46/gxfz8fGhc+fO2esmbdy4sVBxe3t7M3r0aLZt2+aUtZduvPFGkpKSWLhwYa6fTZ8+HQ8PD66//nqHtytSmpTpHqfL2bdvH9999x1HjhzJ/ocwatQoFi5cyJdffpnj06CCateuHZMnT6Zu3bocP36c1157jQ4dOrB9+/ZCf7os4ihvvvnmFffp3bs377//PkOGDOGBBx7g5MmTvPvuu3muSdOkSROmTZvG999/T82aNfH19S30vCSAsWPHsnLlShYvXkxERARPP/00y5cv595776VFixZER0cX6ngFOc+IiAhuvPFGxo8fT/ny5alevTq//vprgZO1opz7zJkz8fT0pFu3bmzfvp1///vfNGvWjIEDBxaozSxZ1dA6derE888/T5MmTUhISGDhwoU89dRT1K9fn5tvvpmZM2fy8MMPc9ttt3H48GFeffVVIiMjc30g1KR
JE5YtW8bcuXOJjIwkKCiIevXq5WrXw8ODt99+mzvvvJObb76ZkSNHkpaWxjvvvENCQkKBnndHeuKJJ5gxYwadOnXiySefpGnTptjtdg4dOsTixYt5+umnadeuXZGO/eSTTzJ58mR69+7NK6+8QvXq1fn555/5+OOPeeihh6hbt66Dzya3wryGBfX+++/TsWNH2rVrx5gxY6hduzbHjx9nzpw5fPrppwQFBfHKK6+wZMkSOnTowGOPPUa9evVITU3lwIEDzJ8/n4kTJ2YP97v33nv5+uuv2bdv32XnOVmtVmbNmkX37t1p3749Dz30EF27diUgIICDBw/y448/Mnfu3FxDPa8kKCiIcePG8fTTT2O32xk0aBDly5dn69atvPHGG1SvXp3HHnusQMcaPHgwTz31FIMHDyYtLS3X4tYvvfQSR44c4YYbbqBKlSokJCTwf//3f3h5edG5c+fs/Tw9PencuTO//vrrZdt79tln2bVrF2PGjGHFihUMGjSIGjVqkJaWxj///MPnn3+O1Wot0ocRd955Jx9//DEDBw5kzJgxtGnThpSUFObPn89nn33Gv/71L2rWrFno44qUKa6tTeE+OF+5K8v06dMNwAgICMjx5enpaQwcONAwjAuVky739cgjj+Tb5tmzZ43w8PBc1ZVEnOVK1aay5FUd7osvvjDq1atn+Pj4GDVr1jTGjx9vTJo0yQCM/fv3Z+934MABo3v37kZQUJABZFevyqrG9sMPP+Rq79JKbYsXLzY8PDxyVXg6efKkUa1aNaNNmzZGWlqaU84zJibGuO2224zQ0FAjJCTEGDp0qLF+/foCVdXL79wvV1Xvr7/+Mvr06WMEBgYaQUFBxuDBg43jx4/nOG5BquoZhmEcPnzYGDFihBEREWF4eXkZUVFRxsCBA3Mc78033zRq1Khh+Pj4GA0aNDA+++yzPM9l06ZNxrXXXmv4+/sbQHb7+VUenD17ttGuXTvD19fXCAgIMG644Qbjjz/+yLFPVjvx8fE5tme9XhdfR3nJqqp3JWfPnjVefPFFo169eoa3t7cREhJiNGnSxHjyySezK68ZhnHZv9F5VdUzDMM4ePCgMWTIECMsLMzw8vIy6tWrZ7zzzjs5Kgpmvd7vvPNOrsfn9xzkd255VUIr6GuY3/lVr149V8XEHTt2GLfffrsRFhZmeHt7G9WqVTPuueeeHBXv4uPjjccee8yIjo42vLy8jNDQUKNVq1bGCy+8YJw9ezbHuRTk9cySkJBgvPrqq0bLli2NwMBAw8vLy6hWrZoxdOjQHNdQQX+vs0yfPt3o2LGjERQUZHh6ehrVqlUzHnrooRzXQEEMGTLEAIxrr70218/mzZtn9OzZ06hcubLh7e1tVKpUyejVq5excuXKHPtd/DtUEHPmzDH69OljhIeHG56enkZQUJDRvHlz4+mnnzZ27dqVY9/LVcu7VFJSkvHMM88YderUMby9vQ1/f3+jdevWxsSJE69Y1VJEDMNiGJcsalFGWSwWZs2alb2exPfff8+dd97J9u3bc3XLBwYGEhERQUZGBvv27bvsccuXL3/ZMcPdunWjdu3afPLJJ1d9DiJScrz88suMGzeO+Pj4XHNjRERExP1oqF4+WrRogc1mIy4uLt8KXF5eXjkWNCystLQ0du7c6VYVvkREREREJLcynTidPXs2e8VxMEuibtq0idDQUOrWrcudd97JsGHDeO+992jRogUnTpzgt99+o0mTJvTq1avQ7Y0aNYo+ffpQrVo14uLieO2110hKSrrsZHUREREREXG9Mj1Ub9myZXlWQbr77rv56quvyMjI4LXXXmPy5MkcPXqUsLAw2rdvz7hx44o02f2OO+5gxYoVnDhxgooVK3LNNdfw6quv0rBhQ0ecjoiIiIiIOEmZTpxEREREREQKQus4iYiIiIiIXIESJxERERERkSsoc8Uh7HY7x44dIygoKMfK6yIiIiIiUrYYhsGZM2eIiorCw+PyfUplLnE6duwYVatWdXUYIiIiIiLiJg4fPkyVKlUuu0+ZS5yCgoIA88kJDg52cTRSVBkZGSxevJju3bvj5eXl6nCklNP1JsVN15wUN11zUpzc6XpLSkqiatWq2TnC5ZS5xClreF5wcLASpxIsIyMDf39/goODXf4LJ6WfrjcpbrrmpLjpmpPi5I7XW0Gm8Kg4hIiIiIiIyBUocRIREREREbkCJU4iIiIiIiJXUObmOImIiIiIlFSGYZCZmYnNZnN1KEWWkZGBp6cnqampxXIeXl5eWK3Wqz6OEicRERERkRIgPT2dmJgYkpOTXR3KVTEMg4iICA4fPlws66paLBaqVKlCYGDgVR1HiZOIiIiIiJuz2+3s378fq9VKVFQU3t7exZJ0OIPdbufs2bMEBgZecdHZq2UYBvHx8Rw5coQ6depcVc+TEicRERERETeXnp6O3W6natWq+Pv7uzqcq2K320lPT8fX19fpiRNAxYoVOXDgABkZGVeVOKk4hIiIiIhICVEciUZp46ieOT3zIiIiIiIiV6DESURERERE5Ao0x0lEREREpIyw2Q3W7j9F3JlUKgX50jY6FKtHySwyUdzU4yQiIiIiUgYs3BZDx7d+Y/Bnq3l82iYGf7aajm/9xsJtMU5vOzY2ln/961/UrFkTPz8/GjVqxC233MKvv/4KQI0aNbBYLFgsFvz9/WncuDGffvpp9uNffvllmjdvnuu4CQkJWCwWli1b5vRzUOIkIiIiIlLKLdwWw0NTNxCTmJpje2xiKg9N3eDU5OnAgQO0atWK3377jbfffpvNmzfz448/0rVrVx555JHs/V555RViYmLYsmUL/fr148EHH+T77793WlyFpaF6IiIiUuJp+JGURYZhkJJhu+J+NrvB2DnbMfI6BmABXp6zg2trVyjQ742fl7VQleoefvhhLBYLa9euJSAgALvdTlJSEu3atePee+/N3i8oKIiIiAgAXnvtNaZPn87s2bMZNGhQgdtyJiVOIiIiUqIt3BbDuLk7cnySHhniy9g+DenRONKFkYk4V0qGjYYvLbrq4xhAbFIqTV5eXKD9d7xyE/7eBUsjTp06xcKFC3n99dcJCAjI9fNy5crl+1hfX18yMjIK1E5x0FA9ERERKbFcOfxIRK7s77//xjAM6tevX+DHZGZm8tVXX7F161ZuuOEGJ0ZXOOpxEhERkRLJZjcYN3fHZYcfjZu7g24NIzRsT0olPy8rO1656Yr7rd1/inu+XHfF/b4a3oa20aEFaregDMP8DS3I0L5nn32WF198kbS0NLy9vRk9ejQjR44scFvOpsRJRERESqS1+0/l6mm6mAHEJKaydv8p2tcKK77ARIqJxWIp0JC56+pUJDLEl9jE1Dw/aLAAESG+XFenosM/ZKhTpw4Wi4WdO3fSr1+/y+47evRo7rnnHvz9/YmMjMyRbAUHB5OYmJjrMQkJCQCEhIQ4Muw8aaieiIiIlEhxZ/JPmoqyn0hpZfWwMLZPQ8BMki6WdX9sn4ZO6ZkNDQ3lpptu4qOPPuLcuXO5fp6V+ABUqFCB2rVrExUVla
uHqn79+hw5coTY2Ngc29etW4eHhwe1a9d2eOyXUuIkIiIiJVKlIF+H7idSmvVoHMknQ1sSEZLz9yEixJdPhrZ0aiGVjz/+GJvNRtu2bZkxYwZ79+5l9+7dfPDBB7Rv375Ax+jevTsNGjTgjjvu4I8//mD//v389NNPjBo1igcffJCgoCCnxZ9FQ/VERESkRGobHVqg4UcFmbMhUhb0aBxJt4YRxV66Pzo6mg0bNvD666/z9NNPExMTQ4UKFWjVqhWffPJJgY7h6enJ4sWLef7557nzzjuJi4ujevXq3HfffTzzzDNOjT87hmJpRURERMTBsoYfPTh1Q777OGv4kUhJZfWwuGTOX2RkJB9++CEffvhh9jpOwcHBeHiYA+AOHDhwxWNERETwxRdfODnS/GmonoiIiJRYPRpH0rNxRK7tXlaL04cfiUjZosRJRERESiy73WDrUbPS1r+ur824W8wJ8Jk2g3bRqqQnIo6jxElERERKrPUHT3PkdAqBPp483KU2d3eIpl54EAawYm+8q8MTkVJEiZOIiIiUWLM2HgGgR+MI/LzNRTm71K8IwPLdSpxExHGUOImIiEiJlJphY96WGAAGtKicvb1L3UoALN8Tj92eV709EZHCU+IkIiIiJdLSXXGcSc0kMsSXa2pemM/UukZ5An08OXkunW3HEl0YoYiUJkqcREREpESaufEoALc0j8LjopLjXlYPrq1tJlLLNFxPRBxEiZOIiIiUOKfPpbNsdxwAA1pUyfXzLvXM4XpZ+4iIXC0lTiIiIlLizNsaQ4bNoGFkMPUignL9vEs9s0DEpsMJnD6XXtzhiUgppMRJRERESpxZG8xqev0vKgpxscgQP+qFB2E3VJZcJAe7DfavhK0/mt/tNldHVGIocRIREZES5eDJc2w4lICHBfo2j8p3P5UlF7nEjjkwoTF8fTPMuNf8PqGxud3JYmNjefzxx6lduzb+/v7UrVuXTp06MXHiRJKTkwGoUaMGFosFi8WCv78/jRs35tNPP80+xssvv0zz5s1zHTshIQGLxcKyZcuceg5KnERERKREmXW+KMS1tStQKdg33/1UllzkIjvmwPRhkHQs5/akGHO7E5Onf/75hxYtWrB48WLeeOMN/vrrL2bNmsXjjz/O3Llz+eWXX7L3feWVV4iJiWHLli3069ePBx98kO+//95psRWGp6sDEBERESkowzCyE6f8hullubQsedMq5YohQpFiZBiQkXzl/ew2WPAMkNcHCAZggYXPQs0u4GG98vG8/MFiufJ+5z388MN4enqyfv16AgICsNvtJCUl0b59e26//XYM40JcQUFBREREAPDaa68xffp0Zs+ezaBBgwrcnrMocRIREZESY+PhBA6eTMbPy8pNjSIuu29WWfJF24+zbHe8EicpfTKS4Y38h6sWnGH2RL1ZtWC7P38MvAMKtOvJkyeze5oCAvJ+jOUySZivry8ZGRkFi8vJNFRPRERESoxZG8zeph6NIwjwufLnvypLLuJaf//9N4ZhUK9evRzba9WqRXBwMIGBgTz77LO5HpeZmclXX33F1q1bueGGG4or3MtSj5OIiIiUCOmZduZuMedn9LvCML0sWWXJN54vS14+wNtp8YkUOy9/s/fnSg7+Cd/cduX97vwRqncoWLuFdGmv0q+//oq/vz933XUXaWlp2dufffZZXnzxRdLS0vD29mb06NGMHDmy0O05gxInERERKRGW74knITmDikE+XFsrrECPiQzxo35EELtiz7Bibzx9mxcs4RIpESyWgg2Zq3U9BEeZhSDynOdkMX9e6/qCzXEqhNq1a2OxWNi1a1eO7TVq1CA4OBg/P78c20ePHs0999yDv78/kZGRORKu4OBgEhMTc7WRkJAAQEhIiENjv5SG6omIiEiJMGujuXZT32ZReFoL/hamcz2VJZcyzsMKPd46f+fS+UTn7/d40+FJE0BYWBjdunXjww8/5Ny5c1fcv0KFCtSuXZuoqKhcvVT169fnyJEjxMbG5ti+bt06PDw8qF27tkNjv5QSJxEREXF7iSkZ/LLTnKdU0GF6WVSWXARoeAsMnAzBkTm3B0eZ2xve4rSmP/74YzIzM2ndujXff/89O3fuZO/evUydOpVdu3ZhtRYsYevevTsNGjTgjjvu4I8//mD//v389NNPjBo1igcffJCgoCCnnQNoqJ6IiIiUAAu2xpCeaadueCCNooIL9ViVJRc5r+EtUL+3Oefp7HEIDDfnNDmhp+litWrVYuPGjbzxxhs899xzHDlyBB8fHxo2bMioUaN4+OGHC3QcT09PFi9ezPPPP8+dd95JXFwc1atX57777uOZZ55x6jmAEicREREpAWZmr91U5bKli/OisuQiF/GwQvR1xd5sZGQkH3zwAR988EH2Ok7BwcF4eFwYAHfgwIErHiciIoIvvvjCiZHmT0P1RERExK0dPpXM2v2nsFigb/OirVmjsuQicrWUOImIiIhbm7PZLLd8TXQYUeX8rrB33i4tSy4iUlhKnERERMRtGYbBzA1mNb3+LYteSjyrLLlhwIq9qq4nIoWnxElERETc1tajieyLP4ePpwc9G0dc1bFUllxEroYSJxEREXFbs84XhejWMJwgX6+rOpbKkktpYBi6dgvLUc+ZEicRERFxS5k2O3PPz28acBXD9LJcWpZcpCTx8jI/OEhOTnZxJCVPero5r7Gg60XlR+XIRURExC2t3HuCE2fTCQvw5ro6Fa/6eCpLLiWZ1WqlXLlyxMWZlSH9/f0LXZrfXdjtdtLT00lNTc1RjtxZbcXHx+Pv74+n59WlPkqcRERExC1lDdPr0ywKL6tj3lx1qVeJRduPs3R3HI/dUMchxxQpLhER5jy/rOSppDIMg5SUFPz8/Iol+fPw8KBatWpX3ZYSJxEREXE7Z9MyWbwjFoD+La5+mF6WrLLkm86XJS8f4O2wY4s4m8ViITIykkqVKpGRkeHqcIosIyODFStW0KlTp+whiM7k7e3tkJ4tJU4iIiLidhZsjSE1w07NigE0rRLisONmlSXfFXuGFXvj6dvccUmZSHGxWq1XPV/HlaxWK5mZmfj6+hZL4uQoKg4hIiIibmf2JnOYXv/mlR0+lEdlyUWkKJQ4iYiIiFuJSUzhz30nAejnwGF6WVSWXESKQomTiIiIuJWfNh3DMKBtjVCqhvo7/PgqSy4iRaHESURERNzK7PPV9JzR2wQXypIDLNNwPREpICVOIiIi4jZ2HEtiV+wZvK0e9G4S6bR2utQzh+st3V2yyzqLSPFR4iQiIiJuY9bGIwDc0KASIf7Oq7Z1aVlyEZErUeIkIiIibsFmN/hp0zHAecP0smSVJTcMWLFXw/VE5MqUOImIiIhb+HPfCeLOpFHO34uu54fSOZPKkotIYShxEhEREbcwa4NZFOLmppF4ezr/LYrKkotIYShxEhEREZdLTs9k4fZYAPo7eZheFpUlF5HCUOIkIiIiLrd4+3GS021UD/OnZbXyxdLmxWXJl+7ScD0RuTwlTiIiIuJyM7PWbmpeGYvFUmztZs2lWrZHZclF5
PKUOImIiIhLxZ1J5ffzle2cXU3vUp1VllxECkiJk4iIiLjUnE3HsBvQolo5oisEFGvbKksuIgWlxElERERcatb5YXoDirm3KYvKkotIQShxEhEREZfZc/wM248l4elhoXfTKJfEoLLkIlIQSpxERETEZbJ6m7rUq0RogLdLYlBZchEpCCVOIiIi4hJ2u8FPWcP0WrpmmB6YZck71q4AqCy5iORPiZOIiIi4xJr9pziWmEqQryfX16/k0li6nJ/npLLkIpIfJU4iIiLiErM2HgGgd5NIfL2sLo1FZclF5EqUOImIiEixS82wsWBrLAD9XVRN72IqSy4iV6LESURERIrdLzuPcyYtk8rl/GhTI9TV4QAqSy4il6fESURERIrdrA1mUYh+LaLw8LC4OBqTypKLyOUocRIREZFidfJsGsv3mL067jBML8vFZcm3HlVZchHJyaWJ05kzZ3jiiSeoXr06fn5+dOjQgXXr1l32McuXL6dVq1b4+vpSs2ZNJk6cWEzRioiIiCPM2xJDpt2gSeUQalcKcnU42S4uS75Mw/VE5BIuTZzuu+8+lixZwpQpU9i6dSvdu3fnxhtv5OjRo3nuv3//fnr16sV1113Hxo0bef7553nssceYMWNGMUcuIiIiRTXz/NpN7tTblEVlyUUkPy5LnFJSUpgxYwZvv/02nTp1onbt2rz88stER0fzySef5PmYiRMnUq1aNSZMmECDBg247777GDFiBO+++24xRy8iIiJFsS/+LJsPJ2D1sNCnWZSrw8lFZclFJD+ermo4MzMTm82Gr69vju1+fn78/vvveT5m1apVdO/ePce2m266iUmTJpGRkYGXl1eux6SlpZGWlpZ9PykpCYCMjAwyMjKu9jTERbJeO72GUhx0vUlxK83X3My/DgPQsVYY5Xw93O4cK/h7Ui88kN3Hz7J0Vyx9mka6OqRiUZqvOXE/7nS9FSYGlyVOQUFBtG/fnldffZUGDRoQHh7Od999x5o1a6hTp06ej4mNjSU8PDzHtvDwcDIzMzlx4gSRkbn/uI0fP55x48bl2r548WL8/f0dczLiMkuWLHF1CFKG6HqT4lbarjnDgGkbrYCF6hxn/vz5rg4pT1WsHuzGg++WbsZ6ZKOrwylWpe2aE/fmDtdbcnJygfd1WeIEMGXKFEaMGEHlypWxWq20bNmSIUOGsGHDhnwfY7HkLFlqGEae27M899xzPPXUU9n3k5KSqFq1Kt27dyc4ONgBZyGukJGRwZIlS+jWrVuePY0ijqTrTYpbab3m1h88zcnV6wjwtjLqjhvw87a6OqQ8he0/xa9frGdfig89enRxm3LpzlRarzlxT+50vWWNRisIlyZOtWrVYvny5Zw7d46kpCQiIyMZNGgQ0dHRee4fERFBbGxsjm1xcXF4enoSFhaW52N8fHzw8fHJtd3Ly8vlL5RcPb2OUpx0vUlxK23X3NytxwHo0TiS4ADfK+ztOu1qVSTQx5NT5zLYFZdMs6rlXB1SsSlt15y4N3e43grTvlus4xQQEEBkZCSnT59m0aJF9O3bN8/92rdvn6tLb/HixbRu3drlT7qIiIjkLy3Txs9bYgAY0NL9quldTGXJRSQvLk2cFi1axMKFC9m/fz9Lliyha9eu1KtXj+HDhwPmMLthw4Zl7//ggw9y8OBBnnrqKXbu3MkXX3zBpEmTGDVqlKtOQURERApg6a44ElMyiAj25ZqaeY8ScScqSy4il3Jp4pSYmMgjjzxC/fr1GTZsGB07dmTx4sXZvUcxMTEcOnQoe//o6Gjmz5/PsmXLaN68Oa+++ir//e9/ufXWW111CiIiIlIAs86v3dS3eRTWEjBnSGXJReRSLp3jNHDgQAYOHJjvz7/66qtc2zp37nzZ4hEiIiLiXhKS0/ltl9lz09/Nh+lliQzxo35EELtiz7Bibzx9m5eMuEXEedxijpOIiIiUXvO2xJBhM2gQGUz9iJJT0Tar12m55jmJCEqcRERExMlmnx+m179FlIsjKZwudSsBsHxPPHa74eJoRMTVlDiJiIiI0xw6mcz6g6fxsFDihru1rlGeQB9PTp5LZ+vRRFeHIyIupsRJREREnCarKMS1tSsQHuy+azflRWXJReRiSpxERETEKQzDYPYmM3HqV8J6m7KoLLmIZFHiJCIiIk6x6XAC+0+cw8/LSo/GEa4Op0hUllxEsihxEhEREafIGqZ3U6NwAnxcugJKkWWVJTcMWLFXw/VEyjIlTiIiIuJwGTY7czcfA6Bfi5I5TC+LypKLCChxEhERESdYvjue08kZVAj0yS6wUFJ1raey5CKixElEREScIGuYXt/mUXhaS/bbjVbVyxOksuQiZV7J/ksmIiIibicpNYMlO48D0L+ED9MDsyz5tSpLLlLmKXESERERh1qwNYb0TDt1KgXSKCrY1eE4hMqSi4gSJxEREXGomRvMYXr9W1bGYrG4OBrHUFlyEVHiJCIiIg5zNCGFNftPAdC3hC56mxeVJRcRJU4iIiLiMLPPF4W4pmYolcv5uTgax8rqddI8J5GySYmTiIiIOIRhGNnV9Aa0qOLiaBwvqyz5CpUlFymTlDiJiIiIQ2w/lsTfcWfx8fSgR5MIV4fjcCpLLlK2KXESERERh8gqCnFjw3CCfb1cHI3jqSy5SNmmxElERESuWqbNzpzNxwAYUArWbsqPypKLlF1KnEREROSq/f73CU6cTSM0wJtOdSu6OhynUVlykbJLiZOIiIhctayiEH2aRuJlLb1vL1SWXKTsKr1/2URERKRYnE3LZNH2WAD6tyx91fQu1eV8dT3NcxIpW5Q4iYiIyFVZtC2W1Aw70RUCaFYlxNXhOF3WPCeVJRcpW5Q4iYiIyFXJGqbXv0VlLBaLi6NxPpUlFymblDiJiIhIkcUmpvLHvhMA9GteeqvpXUxlyUXKJiVOIiIiUmRzNh/FMKB19fJUC/N3dTjFRmXJRcoeJU4iIiJSZFmL3vZvWTZ6m7JcXJb8lMqSi5QJSpxERESkSHbGJLEr9gzeVg9ubhLl6nCK1cVlyVeqLLlImaDESURERIpk9vmiENfXr0SIv5eLoyl+KksuUrYocRIREZFCs9kNZm8yE6d+LcrWML0sKksuUrYocRIREZFCW7XvJMeT0gjx86Jr/YquDsclVJZcpGxR4iQiIiKFlrV2081NI/HxtLo4GtdQWXKRskWJk4iIiBRKSrqNhdtiAHPR27JMZclFyg4lTiIiIlIoi3fEci7dRrVQf1pVL+/qcFxKZclFyg4lTiIiIlIoWcP0+rWojMVicXE0rqWy5CJlhxInERERKbD4M2ms3HsC0DC9LCpLLlI2KHESERGRApuz+Rg2u0HzquWIrhDg6nDcgsqSi5QNSpxERESkwLIWvR3QUr1NWVSWXKRsUOIkIiIiBfJ33Bm2Hk3E08PCzU2jXB2O21BZcpGyQYmTiIiIFMjMDWZvU5d6FQkN8HZxNO5FZclFSj8lTiIiInJFdrvBT5uOAdC/RRUXR+N+VJZcpPRT4iQiIiJXtPbAKY4mpBDk48kNDSq5Ohy3o7Lk
IqWfEicRERG5olnnh+n1ahKJr5fVxdG4J5UlFyndlDiJiIjIZaVm2Ji/NQaA/qqmly+VJRcp3ZQ4iYiIyGX9ujOOM2mZVC7nR9saoa4Ox22pLLlI6abESURERC5r1sYjAPRtHoWHh8XF0bgvlSUXKd2UOImIiEi+Tp5Ny04C+rfQML0rUVlykdJLiZOIiIjk6+etMWTaDRpXDqZOeJCrw3F7WQUiVJZcpPRR4iQiIiL5ylr0Vms3FUxEiK/KkouUUkqcREREJE//xJ9l0+EErB4WbmkW5epwSgyVJRcpnZQ4iYiISJ5mbzoGQMfaFagY5OPiaEoOlSUXKZ2UOImIiEguhmEwe6M5TG+A1m4qFJUlFymdlDiJiIhILn8dPM2hU8kEeFvp3jDC1eGUKCpLLlI6KXESERGRXGad7226qXEEft5WF0dT8mQN11u6W2XJRUoLJU4iIiKSQ1qmjXlbYgAYoGp6RZJVIGLzEZUlFyktlDiJiIhIDkt3xZOYkkF4sA/ta4W5OpwSSWXJRUofJU4iIiKSQ1ZRiL7NK2P1sLg4mpJLZclFShclTiIiIpItMTmD33aZ83L6tyhB1fTsNti/Erb+aH6321wdkcqSi5Qynq4OQERERNzHvK3HSLfZqR8RRIPIYFeHUzA75sDCZyHp2IVtwVHQ4y1oeIvLwrq0LHmzquVcFouIXD31OImIiEi2rGF6Jaa3acccmD4sZ9IEkBRjbt8xxzVxobLkIqWNEicREREB4PCpZNYdOI3FYs5vcnt2m9nTRF7D4M5vWzjGpcP2utZXWXKR0kKJk4iIiAAX1m66tlYFIkJ8XRxNARz8M3dPUw4GJB0193ORznVVllyktFDiJCIiIhiGkT1Mr19JGaZ39rhj93MClSUXKT2UOImIiAibjyTyz4lz+Hp50KNxhKvDKZjAcMfu5yQqSy5SOihxEhEREWZtOALATY0iCPQpIUV3q3cwq+flywLBlc39XEhlyUVKByVOIiIiZVyGzc7cLTFACRqmB+BhhXYPXmYHA3q8ae7nQpeWJReRkkmJk4iISBm3Yk88p86lUyHQh+vOl88uEez2C+XGvfxy/7z6dS5dxymLypKLlA5KnERERMq4meeLQtzSLApPawl6a7D5Wzi6HrwD4ZH1cPc8uHUS9Hrf/Pnh1eZ6Tm5AZclFSr4S9NdRREREHC0pNYNfdphV50rMorcAKadhyVjzdpcxUK4KRF8HTW6DtvdCtfZgz4B1n7k2zvNUllyk5FPiJCIiUoYt3BpLWqad2pUCaVw52NXhFNzSNyD5BFSol/c8p/aPmN/XfwHp54o3tjyoLLlIyafESUREpAybudGspte/RWUsFouLoymgmC2w7nPzdq93wOqVe596vaB8DbNnavN3xRpeflSWXKRkU+IkIiJSRh1NSGH1P6cA6Nv8cmW93YhhwPzRYNihUX+o2Tnv/TyscM3D5u1VH5uFJFxMZclFSjYlTiIiImXUT5vMohDtokOpUt7fxdEU0OZpZtEHrwDo/vrl921+J/iEwKl9sHdR8cR3GSpLLlKyKXESEREpgwzDYNYGM3Ea0LKEFIVITYQlL5m3O4+GkCvE7RMIre8xb//5oVNDKwgvqwcd65hlyVVdT6TkcWnilJmZyYsvvkh0dDR+fn7UrFmTV155BftlutOXLVuGxWLJ9bVr165ijFxERKRk234sib1xZ/H29KBH40hXh1Mwy96Ec3EQVhuueaRgj2k7Ejw84eDvcGyjc+MrgKzheprnJFLyeLqy8bfeeouJEyfy9ddf06hRI9avX8/w4cMJCQnh8ccfv+xjd+/eTXDwheo/FStWdHa4IiIipcas82s3dWsQTohfHsUV3M3x7bDmU/N2z7fB07tgjwupDI0GwNbp5lynW11bnvzSsuShAQU8DxFxOZf2OK1atYq+ffvSu3dvatSowW233Ub37t1Zv379FR9bqVIlIiIisr+sVmsxRCwiIlLyZdrs/LTpGFBC1m7KLghhgwZ9oPYNhXt8+/NFIrbPhMSjjo+vEFSWXKTkcmmPU8eOHZk4cSJ79uyhbt26bN68md9//50JEyZc8bEtWrQgNTWVhg0b8uKLL9K1a9c890tLSyMtLS37flJSEgAZGRlkZGQ45Dyk+GW9dnoNpTjoepPi5uxrbuXeE5w4m0Z5fy/aR5dz+2vbsu1HPA/+geHpR+YNr0Jh463YGGu1Dngc+hPb6onYr3/JOYEWUKc6YeyKPcNvO4/Tq1Ell8aSRX/npDi50/VWmBgshmG4rB6mYRg8//zzvPXWW1itVmw2G6+//jrPPfdcvo/ZvXs3K1asoFWrVqSlpTFlyhQmTpzIsmXL6NSpU679X375ZcaNG5dr+7fffou/fwmpICQiIuJAk/d68NcJD64Lt3NbTdeX6b4cT1sKN+x4Ft/MBHZG3saeiFuKdJyIxA20+2cC6VZ/FjeagM3q6+BIC+7vRPhghyeBngavtrbhUUKWzxIpjZKTkxkyZAiJiYk5pgHlxaWJ07Rp0xg9ejTvvPMOjRo1YtOmTTzxxBO8//773H333QU+Tp8+fbBYLMyZMyfXz/LqcapatSonTpy44pMj7isjI4MlS5bQrVs3vLxKwNh8KdF0vUlxc+Y1dy4tk/ZvLSMlw84PD7SledVyDj2+o3n8Ohbr6o8wykeT+cDv4OlTtAMZdjwnXoPl1D/Yuo/H3uZ+xwZaCBk2O23HL+NsWiYzRrajaZUQl8WSHZP+zkkxcqfrLSkpiQoVKhQocXLpUL3Ro0czZswY7rjjDgCaNGnCwYMHGT9+fKESp2uuuYapU6fm+TMfHx98fHL/kfXy8nL5CyVXT6+jFCddb1LcnHHN/bb1OCkZdqIrBNA6ugIWixt3d8TtgrVmQQhLr3fw8gu8uuO1fwR+fhrruk+xXjPSXCTXBby84Lo6FViwLZaV+07RKrqCS+LIi/7OSXFyh+utMO27tDhEcnIyHh45Q7BarZctR56XjRs3EhlZQkqpioiIuFBWNb1+zSu7d9JkGLBgNNgzoV4vqNPt6o/ZbDD4lYfTB2D3/Ks/3lVQWXKRkselPU59+vTh9ddfp1q1ajRq1IiNGzfy/vvvM2LEiOx9nnvuOY4ePcrkyZMBmDBhAjVq1KBRo0akp6czdepUZsyYwYwZM1x1GiIiIiXC8aRU/vj7BFACqultnwX7V4CnL/QY75hjegdA6xGw8j1Y9ZFZoc9FVJZcpOQpVOKUmJjIrFmzWLlyJQcOHCA5OZmKFSvSokULbrrpJjp06FCoxj/44AP+/e9/8/DDDxMXF0dUVBQjR47kpZcuVLuJiYnh0KFD2ffT09MZNWoUR48exc/Pj0aNGvHzzz/Tq1evQrUtIiJS1szZdAy7Aa2ql6damBsXSEo7C4teMG93fBLK13DcsdvcD3/8Fw6tgiN/QZVWjjt2IWSVJd8Ve4aVe+Pp29zNE1kRKdhQvZiYGO6//34iIyN55ZVXOHfuHM2bN+eGG26gSpUqLF26lG7dutGwYUO+//77AjceFBTEhAkTOHjwICkpKezbt4/
XXnsNb+8Ln7p89dVXLFu2LPv+M888w99//01KSgqnTp1i5cqVSppEREQKYOb5YXpu39u04h04cwzKVYdrH3fssYMjocnt5u1VHzr22IXUpZ7Z66TheiIlQ4F6nJo1a8awYcNYu3YtjRs3znOflJQUZs+ezfvvv8/hw4cZNWqUQwMVERGRotsVm8TOmCS8rBZuburG84JP7DWH0QH0fAu8/BzfRvuHYfO3sOMnSDgE5ao5vo0C6FKvIhOX72P5nnjsdgMP1SUXcWsFSpy2b99OxYoVL7uPn58fgwcPZvDgwcTH65MTERERd5JVFKJrvUqU83fT+TSGAQueAXsG1LkJ6vV0TjsRTSC6M+xfDms+hZted047V9CqenmCfDw5dS6dLUcT3b40vEhZV6CheldKmq52fxEREXEem93gp43HABjQ0o2H6e2cC/t+A6u34wpC5Kf9o+b3DZMhNcm5beXDy+pBxzpmKfJlu+NcEoOIFFyhy5F//fXX/Pzzz9n3n3nmGcqVK0eHDh04ePCgQ4MTERGRq7f6n5PEJqUS7OtJ1/qVXB1O3tKTYdHz5u1rH4ewWs5tr/aNUKEupCXBxinObesyVJZcpOQodOL0xhtv4OdnjjdetWoVH374IW+//TYVKlTgySefdHiAIiIicnWyhun1bhqFj6drFn29opXvQeJhCKkGHZ9yfnseHuaCuACrJ4It0/lt5uHSsuQi4r4KnTgdPnyY2rVrAzB79mxuu+02HnjgAcaPH8/KlSsdHqCIiIgUXUq6jQVbYwA3HqZ3ch/8+V/zdo83wLuYSqU3HQT+YZB4CHbNLZ42L5FVltwwYOVe9TqJuLNCJ06BgYGcPHkSgMWLF3PjjTcC4OvrS0pKimOjExERkauyeEcs59JtVCnvR+vq5V0dTm6GAQueBVs61LoB6t9cfG17+UGb+8zbWZX8XEBlyUVKhkInTt26deO+++7jvvvuY8+ePfTu3RswK+/VqFHD0fGJiIjIVZh90dpNFosblrvevQD+XgIeXtDzbSjuGNvcZxajOLIODq8t3rbPy5rnlFWWXETcU6ETp48++oj27dsTHx/PjBkzCAsLA+Cvv/5i8ODBDg9QREREiib+TBor9p4A3HTR24wUWPisebvDo1ChdvHHEFgJmg40b7toQdxLy5KLiHsq0DpOFytXrhwffpj7D8u4ceMcEpCIiIg4xtzNx7DZDZpVLUfNioGuDie33yeYC9AGV4ZOo10XR/tHYeNUsxz66QNQvkaxNp9VlnzBtliW7Y7Tek4ibqpAPU6HDh0q1EGPHj1apGBERETEcWZvOj9Mr3mUiyPJw6n98Pt/zNs3vQ7eAa6LpVIDc36VYTcr7LmAypKLuL8CJU5t2rTh/vvvZ+3a/Mf+JiYm8tlnn9G4cWNmzpzpsABFRESk8P6OO8uWI4l4eljo08wNE6eFz4EtDaI7Q8N+ro7mQmnyjVMgJaHYm1dZchH3V6Chejt37uSNN96gR48eeHl50bp1a6KiovD19eX06dPs2LGD7du307p1a9555x169uzp7LhFRETkMmZtPAJA57oVCQv0cXE0l9izCPYsAA9P6PVO8ReEyEut66FiA4jfCRsmw7WPFWvzWWXJd8WeYeXeePo2d8M5aSJlXIF6nEJDQ3n33Xc5duwYn3zyCXXr1uXEiRPs3bsXgDvvvJO//vqLP/74Q0mTiIiIi9ntBrM3HgOgv7ut3ZSRapYfB7jmYahYz7XxZLFYLvQ6rfkUbBnFHoLKkou4t0IVh/D19WXAgAEMGDDAWfGIiIjIVVp34BRHE1II8vHkxgbhrg4npz8/gNP7ISgSOj/j6mhyanI7/DoOko7Ajp+gyW3F2nzXehWZuHxfdllyDw836IkTkWyFLkcuIiIi7m3W+bWbejaJwNfL6uJoLpJwCFa+Z97u/hr4BLk2nkt5+ULbB8zbqz40F+ctRi1VllzErSlxEhERKUVSM2z8vDUGgP4tqrg4mkssfA4yU6B6R2h8q6ujyVvrEeDpC8c2wqFVxdp0VllygGW744q1bRG5MiVOIiIipchvu+I4k5pJVIgv7aJDXR3OBX//ArvmgcXqPgUh8hJQAZrdYd5e9VGxN6+y5CLuS4mTiIhIKTJzgzlMr2+Lyu4zRyYzDeafn8/U7kEIb+jaeK7kmofN77t+hpP7irVplSUXcV9KnEREREqJU+fSs4d4DWjhRtX0Vn0Ep/ZBQCXoMsbV0VxZxXpQpztgwOpPirXprLLkhgEr96rXScSdFClxmjJlCtdeey1RUVEcPHgQgAkTJvDTTz85NDgREREpuJ+3HCPTbtAoKpg64W5SeCHxCKx4x7zd/VXwDXZtPAXV/lHz+6ZvIPlUsTbdtb7Kkou4o0InTp988glPPfUUvXr1IiEhAZvNBkC5cuWYMGGCo+MTERGRApp5vppef3fqbVr0AmQkQ7X20HSQq6MpuOhOEN7EjP2vr4q16S51zXlOWWXJRcQ9FDpx+uCDD/jss8944YUXsFovlDht3bo1W7dudWhwIiIiUjAHTpxj46EEPCxwS/MoV4dj2rcUdswGi4d7F4TIy8UL4q79H2QW33wjlSUXcU+FTpz2799PixYtcm338fHh3LlzDglKRERECidr7aaOdSpSKcjXxdFgJhoLzheEaHM/RDRxbTxF0fhWCIyAMzGwfVaxNauy5CLuqdCJU3R0NJs2bcq1fcGCBTRs6OZVckREREohwzCYvclMnNymKMSaT+DEHgioCF2fd3U0RePpDW3vN2+v+qBYF8RVWXIR9+NZ2AeMHj2aRx55hNTUVAzDYO3atXz33XeMHz+ezz//3BkxioiIyGVsOJTAwZPJ+Htb6d4o3NXhQNIxWP62efvGceBXzqXhXJXWI2DlexC7FQ6sNOc+FYNLy5KHBngXS7sikr9CJ07Dhw8nMzOTZ555huTkZIYMGULlypX5v//7P+644w5nxCgiIiKXMWvjEQB6NIrA37vQ/9odb/G/If0sVGkDzQa7Opqr4x8KzYfAus/NsurFlDhllSXfFXuGlXvj6dvcTXoSRcqwIpUjv//++zl48CBxcXHExsZy+PBh7r33XkfHJiIiJZjNbrBq30l+2nSUVftOYlN1MKdIz7Qzb0sMAP1busGb6/0rYduPgAV6vQsepWDJyHYPARbYsxBO7C22ZrPKki/dpXlOIu7gqj6WqlChgqPiEBGRUmThthjGzd1BTGJq9rbIEF/G9mlIj8aRLoys9Fm2O46E5AwqBfnQoZaL/y/bMmD+aPN26xEQ1dyl4ThMhdpQryfsng+rP4ab/1MszXapW5FPlu1jxd4T2O0GHh4lqCqhSClU6I+BTp48ySOPPELDhg2pUKECoaGhOb5ERKRsW7gthoembsiRNAHEJqby0NQNLNwW46LISqesanp9m0dhdfUb67X/g/id4BcK17/o2lgcLas0+abv4NzJYmlSZclF3Euhe5yGDh3Kvn37uPfeewkPD8dSktZkEBERp7LZDcbN3UFeg/IMwAKMm7uDbg0jXP8mvxRITM7g153mMK7+La
q4NpgzsbB0vHn7xpfNuUGlSfVrIbI5xGyC9V9A59FObzKrLPmCbbEs2x1H86rlnN6miOSv0InT77//zu+//06zZs2cEY+IiJRga/efytXTdDEDiElMZe3+U7SvFVZ8gZVS87fFkG6zUz8iiIZRwa4NZslYSD8DUS2hxV2ujcUZLBZo/yjMvM/sWbv2MfD0cXqzXepVPJ84xfPEjXWd3p6I5K/QQ/Xq169PSkqKM2IREZESLu5M/klTUfaTy5u1wRym18/VazcdXAVbpgEW6F1KCkLkpVE/CIqCc3Gw9cdiafLSsuQi4jqF/sv28ccf88ILL7B8+XJOnjxJUlJSji8RESm7KgX5Fmi/KasOsnBbLGmZNidHVHodPpXM2gOnsFjM+U0uY8uE+aPM2y2HQeVWrovF2axe0G6keXvVR8WyIG5EiC8NIoMxDFi5V4vhirhSoYfqlStXjsTERK6//voc2w3DwGKxYLPpn6CISFnVqnp5fD09SM20X3a/9QdPs/7gX5Tz9+LmppEMaFmFFlXLad5sIfy0yext6lArjMgQP9cFsn4SHN8GfuXhhrGui6O4tLrbXNw3bjv8swxqdXV6k13qVWRnTBJLd8VpPScRFyp04nTnnXfi7e3Nt99+q+IQIiKSzTAM/j17W75JU9Z/ixdubkB8UhqzNh4l7kwaU1cfYurqQ0RXCKB/i8r0b1GZqqH+xRd4CWQYBjPPV9Pr58o30mfj4LfXzdvX/xsCysC8Nb/y0PIuWDMRVn1YPImTypKLuIVCJ07btm1j48aN1KtXzxnxiIhICfXmwl18v/4wHha4r2M0c7fE5CgUEXHJOk7P9KjPn/tOMHPDURZui2X/iXO8v2QP7y/ZQ9voUAa0qEyvppEE+3q56pTc1pYjifwTfw5fLw96NnHhuli/vAxpiRDZDFrd47o4ilu7B2HNp/D3LxC3Eyo1cGpzl5YlV3U9EdcodOLUunVrDh8+rMRJRESyfbJsH58u/weA8QOaMKhNNZ7t2YC1+08RdyaVSkG+tI0OzVGC3Oph4bo6FbmuTkVe7ZfJom2xzNx4hD/3nWTt/lOs3X+KsXO2061hOANaVua6OhXxspbSogOFlLV2U/eGEQT6XNVa9kV3eC1s+sa83es98LC6Jg5XCI2GBjfDzrnmgri3fODU5lSWXMQ9FPqv7b/+9S8ef/xxRo8eTZMmTfDyyvlJYNOmTR0WnIiIuL9v1xzirYW7AHi+V30GtakGmIlRQUuOB/p4cmurKtzaqgoxiSnM3niMmRuOsDfuLPO2xDBvSwwVAr25pVllBrSsTKOo4DI7VDzDZmfu5mMA9HdVNT277UJBiOZDoWob18ThSu0fNROnzd/D9S9BYEWnNqey5CKuV+jEadCgQQCMGDEie5vFYlFxCBGRMmjelmO8MHsrAA91qcUDnWpd9TEjQ/x4qEstHuxck21Hk5i58QhzNh3jxNl0vvhjP1/8sZ+64YEMaFmFfs0rExFSsEp+pcXKvfGcPJdOhUBvrqtTwTVB/PUlxGwG3xBzsduyqGo7s4Lg0b/MAhldxji1uUvLkocGeDu1PRHJrdCJ0/79+50Rh4iIlDDL98Tz5PebMAwY3LYaz9zk2CHcFouFJlVCaFIlhOd7NWDFnnhmbjzKkh3H2XP8LG8u2MVbC3dxba0KDGhZmZsaRRDgqmFrxWjWRrO3qU+zKDxdMXTx3En49VXzdtcXnd7T4rYsFmj/CPw4AtZ+Btc+Dl7Oq26YVZZ8Z0wSK/bEu37tLpEyqND/YapXr+6MOEREpAT56+BpHpzyFxk2g95NI3mtX2OnDp3zsnpwQ4NwbmgQTmJKBvO3xjBrw1HWHjjF73+f4Pe/T+DvvY0ejSIY0LIK7WuF5ZhPVVqcSc1g8fZYwIXD9H59GVITILwJtB5xpb1LtwZ9IaQqJB6GLdPNUuVOlFWWfNnuOCVOIi5QoMRpzpw59OzZEy8vL+bMmXPZfW+55RaHBCYiIu5pZ0wSw79cS0qGjevqVOA/A5sXa5IS4ufF4LbVGNy2GodOJjNr41FmbTzCgZPJzNx4lJkbjxIR7EvfFlHc2rIKdcODii02Z1uwLZa0TDu1KgbQpHJI8Qdw5C/YMMW83ftdsJb+Hr7LsnqaFfYWv2AuiNtymNkT5SQqSy7iWgX6i9evXz9iY2OpVKkS/fr1y3c/zXESESndDp48x7Av1pKUmknLauX49K5WeHu6rtJdtTB/Hr+xDo/dUJsNhxKYueEI87bEEJuUyqfL/+HT5f/QuHIw/VtU4ZZmUVQM8nFZrI4w+3w1vQEtqxR/cQy7HeY/DRjQbDBUu6Z423dXLe+CZW/Cid3w969Q50bnNaWy5CIuVaD/dna7ndTUVAzDwG635/ulpElEpPQ6npTK0ElriD+TRv2IIL68py3+3u7R42CxWGhVvTyv92/C2hduYOLQlnRrGI6X1cK2o0m8Om8H14z/lRFfrWPu5mOkZpS8/1cxiSms+uckALc0iyr+ADZOhmMbwScYbhxX/O27K98Qs6cJzAVxnSirLDnAst1xTm1LRHIr8MeE0dHRxMfHOzMWERFxUwnJ6QybtJbDp1KoFurP5BFtCfF3z4VpfTyt9GgcyWfDWrPm+Rt5pW8jmlUth81u8NuuOP713UbavPYLY2ZsYc0/J7HbDVeHXCCzNx7DMKBtdChVQ/2Lt/HkU/DL+WSpy3MQFF687bu7diPB4gH/LIXYbU5tqks9sxjHst16TyZS3AqcOBlGyfjHIiIijpWcnsnwr9ax+/gZKgX5MPXedlQKLhklwEMDvBnWvgY/PXItvz7dmUe71qZyOT/OpGUybd1hBv1vNZ3eWcp7i3fzT/xZV4ebL8MwmLXxCAADXFEU4LdXIeUUVGoIbR8o/vbdXfnq0LCveXv1x05tqku9nGXJRaT4aAl2ERHJV1qmjZFT/mLjoQRC/LyYcm87qoUVc2+Hg9SqGMiom+qx8pmuTHvgGga2rkKgjydHTqfwwW9/c/17y+n/8R9MWX2QhGT3ekO6IyaJPcfP4u3pQc8mkcXb+LGNsP5L83YvFYTIV/tHze9bpsOZWKc1Ex5sliU3DFixR71OIsWpUH/9Pv/8cwIDAy+7z2OPPXZVAYmIiHuw2Q2e/H4TK/eewM/LypfD21AvouRXqPPwsHBNzTCuqRnGuFsas3hHLLM2HmXFnng2Hkpg46EEXpm7nevrV2JAyyp0rVfJpQUwAGZtMItC3NigEiF+xThE0m6Hn0cBBjS5HWpcW3xtlzRVWpuL4h5eA+s+h+tfdFpTKksu4hqFSpwmTpyI1WrN9+cWi0WJk4hIKWAYBi/M2sr8rbF4WS38b1grWlYr7+qwHM7P20rf5pXp27wycWdSmbPpGDM3HGVHTBKLth9n0fbjlPP3ok/TKAa0rEzzquWKvZqdzW7w02Zz0dv+LaoUa9ts/haOrgfvQOj2avG2XRK1f+R84jQJOj4F3s7pnVVZchHXKFTitH79eipVquSsWERExE28tXA309YdxsMC/3dHC66rU9HVITldpSBf7ruuJvddV
5NdsUnM2nCUWRuPEncmjSmrDzJl9UFqVgigf4vK9GtRudgKNPzx9wniz6RR3t+LznWL8XVIOQ1Lxpq3Oz8LwcU8RLAkqn8zlKsOCQdhyzSnLRCssuQirlHgsQfFvl6EiIi4xCfL9jFx+T4A3ujfhF7FPafGDdSPCOa5Xg1Y9dwNTB7Rln7No/DzsvLPiXO8t2QP1729lIGfruL7dYdISs1waiyzzq/ddHPTqOIdMrj0DUg+ARXqwTUPFV+7JZmHFa552Ly96iNzqKMTqCy5iGuoqp6IiGT7bu0h3lq4C4DnetbnjrbVXByRa1k9LHSqW5EJd7Rg3Ys38u7tzbi2dhgWC6zdf4pnZ2ylzWu/8Oi3G1i6K45Mm2PfKJ9Ly2ThNrPQQP+WxTiXJWaLOU8HoNc7YHXP0vNuqcWd4BMCJ/+GvYud1ozKkosUvwIP1Rs7duwVC0OIiEjJ9fOWGJ6ftRWABzvXYmTnWi6OyL0E+nhyW6sq3NaqCscSUpi96SgzNxzl77izzNsSw7wtMVQI9OGWZuZ8qEZRwVc9WuOXnXGkZNioEeZPi+IajmUYMH80GHZo1B9qdi6edksLnyBodTf8+V9zQdx6PZzSzMVlyU+eTSMs0Mcp7YjIBQXucRo7diz+/iWzBK2IiFzeij3xPPH9RgwDBretyrM96rk6JLcWVc6Ph7vUZsmTnZj7aEfu6VCD0ABvTpxN44s/9nPzB7/TY8JKJi7fR2xiapHbmb05BoB+LSoX35D5zdPg8GrwCoDurxdPm6VNu5FgscKBlRCz2SlNXFyWfOXeE05pQ0Ry0jpOIiJl3F8HTzNyyl9k2Ax6N4nktX5NNK+1gCwWC02qhPDyLY1Y8/wNTLq7Nb2bROLt6cHu42d4c8Eu2r/5K3dNWsPMDUc4l5ZZ4GMnpsOf+04C0L+4Sk6nJsKSl8zbnUdDiEpdF0lIFbO3DmCV8xbEvTBcT/OcRIqDVrETESnDdsUmMeKrdaRk2LiuTgX+M6g5VpU2LhIvqwc3NAjnhgbhJKZkMH9rDDM3HGHdgdOs3HuClXtP4O+9jR6NIxjQogrta4Xl+Vzb7AZr9p9izkEP7Aa0qBpC9bCA4jmJZW/CuTgIqw3XPFI8bZZW7R+BbT+aXzeOheAohzehsuQixUuJk4hIGXXoZDJ3TVpLYkoGLauV49O7Wrl8odfSIsTPi8FtqzG4bTUOnUxm1sajzNx4hIMnk5m5wZwbFRHsS78WlRnQsjJ1w82FhRdui2Hc3B3EJKaSNShkX/w5Fm6LoUdjJ1c3PL4d1nxq3u75Nnh6O7e90q5yS6h+LRz8A9b+D2582eFNqCy5SPEq0n/IzMxMfvnlFz799FPOnDkDwLFjxzh79qxDgxMREeeIS0pl6KQ1xJ9Jo35EEF/e0xZ/b32W5gzVwvx5/MY6LBvVhRkPdeDOdtUI9vUkNimVicv30f0/K7j5g5WM/mEzD07dcD5puuBMaiYPTd3Awm0xzgsyuyCEDRr0gdo3OK+tsqT9+V679V9AmuPfI6ksuZREWb3qf52wsGb/KWz2klO5u9D/JQ8ePEiPHj04dOgQaWlpdOvWjaCgIN5++21SU1OZOHGiM+IUEREHSUhO565Jazl0Kplqof5MHtGWEH+Vm3Y2i8VCq+rlaVW9PC/1achvO+OYufEoS3fFse1oEtuOJuX5OAOwAOPm7qBbwwjnDKXc+qPZM+LpBzeNd/zxy6q6PSC0Jpz6BzZ/B23vd3gTXetVYsG2WJbtjueJG+s6/PgijpSzV93K5L3riQzxZWyfhs7vVXeAQvc4Pf7447Ru3ZrTp0/j5+eXvb1///78+uuvDg1OREQcKzk9k+FfrWP38TNUCvJh6r3tqBTs6+qwyhwfTys9m0Ty2bDWrH3hRu7pUOOy+xtATGIqa/efcnwwqUmw+EXzdqenoVxVx7dRVl28IO7qj8Fuc3gTnc8XiMgqSy7irhZui+GhPHrVYxNTnd+r7iCFTpx+//13XnzxRby9c459rl69OkePHnVYYCIi4lhpmTZGTvmLjYcSCPHzYsq97agWpmUmXC00wJsW1coVaN+4M0UvbZ6v5W/B2VizZ6TDY44/flnXfAj4ljN7nXYvcPjhVZZcSgKb3WDc3B3kNSgva9u4uTvcftheoRMnu92OzZb7E5MjR44QFBTkkKBERMSxbHaDp77fzMq9J/DzsvLFPW2oF6G/2e6iUlDBev0Kul+Bxe2CNeeH2Pd4Czy1iKrDeQdA6xHm7VUfOaUJlSUXd7d2/6lcPU0Xc2qvugMVOnHq1q0bEyZMyL5vsVg4e/YsY8eOpVevXo6MTUREHMAwDF6cvZWft8bgZbXw6V2taFW9vKvDkou0jQ4lMsSX/GYvWYDIEF/aRoc6rlHDgAWjwZ4J9XpB3e6OO7bk1PYB8PCCQ3/C0b8cfvgudc3EKassuYi7KWhvuVN61R2o0InTf/7zH5YvX07Dhg1JTU1lyJAh1KhRg6NHj/LWW285I0YREbkKby3czXdrD+Nhgf+7owWdzr/JEvdh9bAwtk9DgFzJU9b9sX0aOrYwxPZZsH8FePpCDxWEcKrgSGh8q3nbCQviXlqWXMTdeFkLlnI4vFfdwQqdOEVFRbFp0yZGjRrFyJEjadGiBW+++SYbN26kUqVKzohRRESKaOLyfUxcvg+A1/s3oVcT969aVFb1aBzJJ0NbEhGS841DRIgvnwxt6diKU2lnYdEL5u2OT0L5Go47tuSt/fkiEdtnQeIRhx5aZcnFnc3fGsNzM7dcdh+n9Ko7QZEW7fDz82PEiBGMGDHC0fGIiIiDfLf2EG8u2AXAmJ71Gdy2mosjkivp0TiSbg0jWPV3HItXrqH7de1oX7uS40uQr3gHzhyDctXh2scde2zJW2QzqHEdHFhpzivr/ppDD59VlnypypKLm0hKzeDln7Yzc6NZPK5qqB+HT6VggRxFIpzWq+4EhU6c5syZk+d2i8WCr68vtWvXJjo6+qoDExGRopu/NYYXZm0F4MHOtXiwcy0XRyQFZfWw0C46lJM7DdpFhzr+jcSJvReKFPR8C7z8Lr+/OE6Hf5mJ019fQ+dnwcdxBVqyypJvOV+WPCxQhT7EdVbtO8moHzZzNCEFDws80rU2/7q+Dr/tOn7ROk6miBK0jlOhE6d+/fphsVgwjJyTD7O2WSwWOnbsyOzZsylfXpOPRUSK28q98Tw+bSN2Awa3rcqzPeq5OiRxF4YB80eDPQPqdDcXaJXiU7sbhNWBk3th41S45iGHHTqrLPnOmCRW7j1BvxaVHXZskYJKzbDx3uLdfP77fgwDqof58/7A5tkFiYqtV91JCj3HacmSJbRp04YlS5aQmJhIYmIiS5YsoW3btsybN48VK1Zw8uRJRo0a5Yx4RUTkMv46eJoHJv9Fhs2gd5NIXuvXBIulZPxDkmKwcy78sxSs3tDjTdC1Ubw8PC7MdXLCgrgqSy6utDMmiX4f/cFnK82kaXDbasx/7LpcVVyzetVbVXBSr7oTFbrH6fHHH+d///sfHTp0yN52ww034OvrywMPPMD27duZ
MGGC5j+JiBSz3bFnGPHVOlIybFxXpwLvD2pWov4hiZOlJ8Oi583b1z4OYRq+6RJN74BfX4WEQ7BrHjTs67BDd6lbkU+W7csuS+6h338pBja7wecr/+G9xXtIt9mpEOjNmwOacmPDcFeH5nCF7nHat28fwcHBubYHBwfzzz//AFCnTh1OnNDq1SIixeXQyWTumrSGxJQMWlQrx6d3tcLH0+rqsMSdrHwPEg9DSDXo+JSroym7vP2hzb3mbQcviKuy5FLcDp9KZvBnqxm/YBfpNjs3Nghn4ROdSmXSBEVInFq1asXo0aOJj4/P3hYfH88zzzxDmzZtANi7dy9VqlRxXJQiIpKvuKRUhk5aQ9yZNOqFB/HlPW3w9y5S0VQprU7ugz//a97u8Yb55l1cp8395nDJw2vg8DqHHdbL6sF1dVWWXJzPMAx+/OsIPf9vJWv3nyLA28rbtzbls2GtqFCKC5MUOnGaNGkS+/fvp0qVKtSuXZs6depQpUoVDhw4wOeffw7A2bNn+fe//33FY2VmZvLiiy8SHR2Nn58fNWvW5JVXXsFut1/2ccuXL6dVq1b4+vpSs2ZNJk6cWNjTEBEpFRKTM7hr0loOnUqmWqg/U+5tSzl/b1eHJe7EMGDBs2BLh1o3QP2bXR2RBIVDk4Hm7VUfOvTQXeqaa2ou3R1/hT3lqtltsH8lbP3R/O7gOWvu6tS5dB6auoFRP2zmbFomrauXZ8HjnRjYpmqpn1Nb6I8k69Wrx86dO1m0aBF79uzBMAzq169Pt27d8PAw87B+/foV6FhvvfUWEydO5Ouvv6ZRo0asX7+e4cOHExISwuOP572uxP79++nVqxf3338/U6dO5Y8//uDhhx+mYsWK3HrrrYU9HRGREis5PZPhX61l9/EzVAzyYeq97agU7N6rrosL7F4Afy8BDy/o+bYKQriL9g/Dpqmwcw6cPgjlqzvksCpLXkx2zIGFz0LSsQvbgqOgx1vQ8BbXxeVkS3fFMfrHLZw4m4anh4Unu9Xlwc61ysx82iKN5bBYLPTo0YMePa6ujOmqVavo27cvvXv3BqBGjRp89913rF+/Pt/HTJw4kWrVqjFhwgQAGjRowPr163n33XeVOIlImZGWaWPklL/YcCiBYF9PptzblmphGn4ll8hIMd/cAXR4FCrUdm08ckF4I6jZ1axyuOZTcwilIw6rsuTOt2MOTB9GzmVcgaQYc/vAyaUueUpOz+T1n3fyzZpDANSpFMh/BjWnceUQF0dWvIqUOJ07d47ly5dz6NAh0tPTc/zsscceK/BxOnbsyMSJE9mzZw9169Zl8+bN/P7779lJUV5WrVpF9+7dc2y76aabmDRpEhkZGXh5eeX4WVpaGmlpadn3k5KSAMjIyCAjI6PAsYp7yXrt9BpKcXC3681mN3hy+hZW7j2Bn5cHn9/Vklphfm4Tn1w9R11zHivew5pwCCMoisz2T4CuEbdiafsgnv8sxdjwNZnXPg2+uYtvFUWn2mHsjEnit53H6d24UoEe425/59yW3YbngmcBg9x9LIa5deEYMmt1B4/SUaBn0+EERs/YxoGTyQDc074aT3erg6+XtcjXiztdb4WJodCJ08aNG+nVqxfJycmcO3eO0NBQTpw4gb+/P5UqVSpU4vTss8+SmJhI/fr1sVqt2Gw2Xn/9dQYPHpzvY2JjYwkPz1mpIzw8nMzMTE6cOEFkZM5Vh8ePH8+4ceNyHWfx4sX4++vT2ZJuyZIlrg5ByhB3uN4MA6b/48GfcR5YLQb31M4gZtufxGxzdWTiDFdzzfmnxXH9zgkArA8bwLFfljsoKnEYw6Crb2WCU4+y+7sX2Bfe0yGH9U4C8OTXHceY9/NhCjOKyh3+zrmzsDM76XjmWL4/t2BA0lHW/DCBk0ENijEyx7PZYdFRD5YcsWDHQjlvgyG17dTjH35b8o9D2nCH6y05ObnA+xY6cXryySfp06cPn3zyCeXKlWP16tV4eXkxdOjQfOcl5ef7779n6tSpfPvttzRq1IhNmzbxxBNPEBUVxd13353v4y6deGYYRp7bAZ577jmeeupC2dWkpCSqVq1K9+7d8yyrLiVDRkYGS5YsoVu3brl6GUUczZ2ut3cX7+XPuP1YLPCfgc3o2TjCpfGIczjimrNOvxMPIwN7jU40HzKW5prb5JYslU/Dz0/Q6OwK6t39H/C4+oqYGTY7X/69jLNpmVRtdi3Nqlx5OJU7/Z1zZ5btKfD3lfe7pnENjEa9nB+Qk/wTf47RM7ay5ag5UqtP0wjG3tyAED/HXBvudL1ljUYriEL/dm7atIlPP/0Uq9WK1WolLS2NmjVr8vbbb3P33XczYMCAAh9r9OjRjBkzhjvuuAOAJk2acPDgQcaPH59v4hQREUFsbGyObXFxcXh6ehIWFpZrfx8fH3x8ck+M9PLycvkLJVdPr6MUJ1dfb58u38enK/cD8Eb/JtzSoqrLYpHiUeRrbs8i2LsIPDzx6P0uHt6qtOi2mg+Gpa9hSTqK19750Pjq52t7eUGnuhWYvzWWlX+fonV0hUI8Vv9XLyukYHPGPEMqmy9ECWMYBlNXH+T1+TtJzbAT7OvJa/2bcEuzKKe05w7XW2HaL3Q5ci8vr+yenfDwcA4dMieJhYSEZN8uqOTk5OxKfFmsVutly5G3b98+V7fe4sWLad26tcufeBERZ5m29hDjF+wC4Nke9RnctpqLIxK3lZFqlh8HuOYhqFjPtfHI5Xn5Qtv7zdt/fmiOx3WArLLky/aoLLlDJR6+8j5BkVC9g/NjcbDjSanc/eU6/v3TdlIz7HSsXYFFT3ZyWtJUEhU6cWrRokV21buuXbvy0ksv8c033/DEE0/QpEmTQh2rT58+vP766/z8888cOHCAWbNm8f7779O/f//sfZ577jmGDRuWff/BBx/k4MGDPPXUU+zcuZMvvviCSZMmMWrUqMKeiohIiTB/awzPz9oKwMjONXmoSy0XRyRu7c8P4PR+881b52ddHY0UROt7weoDxzaYi+I6wKVlycUB1n0Osx+6aEM+w1+9AyGj4PNm3MH8rTHcNGEFK/bE4+Ppwct9GjJ5RFsiQ/xcHZpbKXTi9MYbb2QXYHj11VcJCwvjoYceIi4ujv/973+FOtYHH3zAbbfdxsMPP0yDBg0YNWoUI0eO5NVXX83eJyYmJkdPVnR0NPPnz2fZsmU0b96cV199lf/+978qRS4ipdLKvfE8Pm0jdgPuaFOVMT3quzokcWcJh2Dle+bt7q+BT5Br45GCCawIzcxpC/z5gUMOmVWW3DBg5d4TDjlmmfbH/8HPT5u32z4At0+G4JwFyQioBF7+cHIvfHsHpLt/8pSUmsFT32/i4W82kJCcQePKwcz7V0fuuTYajzKyNlNhFGqOk2EYVKxYkUaNGgFQsWJF5s+fX+TGg4KCmDBhwmXLj3/11Ve5tnXu3JkNGzYUuV0RkZJgw6HTjJzyFxk2g15NIni9f5NSvyq7XKWFz0FmClTv6JC5MlKMrnkYNnwNu36GU/9AaM2rPmSXehXZGZPEst1xWs+pqAwDlr4OK94
x73d8Cm54yVxIusHNcPBPOHscAsPN4Xkxm+DrvnDwd5g2BAZPM4djuqFV+04y6ofNHE1IwcMCD3epzWM31MHbs9D9KmVGoZ4ZwzCoU6cOR44ccVY8IiIC7I49w/Av15GcbuO6OhX4z6DmZWZldimiv3+BXfPAYoVe75hv7KTkqFQfancDDFg90SGH7FLXHK63Yu8J7HbHzJ0qUwzD/DAiK2m64SW4ceyF3y0PK0RfB01uM797WKFyKxg6A7wCzMWNpw+DzPT823CB1Awbr/+8gyGfr+ZoQgrVQv354cH2jLqpnpKmKyjUs+Ph4UGdOnU4efKks+IRESnzDp1M5q5Ja0hMyaBFtXJMHNoKH8/SsZCiOElmGsx/xrzd7kEIb+jaeKRo2j9ift84FVJOX/XhWlYvT5CvJ6fOpbPlaOJVH69Msdtgzr9gzSfm/Z7vwHVPF+yx1drBndPB08+sbvnjcLC5fqFXgJ0xSfT76A8+W7kf4/wQ8PmPX0er6qGuDq1EKHRa+fbbbzN69Gi2bdNqiyIijhaXlMrQSWuIO5NGvfAgvrynDQE+V7+ui5Ryqz6CU/vMORZdVBCixKrZBcIbQ8Y5+Ovrqz6cl9WD6+qYpciX7oq76uOVGbYMmHEfbJwCFg/o+zG0e6Bwx6jREQZ/axb92DUPZj5gJmMuYrMbfLp8H30//INdsWcIC/Dms2GtefPWpgTqf0yBFTpxGjp0KGvXrqVZs2b4+fkRGhqa40tERIomMTmDYV+s5dCpZKqG+jH53raU89f6O3IFiUcuDCXq/ir4XnmxU3FTFsuFXqc1nzpkiJfKkhdSRip8PxS2zwQPL7jtS2hxZ9GOVet6GDTFPM72mfDTI3CZJXec5fCpZAZ/tprxC3aRbrNzY4NwFj3ZiW4Nw4s9lpKu0Cnm5Qo5iIhI0SSnZzL8q7Xsij1DxSAfpt7bjvBg95xQLG5m0Qtm6eNq7aHpIFdHI1er8a3wy8tw5hjsmA1NB17V4S4tSx4W6HP1MZZWaWdh2mDYvwI8fWHgFKjb/eqOWfcmuP1LmH43bP4OrF5w8/+Bh/PnEhmGwYwNR3l5znbOpmXi721lbJ+GDGxdVYWGiqjQidPdd9/tjDhERMqs9Ew7D07dwIZDCQT7ejLl3rZUDwtwdVhSEuxbar65tnioIERp4eljLoj722uw6kNocvtVva5ZZcl3xiSxcu8JVdfLT8pp+OZ2OLLOXIdp8DSz4IMjNOgDt35mDv/bMNkcvufk39dT59J5YdZWFmyLBaBV9fK8P7CZ/rdcpSKlu/v27ePFF19k8ODBxMWZY2YXLlzI9u3bHRqciEhpZ7MbPDl9Eyv2xOPnZeXL4W2oHxHs6rCkJMhMhwXnC0K0uR8iCrcIvbixViPMwgIxm+HgH1d9uC7ne52W7dY8pzydjYev+5hJk285GPaT45KmLI1vNedKYYF1n8HiF82qfU6wdHccN01YwYJtsXh6WBh9Uz2mj2yvpMkBCp04LV++nCZNmrBmzRpmzpzJ2bNnAdiyZQtjx451eIAiIqWVYRi8OHsbP2+JwctqYeJdrVTZSApuzSdwYg/4V4Cuz7s6GnGkgDBoPti8veqjqz7cxWXJbSpLnlPiUfiqF8RuhYCKcM/PUKW1c9pqPhj6TDBvr/rQ7FV0oOT0TF6cvZXhX64j/kwatSsFMvuRa3mka20tZ+EghU6cxowZw2uvvcaSJUvw9r4wablr166sWrXKocGJiJRm7yzazXdrD2GxwH8GNafz+Tc3IleUdAyWv23e7jYO/Mq5NBxxgmseNr/vXgAn/r6qQ+UoS34k4epjKy1O7Ycve5gfQARXhuELIaKxc9tsdY9Z2hxg5bsXfo+v0sZDp+n939+ZuvoQAMOvrcG8f3WkcWUVi3GkQidOW7dupX///rm2V6xYUes7iYgU0KfL9/Hxsn0AvN6vCTc3jXJxRFKiLP43pJ+FKm2g2RBXRyPOUKEO1O2JuSDux1d1qIvLki/brep6AMTvhi97QsIhKB8NwxdAhdrF03a7B6D7+d6mpa/D7xOKfKgMm53/LNnDbRNXsf/EOSKCfZl6bzvG9mmEr5fW/3O0QidO5cqVIyYmJtf2jRs3UrmyJhyKiFzJ9+sOMX7BLgCe7VGfIe2quTgiKVH2r4RtPwIW6PVusVTnEhfJKk2+6VtIPnVVh1JZ8ovEbDaTpjMxULEBjFgI5asXbwwd/gXX/9u8/ctYWD2x0IfYF3+W2z75k//7dS82u8EtzaJY9EQnOp5PksXxCv3XdsiQITz77LPExsZisViw2+388ccfjBo1imHDhjkjRhGRUmPB1hiem7kVgJGdavJQl1oujkhKFFsGzB9t3m49AqKauzQccbIaHSGiKWSmwPovrupQl5YlL7MOrYGv+kDySYhsbs5pCopwTSydRkGn8wVeFj5b4NfYMAymrDpA7/+uZPORRIJ9Pfm/O5rz38EtCPH3cmLAUujE6fXXX6datWpUrlyZs2fP0rBhQzp16kSHDh148cUXnRGjiEip8PveEzw+bRN2Awa1rsqYnvVdHZKUNGv/B/E7wS8Urtf/3FLPYoH2j5q31/4PMoue8GSVJTcMWLn3hIMCLGH2LYUp/SAt0Vz37O45ZiEOV+r6PHR4zLw970nYOPWyu8clpXLPl+v490/bSc2wc23tMBY92Ym+zTXqqzgUOnHy8vLim2++Yc+ePUyfPp2pU6eya9cupkyZgtWqsZQiInnZeOg0D0xZT7rNTq8mEbwxoIkWIJTCORMLS8ebt298GfxVgbFMaNQfgiLh7HHYNvOqDlWmy5Lvmg/fDjQXi651PQydAb5uUDjBYoFur0C7B837Pz0KW37Ic9cFW2PoPmEFy/fE4+Ppwdg+DZkyoh2RIX7FGHDZVugFcJcvX07nzp2pVasWtWppiImIyJXsjj3DPV+uIzndxnV1KvCfQc1VGlYKb8lYSD8DUS2hxV2ujkaKi6c3tBsJv7xslrBudkeRF07tWq8Snyzbl12WvMz8Hdr6I8x8AAwb1L8ZbvvCXGjYXVgs0ONNs0fxry9h1kjzdW/YF4Ck1AxenrOdmRuOAtAoKpgJg5pTJzzIlVGXSYXucerWrRvVqlVjzJgxbNu2zRkxiYiUGodPJXPXpDUkpmTQvGo5Jg5thY+neuelkA6ugi3TAAv0VkGIMqfVPeDlD8e3wf7lRT5My2rlyl5Z8r++hhn3mUlT00Fw+9fulTRlsVig9/vQ/E4z1h9HwO4FrP7nJD0nrGTmhqN4WOCRrrWY9fC1SppcpNB/eY8dO8YzzzzDypUradq0KU2bNuXtt9/myJEjzohPRKTEijuTytBJa4g7k0a98CC+Gt6GAJ9Cd/RLWWfLhPmjzNsth0HlVq6NR4qfX3loMdS8fRUL4nqWtbLkqz6CuY8BhllMpd9EsLrx32APD7jlA2h8G9gzyZx2F598/ilHE1KoFurP9JHtGX1Tfbw99cGJqxT6ma9QoQKPPvoof/zxB/v27WPQoEFMnjyZGjVqcP311zsjRhGREicxOYNhk9Zy8GQyVUP9mH
xvW8r5e1/5gSKXWj/J7GnwKw83jHV1NOIq7R4ELLB3sbkGURGVibLkhgHL3oJFz5v3O/zL7M0pCT21HlZ2tn+HlV4d8DQy+NTrfV5oEMf8x6+jdQ3Na3S1q7qCoqOjGTNmDG+++SZNmjRh+fKidx+LiJQWyemZjPh6Hbtiz1AxyIep97YjPNjX1WFJSXQ2Dn573bx9/b9dXwFMXCesFtTvbd6+igVxS31ZcsOAJf+GZW+Y97u+AN1eLfK8sOJksxt8unwffT9ew4gzD7Kc1vhaMrj/yPMExq5zdXjCVSROf/zxBw8//DCRkZEMGTKERo0aMW/ePEfGJiJS4qRn2nlo6gb+OniaYF9PJo9oS/WwAFeHJSWJ3Ybl4O9UPrUK69xHzdLJkc3MeS5StmWVJt88Dc4VraR4qS5LbrebJb3//MC8f9N46PxMiUiaDp9KZvBnqxm/YBfpNjudG0TR8PFZUOsGsxLgN7fDkfWuDrPMK3Ti9PzzzxMdHc3111/PwYMHmTBhArGxsUydOpWePXs6I0YRkRLBZjd4avomlu+Jx9fLgy+Ht6FBZLCrw5KSZMccmNAYz6n9aH3wEzz++c3c3rAveKioSJlX7RqzqmJmKqybVOTDlMqy5LZMsxrdX18CFujzX2j/sKujuiLDMJjx1xF6/t9K1u4/hb+3lTcHNOGzYa2pWD4Y7vgGalxnVtScMgCObXJ1yGVaoROnZcuWMWrUKI4ePcrPP//MkCFD8Pf3B2DTpk2Ojk9EpEQwDIN//7SNeVti8LJa+PSu1rSqrvHoUgg75sD0YZB0LPfPfn3V/LmUbRYLtH/EvL3uM8hILdJhutYz5zkt3xOPzW44KjrXyUyDH+6GrdPBwxNu/Rxa3e3qqK7o1Ll0Hv5mA0//sJmzaZm0rFaOBY9fxx1tq11Y58/LD4Z8by7Ym5ZoLuB7fLtL4y7LCp04/fnnnzzyyCNUqGBWZUlMTOTjjz+mZcuWtGqlSj8iUja9u3g33645hMUC/xnUnM51K7o6JClJ7DZY+CxwmTexC8eY+0nZ1rAvBFeBc/GwNe+FUq8kqyz56eSMkl+WPP0cfDsIds0Dqw8MmgpNbnN1VFe0dHccN01YwYJtsXh6WBh9Uz2mj2yf99Bu7wAYMt2sqJlyGr6+5aoKhEjRFXmO02+//cbQoUOJjIzkgw8+oFevXqxfr7GXIlL2/G/FPj5aug+A1/s14eamUS6OSEqcg3/m3dOUzYCko+Z+UrZZvcwFccEst20Uvseo1JQlT02EqbfCP0vNda7unA713HvaSHJ6Ji/O3srwL9cRfyaN2pUCmf3ItTzStTae1su8LfcNhqEzIKIpJJ8wk6eT+4ovcAEKmTgdOXKE1157jZo1azJ48GDKly9PRkYGM2bM4LXXXqNFixbOilNExC1NX3eYN+bvAuCZHvUY0q6aiyOSEunsccfuJ6Vbq7vBOxDid8K+X4t0iBJflvzcSTN5OLQKfELgrtlQs4uro7qsjYdO0/u/vzN19SEAhl9bg3n/6kjjyiEFO4BfeRj2E1RqBGdj4es+cPqA8wKWXAqcOPXq1YuGDRuyY8cOPvjgA44dO8YHH3zgzNhE8mSzG6zZf4q/TlhYs/9U6RifLSXSwm0xjJm5BYCRnWryUOdaLo5ISqzEAi4iHxju3DikZPANMRdDhiIviFuiy5KfiYWvekHMJvAPg3vmQrV2ro4qXxk2O/9ZsofbJq5i/4lzRAT7MvXedozt0whfr0IWffEPhWGzoUJdsxf66z4F//shV63AyycvXryYxx57jIceeog6deo4MyaRfC3cFsO4uTuISUwFrEzeu57IEF/G9mlIj8aRrg5PypDf957gse82YTdgUOuqjOlZ/8JkXpGCykiFX16GNZ9cYUcLBEdB9Q7FEZWUBO1GwpqJsO83OL4DwhsW6uFZZcl3xiSxcu8Jejeu5KRAHez0QZjcF07vh6BIswemYj1XR5Wvf+LP8uT3m9h8JBGAPs2ieK1vY0L8vYp+0MBKMGyOmTye+sdMnoYvgKAIB0Ut+Slwj9PKlSs5c+YMrVu3pl27dnz44YfEx5fQ7l0pkRZui+GhqRvOJ00XxCam8tDUDSzcFuOiyKSs2XjoNA9MWU+6zU7PxhG8MaCJkiYpvOPb4bOuF5Km2t0Ay/mvi52/3+NNlSSXC8rXgAZ9zNuri9br1LWklSU/sRe+7GkmTeWqm8mCmyZNhmEwZfVBev13JZuPJBLs68n/3dGcDwa3uLqkKUtwJNw9F8pVO5883QJn9b7c2QqcOLVv357PPvuMmJgYRo4cybRp06hcuTJ2u50lS5Zw5swZZ8YpZZzNbjBu7o48601lbRs3d4eG7YnDXTo0dGdMEsO/Wkdyuo2OtSsw4Y7mWD2UNEkhGAasngj/6wpxOyCgIgz5AYb+CAMnm2+ILhYcZW5veItr4hX3lbUg7pbpcLbwyU+XklSWPHarmTQlHTWHqY1YCKHRLg3JZjdYte8kP206yqp9J7Ofw7ikVIZ/tY5/z95GaoadDrXCWPhEJ/o2r+zYAEKqmMlTcGU4sdvsiUs+5dg2JIcCD9XL4u/vz4gRIxgxYgS7d+9m0qRJvPnmm4wZM4Zu3boxZ47WmRDHW7v/VK6eposZQExiKj+sP0yPxhGE+HmpB0CuWl5DQz0sYDegedVyfHpXK3w81QMghXDmOPz0MPz9i3m/Tnfo+5E59AbM5Kh+bzL/WcGmlYtoft1NeNbspJ4myVvVtlClLRxZC+s+h67PF+rhF5cl33o00UlBOsDhdfDNrWYVvYgmZiGIgAouDSnn/wdTZIgvfZpG8sNfRzidnIG3pwfP9qjP8A418HDWB2zla5jJ05c9IW67uc7TsDngV8457ZVxhU6cLlavXj3efvttxo8fz9y5c/niiy8cFZdIDnFnCrbI35iZWxkzcyt+XlYiy/kSFeJHZIgvkeX8iLrke6DPVV3+UsplDQ299DPYrA9lh7arRoCuISmM3Qvhp0fMUsKevtD9NWhzn7mo6cU8rBjVO3J0exLNqndU0iSX1/4R+OF84tTxSXPB1ALKKks+f2ss3649TNA5C2H7T9G+diX36UnfvwK+vQMyzplJ4p0/uDwpyO//Q0xiKv9buR+AhpHBTLijOXXDg5wfUFit83OeekPMZrNE+7DZ4FMMbZcxDvmvb7Va6devH/369XPE4URysNsNth8r2Cdhwb6eJKVmkpJh45/4c/wTfy7ffYN8Pc3EqpwvkSG5E6vIEN/CV7uRUiHTZmfsnO2XW4qU95bsoX/LKu7z5kLcV3oyLPm3+cYWILwx3Po5VGrg2rikdKh/sznPJeEQbJ4GrYcX6uFhAT4AzNoUg9sVXdqzCKYPg8xUiO4Ed3wHPoEuDelyUweyBPpYmfFQB/y8i/E9RKX6ZqGMr2+Go+vhm9vNdZ+881hQV4pMH5eKW9t46DQvz9meXY0mPxYgIsSX35+9ngybndjEVI4lphCTkEpMYgrHElOJSUghJjGVYwkpJKVmciY1k92pZ9h9PP/5eaEB3maPVYgfUeVyfo8M8
OlJQUHn744WJtYmJiqFatGklJSdSvX5/Q0FAiIiLK9fz5+fmMGDGCPn36cPvtt9O/f3/atWvHyy+/zIMPPljqfe6++27i4+O56qqrqF+/PqmpqTz77LPUqVOHLl26lGi/YcMGRo4cyYQJE2jTpg379u0DTMJXu3ZtJkyYQOfOnRkzZgx/+tOfqF69urugxeuvv16u1yEiIp6la5xERMTjwsPDCQ8PL/Wc0+lk+vTprF69mrZt23L//ffz0ksvFWsTGBjIa6+9xttvv018fDxDhw4t93M/99xz7Nq1i3feeQcwFfPeffddHnvssTMuqHvNNdewYsUKhg0bRosWLbjhhhsIDQ1l4cKFREVFlWj/008/kZmZybPPPktcXJx7u/766wG46KKLWLx4Mdu3b6d79+506NCBxx9//IxV/URExH6qqiciIiIiIlIGjTiJiIiIiIiUQYmTiIiIiIhIGZQ4iYiIiIiIlEGJk4iIiIiISBmUOImIiIiIiJRBiZOIiIiIiEgZlDiJiIiIiIiUQYmTiIiIiIhIGZQ4iYiIiIiIlEGJk4iIiIiISBmUOImIiIiIiJTh/wEdIQTjLDE0GAAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import tensorflow as tf\n", + "import time\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Configuration\n", + "matrix_sizes = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]\n", + "repeats = 10\n", + "\n", + "cpu_times = []\n", + "gpu_times = []\n", + "\n", + "for matrix_size in matrix_sizes:\n", + " cpu_avg_time = 0\n", + " gpu_avg_time = 0\n", + "\n", + " for _ in range(repeats):\n", + " # Create random matrices\n", + " matrix_a = tf.random.normal(shape=(matrix_size, matrix_size))\n", + " matrix_b = tf.random.normal(shape=(matrix_size, matrix_size))\n", + "\n", + " # CPU matrix multiplication\n", + " start_time = time.time()\n", + " result_cpu = tf.matmul(matrix_a, matrix_b)\n", + " end_time = time.time()\n", + " cpu_avg_time += (end_time - start_time) / repeats\n", + "\n", + " # GPU matrix multiplication\n", + " with tf.device('/GPU:0'):\n", + " start_time = time.time()\n", + " result_gpu = tf.matmul(matrix_a, matrix_b)\n", + " end_time = time.time()\n", + " gpu_avg_time += (end_time - start_time) / repeats\n", + "\n", + " cpu_times.append(cpu_avg_time)\n", + " gpu_times.append(gpu_avg_time)\n", + "\n", + "# Plot results\n", + "plt.figure(figsize=(10, 6))\n", + "plt.plot(matrix_sizes, cpu_times, marker='o', label='CPU')\n", + "plt.plot(matrix_sizes, gpu_times, marker='o', label='GPU')\n", + "plt.xlabel('Matrix Size')\n", + "plt.ylabel('Average Time (s)')\n", + "plt.title('Matrix Multiplication Performance: CPU vs. GPU')\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d3c81d9b-2fb4-4bc0-8648-149713f86451", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /tmp/ipykernel_304/2684439677.py:8: is_gpu_available (from tensorflow.python.framework.test_util) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use `tf.config.list_physical_devices('GPU')` instead.\n", + "TensorFlow is using GPU: True\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-10-01 19:19:51.595972: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2023-10-01 19:19:51.596220: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2023-10-01 19:19:51.596371: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2023-10-01 19:19:51.596572: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2023-10-01 19:19:51.596727: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", + "2023-10-01 19:19:51.596848: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /device:GPU:0 with 20332 MB memory: -> device: 0, name: NVIDIA A10G, pci 
bus id: 0000:00:1e.0, compute capability: 8.6\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/50\n", + "8/8 [==============================] - 1s 2ms/step - loss: 1.0704 - accuracy: 0.5333\n", + "Epoch 2/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 1.0321 - accuracy: 0.5667\n", + "Epoch 3/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.9970 - accuracy: 0.6000\n", + "Epoch 4/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.9635 - accuracy: 0.6167\n", + "Epoch 5/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.9300 - accuracy: 0.6333\n", + "Epoch 6/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.9011 - accuracy: 0.6583\n", + "Epoch 7/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.8727 - accuracy: 0.6667\n", + "Epoch 8/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.8462 - accuracy: 0.6667\n", + "Epoch 9/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.8207 - accuracy: 0.6750\n", + "Epoch 10/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.7969 - accuracy: 0.7000\n", + "Epoch 11/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.7724 - accuracy: 0.7000\n", + "Epoch 12/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.7503 - accuracy: 0.7000\n", + "Epoch 13/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.7289 - accuracy: 0.7000\n", + "Epoch 14/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.7086 - accuracy: 0.6917\n", + "Epoch 15/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.6889 - accuracy: 0.7000\n", + "Epoch 16/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.6708 - accuracy: 0.7083\n", + "Epoch 17/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.6533 - accuracy: 0.7167\n", + "Epoch 18/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.6362 - accuracy: 0.7083\n", + "Epoch 19/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.6203 - accuracy: 0.7167\n", + "Epoch 20/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.6051 - accuracy: 0.7083\n", + "Epoch 21/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.5899 - accuracy: 0.7083\n", + "Epoch 22/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.5758 - accuracy: 0.7250\n", + "Epoch 23/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.5616 - accuracy: 0.7583\n", + "Epoch 24/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.5487 - accuracy: 0.7583\n", + "Epoch 25/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.5350 - accuracy: 0.7917\n", + "Epoch 26/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.5227 - accuracy: 0.8000\n", + "Epoch 27/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.5104 - accuracy: 0.8000\n", + "Epoch 28/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.4991 - accuracy: 0.8167\n", + "Epoch 29/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.4883 - accuracy: 0.8333\n", + "Epoch 30/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.4782 - accuracy: 0.8500\n", + "Epoch 31/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.4684 - accuracy: 0.8750\n", + 
"Epoch 32/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.4586 - accuracy: 0.8750\n", + "Epoch 33/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.4491 - accuracy: 0.8750\n", + "Epoch 34/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.4402 - accuracy: 0.8750\n", + "Epoch 35/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.4317 - accuracy: 0.8667\n", + "Epoch 36/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.4229 - accuracy: 0.8833\n", + "Epoch 37/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.4150 - accuracy: 0.8750\n", + "Epoch 38/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.4070 - accuracy: 0.8750\n", + "Epoch 39/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.3998 - accuracy: 0.8917\n", + "Epoch 40/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.3923 - accuracy: 0.9083\n", + "Epoch 41/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.3854 - accuracy: 0.9000\n", + "Epoch 42/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.3786 - accuracy: 0.9000\n", + "Epoch 43/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.3723 - accuracy: 0.9000\n", + "Epoch 44/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.3665 - accuracy: 0.8917\n", + "Epoch 45/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.3600 - accuracy: 0.8917\n", + "Epoch 46/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.3541 - accuracy: 0.8917\n", + "Epoch 47/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.3485 - accuracy: 0.8917\n", + "Epoch 48/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.3430 - accuracy: 0.8917\n", + "Epoch 49/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.3379 - accuracy: 0.8917\n", + "Epoch 50/50\n", + "8/8 [==============================] - 0s 2ms/step - loss: 0.3326 - accuracy: 0.8917\n", + "1/1 [==============================] - 0s 107ms/step - loss: 0.2827 - accuracy: 0.9333\n", + "Test Accuracy: 0.9333333373069763\n", + "1/1 [==============================] - 0s 56ms/step\n", + "Test Accuracy (calculated): 0.9333333333333333\n" + ] + } + ], + "source": [ + "import tensorflow as tf\n", + "from sklearn.datasets import load_iris\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "# Check if TensorFlow is using the GPU\n", + "print(\"TensorFlow is using GPU:\", tf.test.is_gpu_available())\n", + "\n", + "# Load Iris dataset\n", + "iris = load_iris()\n", + "X, y = iris.data, iris.target\n", + "\n", + "# Split the data into training and testing sets\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "# Scale the data to have zero mean and unit variance\n", + "scaler = StandardScaler()\n", + "X_train = scaler.fit_transform(X_train)\n", + "X_test = scaler.transform(X_test)\n", + "\n", + "# Define the neural network model\n", + "model = tf.keras.Sequential([\n", + " tf.keras.layers.Dense(10, activation='relu', input_shape=(4,)),\n", + " tf.keras.layers.Dense(3, activation='softmax')\n", + "])\n", + "\n", + "# Compile the model\n", + "model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n", 
+ "\n", + "# Train the model\n", + "epochs = 50\n", + "batch_size = 16\n", + "model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)\n", + "\n", + "# Evaluate the model on the test set\n", + "loss, accuracy = model.evaluate(X_test, y_test)\n", + "print(\"Test Accuracy:\", accuracy)\n", + "\n", + "# Make predictions on the test set\n", + "y_pred_probabilities = model.predict(X_test)\n", + "y_pred = tf.argmax(y_pred_probabilities, axis=1)\n", + "\n", + "# Calculate and print the accuracy on the test set\n", + "test_accuracy = accuracy_score(y_test, y_pred)\n", + "print(\"Test Accuracy (calculated):\", test_accuracy)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a36ff103-e072-4494-b51f-b42540f0c585", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ai-ml/jupyterhub/examples/notebook-examples/microbenchmark-pytorch-neurox.ipynb b/ai-ml/jupyterhub/examples/notebook-examples/microbenchmark-pytorch-neurox.ipynb new file mode 100644 index 000000000..3dda2ea05 --- /dev/null +++ b/ai-ml/jupyterhub/examples/notebook-examples/microbenchmark-pytorch-neurox.ipynb @@ -0,0 +1,700 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Microbenchmarking Neuron Devices (Trn1/Inf2)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "This guide reviews the best practices for benchmarking performance of Neuron devices. It shows how to separate compilation and execution time, how to isolate the device time from the end-to-end execution time, how to warm-up the device, and covers few pitfalls one should be aware of. This guide provides an example code, in PyTorch, that can be used as a template for measuring performance.\n", + "\n", + "This Jupyter notebook should be run on a Trn1/Inf2 instance (trn1.2xlarge/inf2.xlarge or larger)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Verify that this Jupyter notebook is running the Python kernel environment that was set up according to the [PyTorch Installation Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/torch-neuronx.html#setup-torch-neuronx). You can select the kernel from the 'Kernel -> Change Kernel' option on the top of this Jupyter notebook page." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example\n", + "\n", + "As a motivating example, assume we would like to measure the max throughput of the device when executing matrix multiplication:\n", + "\n", + "`nn.Linear(in_features=n, out_features=n, bias=is_add_bias)`\n", + "\n", + "Note that nn.Linear can add bias; we will touch on that part later.\n", + "\n", + "First we will parametrize the microbenchmark run as follows (those parameters can be modified as needed):\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Matrix multiplication of size [BATCH_SIZE, MATRIX_DIM, MATRIX_DIM]x[BATCH_SIZE, MATRIX_DIM, MATRIX_DIM]\n", + "BATCH_SIZE = 1\n", + "MATRIX_DIM = 1024\n", + "# How many times matrix multiplication is ran in a single loop (recommend using a large number to amortize runtime and framework overheads)\n", + "LOOP_COUNT = 1000\n", + "# Number of timed iterations (recommend using a large number to filter noise)\n", + "N_TIMED_ITERATIONS = 1000\n", + "# Add bias after matrix multiplication (recommended for numerical stability)\n", + "ADD_BIAS = True\n", + "# Additional flags to pass to the compiler\n", + "NEURON_CC_FLAGS = \"\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We recommend adding bias for numerical stability (avoiding NaNs in computation). Numerical issues are reported back to the user, which can slow down total runtime. For best performance use large matrix sizes (for high utilization), and large loop/iteration counts (to minimize overheads)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initial Version\n", + "\n", + "Let’s write a simple Module that will exercise the Linear layer in a loop (see below). We want to repeat the computation to amortize overheads." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "\n", + "@torch.no_grad()\n", + "class Matmult(nn.Module):\n", + "\n", + " def __init__(self, n, is_add_bias, loop_count):\n", + " super().__init__()\n", + " self.loop_count = loop_count\n", + " self.matmult = nn.Linear(in_features=n, out_features=n, bias=is_add_bias)\n", + "\n", + " def forward(self, x):\n", + " out = self.matmult(x)\n", + " for i in range(1, self.loop_count):\n", + " out = self.matmult(out)\n", + " return out.mean()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that we feed the result of the previous matmult to the current one. This is done to make sure we use the result from each matrix multiplication. If, for example, we would have tried to simply repeat the same computation inside the loop, the compiler would have optimized all but the last iteration out:\n", + "\n", + "```\n", + " def forward(self, x):\n", + " input = x\n", + " for i in range(0, self.loop_count):\n", + " out = self.matmult(input) \n", + "```" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Counting time\n", + "\n", + "Make sure to use a sufficiently-granular counter. We recommend using time.perf_counter, which uses the clock with the highest available resolution. The Neuron microbenchmark samples, contains a simple utility that is adequate for perf timing. 
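As used in the cells below, ubench_utils.Timer is a context manager whose value can be called after the `with` block to read back the elapsed time in seconds.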
Using the timer class, we can decorate the code to measure runtime of each section." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import ubench_utils" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using PyTorch-Neuron trace\n", + "There are two methods to instantiate execution on neuron devices: (1) using [Neuron XLA device API](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/programming-guide/training/pytorch-neuron-programming-guide.html), and (2) using [PyTorch-Neuron trace API](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuron/api-compilation-python-api.html). For benchmarking, we prefer using the PyTorch-Neuron trace, because it introduces minimal runtime and application overheads (see explanation of the [Lazy mode](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/programming-guide/training/pytorch-neuron-programming-guide.html#understand-the-lazy-mode-in-pytorch-neuron) operation of Neuron XLA)." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import torch_neuronx\n", + "\n", + "# Create the model\n", + "model = Matmult(MATRIX_DIM, ADD_BIAS, LOOP_COUNT)\n", + "# Create sample input\n", + "matrix_cpu = torch.randn([BATCH_SIZE, MATRIX_DIM, MATRIX_DIM], dtype=torch.float32)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "PyTorch-Neuron trace also makes it easy to separate compilation:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-09-26T18:06:56Z Compilation is optimized for best performance and compilation time. 
For faster compilation time please use -O1\n", + "2023-09-26T18:06:58Z Running DoNothing\n", + "2023-09-26T18:06:58Z DoNothing finished after 0.000 seconds\n", + "2023-09-26T18:06:58Z Running CanonicalizeIR\n", + "2023-09-26T18:06:58Z CanonicalizeIR finished after 0.057 seconds\n", + "2023-09-26T18:06:58Z Running LegalizeCCOpLayout\n", + "2023-09-26T18:06:58Z LegalizeCCOpLayout finished after 0.071 seconds\n", + "2023-09-26T18:06:58Z Running ExpandBatchNorm\n", + "2023-09-26T18:06:58Z ExpandBatchNorm finished after 0.073 seconds\n", + "2023-09-26T18:06:58Z Running ResolveComplicatePredicates\n", + "2023-09-26T18:06:58Z ResolveComplicatePredicates finished after 0.061 seconds\n", + "2023-09-26T18:06:58Z Running AffinePredicateResolution\n", + "2023-09-26T18:06:58Z AffinePredicateResolution finished after 0.068 seconds\n", + "2023-09-26T18:06:58Z Running EliminateDivs\n", + "2023-09-26T18:06:58Z EliminateDivs finished after 0.060 seconds\n", + "2023-09-26T18:06:58Z Running PerfectLoopNest\n", + "2023-09-26T18:06:58Z PerfectLoopNest finished after 0.064 seconds\n", + "2023-09-26T18:06:58Z Running Simplifier\n", + "2023-09-26T18:06:59Z Simplifier finished after 0.304 seconds\n", + "2023-09-26T18:06:59Z Running GenericAccessSimplifier\n", + "2023-09-26T18:06:59Z GenericAccessSimplifier finished after 0.068 seconds\n", + "2023-09-26T18:06:59Z Running TCTransform\n", + "2023-09-26T18:06:59Z TCTransform finished after 0.179 seconds\n", + "2023-09-26T18:06:59Z Running CommuteConcat\n", + "2023-09-26T18:06:59Z CommuteConcat finished after 0.062 seconds\n", + "2023-09-26T18:06:59Z Running TensorOpFusion\n", + "2023-09-26T18:06:59Z TensorOpFusion finished after 0.071 seconds\n", + "2023-09-26T18:06:59Z Running TensorOpTransform\n", + "2023-09-26T18:07:01Z TensorOpTransform finished after 2.276 seconds\n", + "2023-09-26T18:07:01Z Running LowerTensorOp\n", + "2023-09-26T18:07:02Z LowerTensorOp finished after 0.897 seconds\n", + "2023-09-26T18:07:02Z Running CanonicalizeIR\n", + "2023-09-26T18:07:02Z CanonicalizeIR finished after 0.107 seconds\n", + "2023-09-26T18:07:02Z Running MemcpyElimination\n", + "2023-09-26T18:07:12Z MemcpyElimination finished after 9.112 seconds\n", + "2023-09-26T18:07:12Z Running LoopFusion\n", + "2023-09-26T18:07:14Z LoopFusion finished after 2.397 seconds\n", + "2023-09-26T18:07:14Z Running Simplifier\n", + "2023-09-26T18:07:15Z Simplifier finished after 0.543 seconds\n", + "2023-09-26T18:07:15Z Running Delinearization\n", + "2023-09-26T18:07:15Z Delinearization finished after 0.056 seconds\n", + "2023-09-26T18:07:15Z Running DeadStoreElimination\n", + "2023-09-26T18:07:23Z DeadStoreElimination finished after 8.127 seconds\n", + "2023-09-26T18:07:23Z Running Simplifier\n", + "2023-09-26T18:07:23Z Simplifier finished after 0.539 seconds\n", + "2023-09-26T18:07:23Z Running LICM\n", + "2023-09-26T18:07:23Z LICM finished after 0.119 seconds\n", + "2023-09-26T18:07:23Z Running Delinearization\n", + "2023-09-26T18:07:23Z Delinearization finished after 0.055 seconds\n", + "2023-09-26T18:07:23Z Running LoopFusion\n", + "2023-09-26T18:07:24Z LoopFusion finished after 0.065 seconds\n", + "2023-09-26T18:07:24Z Running SimplifySlice\n", + "2023-09-26T18:07:24Z SimplifySlice finished after 0.050 seconds\n", + "2023-09-26T18:07:24Z Running LICM\n", + "2023-09-26T18:07:24Z LICM finished after 0.078 seconds\n", + "2023-09-26T18:07:24Z Running Simplifier\n", + "2023-09-26T18:07:25Z Simplifier finished after 1.097 seconds\n", + "2023-09-26T18:07:25Z Running ValueNumbering\n", + 
"2023-09-26T18:07:25Z ValueNumbering finished after 0.117 seconds\n", + "2023-09-26T18:07:25Z Running LICM\n", + "2023-09-26T18:07:25Z LICM finished after 0.078 seconds\n", + "2023-09-26T18:07:25Z Running PadElimination\n", + "2023-09-26T18:07:25Z PadElimination finished after 0.004 seconds\n", + "2023-09-26T18:07:25Z Running Delinearization\n", + "2023-09-26T18:07:25Z Delinearization finished after 0.055 seconds\n", + "2023-09-26T18:07:25Z Running LoopFusion\n", + "2023-09-26T18:07:25Z LoopFusion finished after 0.065 seconds\n", + "2023-09-26T18:07:25Z Running GenericAccessSimplifier\n", + "2023-09-26T18:07:25Z GenericAccessSimplifier finished after 0.053 seconds\n", + "2023-09-26T18:07:25Z Running Simplifier\n", + "2023-09-26T18:07:26Z Simplifier finished after 0.547 seconds\n", + "2023-09-26T18:07:26Z Running LICM\n", + "2023-09-26T18:07:26Z LICM finished after 0.077 seconds\n", + "2023-09-26T18:07:26Z Running ValueNumbering\n", + "2023-09-26T18:07:26Z ValueNumbering finished after 0.102 seconds\n", + "2023-09-26T18:07:26Z Running TCTransform\n", + "2023-09-26T18:07:26Z TCTransform finished after 0.048 seconds\n", + "2023-09-26T18:07:26Z Running CommuteConcat\n", + "2023-09-26T18:07:26Z CommuteConcat finished after 0.052 seconds\n", + "2023-09-26T18:07:26Z Running RecognizeOpIdiom\n", + "2023-09-26T18:07:26Z RecognizeOpIdiom finished after 0.274 seconds\n", + "2023-09-26T18:07:26Z Running MaskPropagation\n", + "2023-09-26T18:07:26Z MaskPropagation finished after 0.176 seconds\n", + "2023-09-26T18:07:26Z Running Recompute\n", + "2023-09-26T18:07:26Z Recompute finished after 0.007 seconds\n", + "2023-09-26T18:07:26Z Running DeadCodeElimination\n", + "2023-09-26T18:07:27Z DeadCodeElimination finished after 0.056 seconds\n", + "2023-09-26T18:07:27Z Running DoNothing\n", + "2023-09-26T18:07:27Z DoNothing finished after 0.000 seconds\n", + "2023-09-26T18:07:27Z Running MutateDataType\n", + "2023-09-26T18:07:27Z MutateDataType finished after 0.038 seconds\n", + "2023-09-26T18:07:27Z Running AutoCastTCInputs\n", + "2023-09-26T18:07:27Z AutoCastTCInputs finished after 0.451 seconds\n", + "2023-09-26T18:07:27Z Running GenericAccessSimplifier\n", + "2023-09-26T18:07:27Z GenericAccessSimplifier finished after 0.067 seconds\n", + "2023-09-26T18:07:27Z Running Simplifier\n", + "2023-09-26T18:07:28Z Simplifier finished after 0.612 seconds\n", + "2023-09-26T18:07:28Z Running DelinearIndices\n", + "2023-09-26T18:07:28Z DelinearIndices finished after 0.128 seconds\n", + "2023-09-26T18:07:28Z Running Delinearization\n", + "2023-09-26T18:07:28Z Delinearization finished after 0.054 seconds\n", + "2023-09-26T18:07:28Z Running DelinearIndices\n", + "2023-09-26T18:07:28Z DelinearIndices finished after 0.124 seconds\n", + "2023-09-26T18:07:28Z Running DeadCodeElimination\n", + "2023-09-26T18:07:28Z DeadCodeElimination finished after 0.072 seconds\n", + "2023-09-26T18:07:28Z Running InferIntrinsicOnCC\n", + "2023-09-26T18:07:28Z InferIntrinsicOnCC finished after 0.226 seconds\n", + "2023-09-26T18:07:28Z Running ResolveAccessConflict\n", + "2023-09-26T18:07:29Z ResolveAccessConflict finished after 0.184 seconds\n", + "2023-09-26T18:07:29Z Running LICM\n", + "2023-09-26T18:07:29Z LICM finished after 0.101 seconds\n", + "2023-09-26T18:07:29Z Running LocalLayoutOpt\n", + "2023-09-26T18:07:29Z LocalLayoutOpt finished after 0.242 seconds\n", + "2023-09-26T18:07:29Z Running DelinearIndices\n", + "2023-09-26T18:07:29Z DelinearIndices finished after 0.129 seconds\n", + "2023-09-26T18:07:29Z Running 
OrigLayoutTilingPipeline\n", + "2023-09-26T18:07:29Z Running GlobalLayoutOpt\n", + "2023-09-26T18:07:30Z GlobalLayoutOpt finished after 0.792 seconds\n", + "2023-09-26T18:07:30Z Running CanonicalizeDAG\n", + "2023-09-26T18:07:30Z CanonicalizeDAG finished after 0.086 seconds\n", + "2023-09-26T18:07:30Z Running FlattenAxesForTiling\n", + "2023-09-26T18:07:30Z FlattenAxesForTiling finished after 0.016 seconds\n", + "2023-09-26T18:07:30Z Running SundaSizeTiling\n", + "2023-09-26T18:07:36Z SundaSizeTiling finished after 5.869 seconds\n", + "2023-09-26T18:07:36Z OrigLayoutTilingPipeline finished after 6.777 seconds\n", + "2023-09-26T18:07:36Z Running TilingProfiler\n", + "2023-09-26T18:07:36Z TilingProfiler finished after 0.343 seconds\n", + "2023-09-26T18:07:36Z Running FlattenMacroLoop\n", + "2023-09-26T18:07:36Z FlattenMacroLoop finished after 0.071 seconds\n", + "2023-09-26T18:07:36Z Running InferTongaTensor\n", + "2023-09-26T18:07:38Z InferTongaTensor finished after 1.729 seconds\n", + "2023-09-26T18:07:38Z Running TongaSimplifier\n", + "2023-09-26T18:07:39Z TongaSimplifier finished after 1.406 seconds\n", + "2023-09-26T18:07:39Z Running LICM\n", + "2023-09-26T18:07:40Z LICM finished after 0.209 seconds\n", + "2023-09-26T18:07:40Z Running RewriteReplicationMatmul\n", + "2023-09-26T18:07:40Z RewriteReplicationMatmul finished after 0.094 seconds\n", + "2023-09-26T18:07:40Z Running FlattenMacroLoop\n", + "2023-09-26T18:07:40Z FlattenMacroLoop finished after 0.081 seconds\n", + "2023-09-26T18:07:40Z Running SimplifyMacroPredicates\n", + "2023-09-26T18:07:41Z SimplifyMacroPredicates finished after 1.093 seconds\n", + "2023-09-26T18:07:41Z Running DataLocalityOpt\n", + "2023-09-26T18:08:00Z DataLocalityOpt finished after 19.166 seconds\n", + "2023-09-26T18:08:00Z Running TongaSimplifier\n", + "2023-09-26T18:08:01Z TongaSimplifier finished after 0.713 seconds\n", + "2023-09-26T18:08:01Z Running LegalizeSundaMacro\n", + "2023-09-26T18:08:01Z LegalizeSundaMacro finished after 0.489 seconds\n", + "2023-09-26T18:08:01Z Running TongaSimplifier\n", + "2023-09-26T18:08:02Z TongaSimplifier finished after 0.708 seconds\n", + "2023-09-26T18:08:02Z Running PerfectLoopNest\n", + "2023-09-26T18:08:02Z PerfectLoopNest finished after 0.181 seconds\n", + "2023-09-26T18:08:02Z Running FlattenMacroLoop\n", + "2023-09-26T18:08:05Z FlattenMacroLoop finished after 2.908 seconds\n", + "2023-09-26T18:08:05Z Running RewriteWeights\n", + "2023-09-26T18:08:06Z RewriteWeights finished after 0.470 seconds\n", + "2023-09-26T18:08:06Z Running ReshapeWeights\n", + "2023-09-26T18:08:06Z ReshapeWeights finished after 0.059 seconds\n", + "2023-09-26T18:08:06Z Running FlattenMacroLoop\n", + "2023-09-26T18:08:06Z FlattenMacroLoop finished after 0.136 seconds\n", + "2023-09-26T18:08:06Z Running SimplifyMacroPredicates\n", + "2023-09-26T18:08:08Z SimplifyMacroPredicates finished after 1.793 seconds\n", + "2023-09-26T18:08:08Z Running InferInitValue\n", + "2023-09-26T18:08:16Z InferInitValue finished after 8.969 seconds\n", + "2023-09-26T18:08:16Z Running TongaSimplifier\n", + "2023-09-26T18:08:17Z TongaSimplifier finished after 0.687 seconds\n", + "2023-09-26T18:08:17Z Running SimplifyTensor\n", + "2023-09-26T18:08:18Z SimplifyTensor finished after 1.308 seconds\n", + "2023-09-26T18:08:18Z Running LICM\n", + "2023-09-26T18:08:19Z LICM finished after 0.276 seconds\n", + "2023-09-26T18:08:19Z Running SundaISel\n", + "2023-09-26T18:08:25Z SundaISel finished after 5.793 seconds\n", + "2023-09-26T18:08:25Z Running TongaLoopInterchange\n", 
+ "2023-09-26T18:08:25Z TongaLoopInterchange finished after 0.224 seconds\n", + "2023-09-26T18:08:25Z Running TongaSimplifyPredicates\n", + "2023-09-26T18:08:25Z TongaSimplifyPredicates finished after 0.080 seconds\n", + "2023-09-26T18:08:25Z Running TongaLoopFusion\n", + "2023-09-26T18:08:26Z TongaLoopFusion finished after 0.963 seconds\n", + "2023-09-26T18:08:26Z Running TongaLoopInterchange\n", + "2023-09-26T18:08:26Z TongaLoopInterchange finished after 0.078 seconds\n", + "2023-09-26T18:08:26Z Running TongaLICM\n", + "2023-09-26T18:08:26Z TongaLICM finished after 0.251 seconds\n", + "2023-09-26T18:08:26Z Running FactorizeBlkDims\n", + "2023-09-26T18:08:27Z FactorizeBlkDims finished after 0.506 seconds\n", + "2023-09-26T18:08:27Z Running TongaInstComb\n", + "2023-09-26T18:08:28Z TongaInstComb finished after 1.019 seconds\n", + "2023-09-26T18:08:28Z Running TongaValueNumbering\n", + "2023-09-26T18:08:28Z TongaValueNumbering finished after 0.586 seconds\n", + "2023-09-26T18:08:28Z Running TongaInstComb\n", + "2023-09-26T18:08:28Z TongaInstComb finished after 0.099 seconds\n", + "2023-09-26T18:08:28Z Running VectorizeDMA\n", + "2023-09-26T18:08:28Z VectorizeDMA finished after 0.038 seconds\n", + "2023-09-26T18:08:28Z Running TongaSimplifyPredicates\n", + "2023-09-26T18:08:28Z TongaSimplifyPredicates finished after 0.036 seconds\n", + "2023-09-26T18:08:28Z Running LegalizePartitionReduce\n", + "2023-09-26T18:08:29Z LegalizePartitionReduce finished after 0.043 seconds\n", + "2023-09-26T18:08:29Z Running DeConcat\n", + "2023-09-26T18:08:29Z DeConcat finished after 0.812 seconds\n", + "2023-09-26T18:08:29Z Running PartialSimdFusion\n", + "2023-09-26T18:08:30Z PartialSimdFusion finished after 0.693 seconds\n", + "2023-09-26T18:08:30Z Running TritiumFusion\n", + "2023-09-26T18:09:14Z TritiumFusion finished after 44.188 seconds\n", + "2023-09-26T18:09:14Z Running CCOpFusion\n", + "2023-09-26T18:09:16Z CCOpFusion finished after 1.418 seconds\n", + "2023-09-26T18:09:16Z Running VectorizeMatMult\n", + "2023-09-26T18:10:20Z VectorizeMatMult finished after 64.772 seconds\n", + "2023-09-26T18:10:20Z Running PartialLoopFusion\n", + "2023-09-26T18:10:24Z PartialLoopFusion finished after 3.691 seconds\n", + "2023-09-26T18:10:24Z Running TongaLICM\n", + "2023-09-26T18:10:24Z TongaLICM finished after 0.145 seconds\n", + "2023-09-26T18:10:24Z Running LowerTranspose\n", + "2023-09-26T18:10:24Z LowerTranspose finished after 0.043 seconds\n", + "2023-09-26T18:10:24Z Running LateTongaInstComb\n", + "2023-09-26T18:10:24Z LateTongaInstComb finished after 0.117 seconds\n", + "2023-09-26T18:10:24Z Running SplitAccGrp\n", + "2023-09-26T18:10:24Z SplitAccGrp finished after 0.039 seconds\n", + "2023-09-26T18:10:24Z Running SpillPSum\n", + "2023-09-26T18:10:25Z SpillPSum finished after 0.577 seconds\n", + "2023-09-26T18:10:25Z Running LowerIntrinsics\n", + "2023-09-26T18:10:25Z LowerIntrinsics finished after 0.035 seconds\n", + "2023-09-26T18:10:25Z Running LegalizeType\n", + "2023-09-26T18:10:25Z LegalizeType finished after 0.053 seconds\n", + "2023-09-26T18:10:25Z Running TongaLICM\n", + "2023-09-26T18:10:25Z TongaLICM finished after 0.142 seconds\n", + "2023-09-26T18:10:25Z Running InferPSumTensor\n", + "2023-09-26T18:10:26Z InferPSumTensor finished after 0.450 seconds\n", + "2023-09-26T18:10:26Z Running WeightCoalescing\n", + "2023-09-26T18:10:26Z WeightCoalescing finished after 0.036 seconds\n", + "2023-09-26T18:10:26Z Running LegalizeSundaAccess\n", + "2023-09-26T18:10:26Z LegalizeSundaAccess finished after 0.211 
seconds\n", + "2023-09-26T18:10:26Z Running RelaxPredicates\n", + "2023-09-26T18:10:26Z RelaxPredicates finished after 0.077 seconds\n", + "2023-09-26T18:10:26Z Running TensorInitialization\n", + "2023-09-26T18:10:26Z TensorInitialization finished after 0.030 seconds\n", + "2023-09-26T18:10:26Z Running TongaSimplifyPredicates\n", + "2023-09-26T18:10:26Z TongaSimplifyPredicates finished after 0.031 seconds\n", + "2023-09-26T18:10:26Z Running ExpandISAMacro\n", + "2023-09-26T18:10:26Z ExpandISAMacro finished after 0.036 seconds\n", + "2023-09-26T18:10:26Z Running SimplifyTongaTensor\n", + "2023-09-26T18:10:26Z SimplifyTongaTensor finished after 0.169 seconds\n", + "2023-09-26T18:10:26Z Running DMALocalityOpt\n", + "2023-09-26T18:10:26Z DMALocalityOpt finished after 0.015 seconds\n", + "2023-09-26T18:10:26Z Running DataStreaming\n", + "2023-09-26T18:10:27Z DataStreaming finished after 0.169 seconds\n", + "2023-09-26T18:10:27Z Running SFKVectorizer\n", + "2023-09-26T18:10:38Z SFKVectorizer finished after 11.174 seconds\n", + "2023-09-26T18:10:38Z Running LateLegalizeInst\n", + "2023-09-26T18:10:38Z LateLegalizeInst finished after 0.052 seconds\n", + "2023-09-26T18:10:38Z Running CoalesceCCOp\n", + "2023-09-26T18:10:38Z CoalesceCCOp finished after 0.051 seconds\n", + "2023-09-26T18:10:38Z Running SimpleAllReduceTiling\n", + "2023-09-26T18:10:38Z SimpleAllReduceTiling finished after 0.050 seconds\n", + "2023-09-26T18:10:38Z Running StaticProfiler\n", + "2023-09-26T18:10:38Z StaticProfiler finished after 0.124 seconds\n", + "2023-09-26T18:10:38Z Running SplitAPUnionSets\n", + "2023-09-26T18:10:38Z SplitAPUnionSets finished after 0.069 seconds\n", + "2023-09-26T18:10:38Z Running DumpGraphAndMetadata\n", + "2023-09-26T18:10:38Z DumpGraphAndMetadata finished after 0.116 seconds\n", + "2023-09-26T18:10:38Z Running BirCodeGenLoop\n", + "2023-09-26T18:10:39Z BirCodeGenLoop finished after 1.000 seconds\n", + "2023-09-26T18:10:43Z Running birverifier\n", + "2023-09-26T18:10:44Z birverifier finished after 0.262 seconds\n", + "2023-09-26T18:10:44Z Running expand_replication\n", + "2023-09-26T18:10:44Z expand_replication finished after 0.001 seconds\n", + "2023-09-26T18:10:44Z Running unroll\n", + "2023-09-26T18:10:45Z unroll finished after 1.000 seconds\n", + "2023-09-26T18:10:45Z Running error_injector\n", + "2023-09-26T18:10:45Z error_injector finished after 0.001 seconds\n", + "2023-09-26T18:10:45Z Running constant_propagate\n", + "2023-09-26T18:10:45Z constant_propagate finished after 0.208 seconds\n", + "2023-09-26T18:10:45Z Running vn_splitter\n", + "2023-09-26T18:10:45Z vn_splitter finished after 0.107 seconds\n", + "2023-09-26T18:10:45Z Running lower_ac\n", + "2023-09-26T18:10:45Z lower_ac finished after 0.021 seconds\n", + "2023-09-26T18:10:45Z Running pre_sched\n", + "2023-09-26T18:10:46Z pre_sched finished after 0.881 seconds\n", + "2023-09-26T18:10:46Z Running mm_packing\n", + "2023-09-26T18:10:46Z mm_packing finished after 0.198 seconds\n", + "2023-09-26T18:10:46Z Running coloring_allocator_psum\n", + "2023-09-26T18:10:47Z coloring_allocator_psum finished after 0.592 seconds\n", + "2023-09-26T18:10:47Z Running dma_optimization_psum\n", + "2023-09-26T18:10:47Z dma_optimization_psum finished after 0.132 seconds\n", + "2023-09-26T18:10:47Z Running address_rotation_psum\n", + "2023-09-26T18:10:47Z address_rotation_psum finished after 0.522 seconds\n", + "2023-09-26T18:10:47Z Running coloring_allocator_sb\n", + "2023-09-26T18:10:48Z coloring_allocator_sb finished after 0.663 seconds\n", + 
"2023-09-26T18:10:48Z Running dma_optimization_sb\n", + "2023-09-26T18:10:49Z dma_optimization_sb finished after 0.714 seconds\n", + "2023-09-26T18:10:49Z Running address_rotation_sb\n", + "2023-09-26T18:10:49Z address_rotation_sb finished after 0.703 seconds\n", + "2023-09-26T18:10:49Z Running coloring_allocator_dram\n", + "2023-09-26T18:10:50Z coloring_allocator_dram finished after 0.325 seconds\n", + "2023-09-26T18:10:50Z Running address_rotation_dram\n", + "2023-09-26T18:10:50Z address_rotation_dram finished after 0.115 seconds\n", + "2023-09-26T18:10:50Z Running tensorcopy_accel\n", + "2023-09-26T18:10:50Z tensorcopy_accel finished after 0.012 seconds\n", + "2023-09-26T18:10:50Z Running peephole_opts\n", + "2023-09-26T18:10:50Z peephole_opts finished after 0.153 seconds\n", + "2023-09-26T18:10:50Z Running lower_kernel\n", + "2023-09-26T18:10:50Z lower_kernel finished after 0.016 seconds\n", + "2023-09-26T18:10:50Z Running build_fdeps\n", + "2023-09-26T18:10:50Z build_fdeps finished after 0.306 seconds\n", + "2023-09-26T18:10:50Z Running remove_redundancies\n", + "2023-09-26T18:10:50Z remove_redundancies finished after 0.075 seconds\n", + "2023-09-26T18:10:50Z Running anti_dependency_analyzer\n", + "2023-09-26T18:10:51Z anti_dependency_analyzer finished after 0.547 seconds\n", + "2023-09-26T18:10:51Z Running post_sched\n", + "2023-09-26T18:10:54Z post_sched finished after 2.584 seconds\n", + "2023-09-26T18:10:54Z Running address_rotation_sb\n", + "2023-09-26T18:10:56Z address_rotation_sb finished after 1.982 seconds\n", + "2023-09-26T18:10:56Z Running anti_dependency_analyzer\n", + "2023-09-26T18:10:56Z anti_dependency_analyzer finished after 0.531 seconds\n", + "2023-09-26T18:10:56Z Running dep_opt\n", + "2023-09-26T18:10:57Z dep_opt finished after 0.753 seconds\n", + "2023-09-26T18:10:57Z Running report_stats\n", + "2023-09-26T18:10:57Z report_stats finished after 0.037 seconds\n", + "2023-09-26T18:10:57Z Running assign_trigger_engine\n", + "2023-09-26T18:10:57Z assign_trigger_engine finished after 0.055 seconds\n", + "2023-09-26T18:10:57Z Running alloc_queues\n", + "2023-09-26T18:10:57Z alloc_queues finished after 0.014 seconds\n", + "2023-09-26T18:10:57Z Running dep_reduction\n", + "2023-09-26T18:10:58Z dep_reduction finished after 0.807 seconds\n", + "2023-09-26T18:10:58Z Running bir_racecheck\n", + "2023-09-26T18:10:59Z bir_racecheck finished after 0.562 seconds\n", + "2023-09-26T18:10:59Z Running lower_dma\n", + "2023-09-26T18:10:59Z lower_dma finished after 0.047 seconds\n", + "2023-09-26T18:10:59Z Running coalesce_dma_blocks\n", + "2023-09-26T18:10:59Z coalesce_dma_blocks finished after 0.080 seconds\n", + "2023-09-26T18:10:59Z Running alloc_semaphores\n", + "2023-09-26T18:10:59Z alloc_semaphores finished after 0.114 seconds\n", + "2023-09-26T18:10:59Z Running expand_inst_late\n", + "2023-09-26T18:10:59Z expand_inst_late finished after 0.019 seconds\n", + "2023-09-26T18:10:59Z Running lower_sync\n", + "2023-09-26T18:10:59Z lower_sync finished after 0.022 seconds\n", + "2023-09-26T18:10:59Z Running lower_act\n", + "2023-09-26T18:10:59Z lower_act finished after 0.079 seconds\n", + "2023-09-26T18:10:59Z Running lower_dve\n", + "2023-09-26T18:10:59Z lower_dve finished after 0.214 seconds\n", + "2023-09-26T18:10:59Z Running lower_ap\n", + "2023-09-26T18:10:59Z lower_ap finished after 0.034 seconds\n", + "2023-09-26T18:11:00Z Running alloc_regs\n", + "2023-09-26T18:11:00Z alloc_regs finished after 0.003 seconds\n", + "2023-09-26T18:11:00Z Running birverifier\n", + 
"2023-09-26T18:11:00Z birverifier finished after 0.318 seconds\n", + "2023-09-26T18:11:00Z Running codegen\n", + "2023-09-26T18:11:02Z codegen finished after 1.596 seconds\n", + "2023-09-26T18:11:02Z Running neff_packager\n", + "2023-09-26T18:11:02Z neff_packager finished after 0.041 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "2023-09-26T18:11:03Z Wrote /tmp/tmpxumd1tly/graph.neff\n", + "2023-09-26T18:11:03Z Compiler status PASS\n" + ] + } + ], + "source": [ + "#Compile model\n", + "with ubench_utils.Timer() as compilation_time:\n", + " trace = torch_neuronx.trace(model, \n", + " matrix_cpu, \n", + " compiler_args=NEURON_CC_FLAGS)\n", + "\n", + "# Save model to disk \n", + "torch.jit.save(trace, 'model.pt')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + " and execution:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Load model on NeuronCore\n", + "neuron_model = torch.jit.load('model.pt')\n", + "\n", + "# Warmup\n", + "with ubench_utils.Timer() as warmup_model_time:\n", + " out = neuron_model(matrix_cpu)\n", + "\n", + "# Timed run\n", + "with ubench_utils.Timer() as benchmark_time:\n", + " for i in range(N_TIMED_ITERATIONS):\n", + " out = neuron_model(matrix_cpu)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then report time taken for each step:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Compilation took 251.4736s, warmup took 0.0284s, benchmark took 27.4398s\n" + ] + } + ], + "source": [ + "print(\"\"\"Compilation took {:.4f}s, warmup took {:.4f}s, benchmark took {:.4f}s\"\"\"\n", + " .format(compilation_time(), \n", + " warmup_model_time(), \n", + " benchmark_time())) " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the timed run, we can calculate how much time each execution took, and what is the achieved performance:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Timed run: overall runtime = 27.4398s, runtime per iteration = 0.0274398s, timed iterations = 1000\n", + "PE TOPS = 78.2617\n" + ] + } + ], + "source": [ + "print(\"Timed run: overall runtime = {:2g}s, runtime per iteration = {:2g}s, timed iterations = {}\"\n", + " .format(benchmark_time(),\n", + " benchmark_time() / N_TIMED_ITERATIONS, N_TIMED_ITERATIONS))\n", + "\n", + "# Total operation count\n", + "top_per_run = BATCH_SIZE*(MATRIX_DIM**3)*N_TIMED_ITERATIONS*LOOP_COUNT*2\n", + "# Tera operations per second (TOPS)\n", + "tops = (top_per_run/benchmark_time())/1e12\n", + "print(\"PE TOPS = {:2g}\".format(tops))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Full example\n", + "\n", + "A complete, parametrizable example of matrix multiplication benchmarks is in [matmult_linear.py](matmult_linear.py). It allows setting the batch size, matrix size, loop and iteration count, as well as additional parameters (listed using `python matmult_linear.py -h`). 
Example usage:\n", + "\n", + "```\n", + "python matmult_linear.py --batch_size 1 --matrix_dim 1024 --loop_count 1000 --num_warmup_iterations 2 --num_timed_iterations 1000 --add_bias\n", + "```\n", + "\n", + "If you ran the code in this notebook, please terminate it before attempting to run any other code on the Neuron devices." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Benchmarking other workloads\n", + "\n", + "The methodology presented above can be extended to other workloads (even full models), using the following steps:\n", + "\n", + "- Modify the `class Matmult` to reflect your workload.\n", + "- Modify the parameters (e.g. `BATCH_SIZE`, `MATRIX_DIM`) to reflect your workload.\n", + "- Modify the input (e.g. `matrix_cpu`) as necessary for your workload.\n", + "- Modify the `top_per_run` formula according to your workload." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/ai-ml/jupyterhub/examples/notebook-examples/pyspark.ipynb b/ai-ml/jupyterhub/examples/notebook-examples/pyspark.ipynb new file mode 100644 index 000000000..8ce181010 --- /dev/null +++ b/ai-ml/jupyterhub/examples/notebook-examples/pyspark.ipynb @@ -0,0 +1,93 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Import SparkSession\n", + "from pyspark.sql import SparkSession\n", + "\n", + "# Create SparkSession \n", + "spark = SparkSession.builder \\\n", + " .master(\"local[1]\") \\\n", + " .appName(\"PySparkJupyterHub\") \\\n", + " .getOrCreate() \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "data = [('James','','Smith','1991-04-01','M',3000),\n", + " ('Michael','Rose','','2000-05-19','M',4000),\n", + " ('Robert','','Williams','1978-09-05','M',4000),\n", + " ('Maria','Anne','Jones','1967-12-01','F',4000),\n", + " ('Jen','Mary','Brown','1980-02-17','F',-1)\n", + "]\n", + "\n", + "columns = [\"firstname\",\"middlename\",\"lastname\",\"dob\",\"gender\",\"salary\"]\n", + "df = spark.createDataFrame(data=data, schema = columns)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.createOrReplaceTempView(\"PERSON_DATA\")\n", + "df2 = spark.sql(\"SELECT * from PERSON_DATA where salary >= 4000\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df2.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df2.show()" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/ai-ml/jupyterhub/examples/notebook-examples/sample-gpu-tensorflow.ipynb 
b/ai-ml/jupyterhub/examples/notebook-examples/sample-gpu-tensorflow.ipynb deleted file mode 100644 index 5497bae78..000000000 --- a/ai-ml/jupyterhub/examples/notebook-examples/sample-gpu-tensorflow.ipynb +++ /dev/null @@ -1,139 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! python3 -m pip install --upgrade tensorrt\n", - "! pip3 install matplotlib" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow as tf\n", - "import time\n", - "import matplotlib.pyplot as plt\n", - "\n", - "# Configuration\n", - "matrix_size = 1000 # Increase this value for larger tensors (takes longer time)\n", - "\n", - "def matrix_multiply_cpu():\n", - " # Create two large random matrices\n", - " matrix_a = tf.random.normal(shape=(matrix_size, matrix_size))\n", - " matrix_b = tf.random.normal(shape=(matrix_size, matrix_size))\n", - "\n", - " # Perform matrix multiplication on CPU\n", - " start_time = time.time()\n", - " result = tf.matmul(matrix_a, matrix_b)\n", - " end_time = time.time()\n", - " return end_time - start_time\n", - "\n", - "def matrix_multiply_gpu():\n", - " # Create two large random matrices\n", - " matrix_a = tf.random.normal(shape=(matrix_size, matrix_size))\n", - " matrix_b = tf.random.normal(shape=(matrix_size, matrix_size))\n", - "\n", - " # Perform matrix multiplication on GPU\n", - " with tf.device('/GPU:0'):\n", - " start_time = time.time()\n", - " result = tf.matmul(matrix_a, matrix_b)\n", - " end_time = time.time()\n", - " return end_time - start_time\n", - "\n", - "# Main function\n", - "def main():\n", - " cpu_times = []\n", - " gpu_times = []\n", - "\n", - " for _ in range(5): # Repeat the operation multiple times for better comparison\n", - " cpu_time = matrix_multiply_cpu()\n", - " gpu_time = matrix_multiply_gpu()\n", - "\n", - " cpu_times.append(cpu_time)\n", - " gpu_times.append(gpu_time)\n", - "\n", - " # Print the average times\n", - " print(\"Average time for CPU:\", sum(cpu_times) / len(cpu_times), \"seconds\")\n", - " print(\"Average time for GPU:\", sum(gpu_times) / len(gpu_times), \"seconds\")\n", - "\n", - " # Plot the results\n", - " plt.bar([\"CPU\", \"GPU\"], [sum(cpu_times) / len(cpu_times), sum(gpu_times) / len(gpu_times)])\n", - " plt.xlabel(\"Device\")\n", - " plt.ylabel(\"Average Time (s)\")\n", - " plt.title(\"Performance Comparison: CPU vs. 
GPU\")\n", - " plt.show()\n", - "\n", - "if __name__ == \"__main__\":\n", - " main()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow as tf\n", - "from sklearn.datasets import load_iris\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.metrics import accuracy_score\n", - "\n", - "# Check if TensorFlow is using the GPU\n", - "print(\"TensorFlow is using GPU:\", tf.test.is_gpu_available())\n", - "\n", - "# Load Iris dataset\n", - "iris = load_iris()\n", - "X, y = iris.data, iris.target\n", - "\n", - "# Split the data into training and testing sets\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", - "\n", - "# Scale the data to have zero mean and unit variance\n", - "scaler = StandardScaler()\n", - "X_train = scaler.fit_transform(X_train)\n", - "X_test = scaler.transform(X_test)\n", - "\n", - "# Define the neural network model\n", - "model = tf.keras.Sequential([\n", - " tf.keras.layers.Dense(10, activation='relu', input_shape=(4,)),\n", - " tf.keras.layers.Dense(3, activation='softmax')\n", - "])\n", - "\n", - "# Compile the model\n", - "model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n", - "\n", - "# Train the model\n", - "epochs = 50\n", - "batch_size = 16\n", - "model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)\n", - "\n", - "# Evaluate the model on the test set\n", - "loss, accuracy = model.evaluate(X_test, y_test)\n", - "print(\"Test Accuracy:\", accuracy)\n", - "\n", - "# Make predictions on the test set\n", - "y_pred_probabilities = model.predict(X_test)\n", - "y_pred = tf.argmax(y_pred_probabilities, axis=1)\n", - "\n", - "# Calculate and print the accuracy on the test set\n", - "test_accuracy = accuracy_score(y_test, y_pred)\n", - "print(\"Test Accuracy (calculated):\", test_accuracy)" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/ai-ml/jupyterhub/examples/test-pods/timeslicing-test.yaml b/ai-ml/jupyterhub/examples/test-pods/timeslicing-test.yaml new file mode 100644 index 000000000..45bf53d14 --- /dev/null +++ b/ai-ml/jupyterhub/examples/test-pods/timeslicing-test.yaml @@ -0,0 +1,36 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: time-slicing-verification + labels: + app: time-slicing-verification +spec: + replicas: 2 + selector: + matchLabels: + app: time-slicing-verification + template: + metadata: + labels: + app: time-slicing-verification + spec: + nodeSelector: + provisioner: gpu-ts + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html + operator: "Equal" + value: "user" + effect: "NoSchedule" + hostPID: true + containers: + - name: cuda-sample-vector-add + image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04" + command: ["/bin/bash", "-c", "--"] + args: + - while true; do /cuda-samples/vectorAdd; done + resources: + limits: + nvidia.com/gpu: 1 diff --git a/ai-ml/jupyterhub/examples/test-pods/ts-test-pod.yaml b/ai-ml/jupyterhub/examples/test-pods/ts-test-pod.yaml deleted file mode 100644 index 02a7bfd62..000000000 --- a/ai-ml/jupyterhub/examples/test-pods/ts-test-pod.yaml +++ /dev/null @@ 
-1,30 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: inflate-gpu-ts - namespace: jupyterhub -spec: - replicas: 1 - selector: - matchLabels: - app: nginx - template: - metadata: - labels: - app: nginx - spec: - nodeSelector: - karpenter.sh/provisioner-name: gpu # Force schedule a node with time slicing support - tolerations: # To tolerate the taint on the nodes - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - key: "hub.jupyter.org/dedicated" - operator: "Equal" - value: "user" - effect: "NoSchedule" - containers: - - name: nginx - image: nginx - ports: - - containerPort: 80 diff --git a/ai-ml/jupyterhub/helm-values/aws-cloudwatch-metrics-values.yaml b/ai-ml/jupyterhub/helm-values/aws-cloudwatch-metrics-values.yaml deleted file mode 100644 index ae3c41d44..000000000 --- a/ai-ml/jupyterhub/helm-values/aws-cloudwatch-metrics-values.yaml +++ /dev/null @@ -1,11 +0,0 @@ -resources: - limits: - cpu: 500m - memory: 2Gi - requests: - cpu: 200m - memory: 1Gi - -# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints. -tolerations: - - operator: Exists diff --git a/ai-ml/jupyterhub/helm-values/jupyterhub-values-cognito-cpu.yaml b/ai-ml/jupyterhub/helm-values/jupyterhub-values-cognito-cpu.yaml deleted file mode 100755 index d4c9b14dc..000000000 --- a/ai-ml/jupyterhub/helm-values/jupyterhub-values-cognito-cpu.yaml +++ /dev/null @@ -1,64 +0,0 @@ -proxy: - https: - enabled: true - type: offload - service: - annotations: - service.beta.kubernetes.io/aws-load-balancer-ssl-cert: ${ssl_cert_arn} - service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https" - service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp" - service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600" - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true' - service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4 - -singleuser: - image: - name: jupyter/base-notebook - tag: latest - storage: - type: "static" - static: - pvcName: "efs-persist" - subPath: "home/{username}" - extraVolumes: - - name: jupyterhub-shared - persistentVolumeClaim: - claimName: efs-persist-shared - extraVolumeMounts: - - name: jupyterhub-shared - mountPath: /home/shared - readOnly: false - serviceAccountName: ${jupyter_single_user_sa_name} - allowPrivilegeEscalation: true # Enable this to execute sudo inside notebook instance - extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account - securityContext: - fsGroup: 100 - extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance - GRANT_SUDO: "yes" - NOTEBOOK_ARGS: "--allow-root" - CHOWN_HOME: "yes" - CHOWN_HOME_OPTS: "-R" - CHOWN_EXTRA: "/home/shared" - uid: 0 - fsGid: 0 - cmd: "start-singleuser.sh" -hub: - config: - GenericOAuthenticator: - oauth_callback_url: ${jupyterdomain} - client_id: ${client_id} - client_secret: ${client_secret} - authorize_url: ${authorize_url} - token_url: ${token_url} - userdata_url: ${userdata_url} - scope: - - openid - - email - username_key: "username" - login_service : "AWS Cognito" - userdata_method: "POST" - JupyterHub: - authenticator_class: generic-oauth diff --git a/ai-ml/jupyterhub/helm-values/jupyterhub-values-cognito-gpu.yaml 
b/ai-ml/jupyterhub/helm-values/jupyterhub-values-cognito-gpu.yaml deleted file mode 100755 index 1bbf18755..000000000 --- a/ai-ml/jupyterhub/helm-values/jupyterhub-values-cognito-gpu.yaml +++ /dev/null @@ -1,102 +0,0 @@ -proxy: - https: - enabled: true - type: offload - service: - annotations: - service.beta.kubernetes.io/aws-load-balancer-ssl-cert: ${ssl_cert_arn} - service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https" - service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp" - service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600" - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true' - service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4 - -singleuser: - profileList: # Enable GPU accelerator for every user - - display_name: "GPU Server" - description: "Spawns a notebook server with access to a GPU" - kubespawner_override: - extra_resource_limits: - nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode - # nvidia.com/mig-1g.5gb: 1 # NVIDIA MIG: Use this config with MIG instead - image: - name: cschranz/gpu-jupyter # Base image with GPU drivers available - tag: v1.5_cuda-11.6_ubuntu-20.04_python-only - extraTolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - cpu: # Amount of CPU and memory assgined to each notebook pod - limit: 2 - guarantee: 2 - memory: - limit: 4G - guarantee: 4G - storage: - type: "static" - static: - pvcName: "efs-persist" - subPath: "home/{username}" - extraVolumes: - - name: jupyterhub-shared - persistentVolumeClaim: - claimName: efs-persist-shared - extraVolumeMounts: - - name: jupyterhub-shared - mountPath: /home/shared - readOnly: false - serviceAccountName: ${jupyter_single_user_sa_name} - allowPrivilegeEscalation: true - extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account - securityContext: - fsGroup: 100 - extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance - GRANT_SUDO: "yes" - NOTEBOOK_ARGS: "--allow-root" - CHOWN_HOME: "yes" - CHOWN_HOME_OPTS: "-R" - CHOWN_EXTRA: "/home/shared" - uid: 0 - fsGid: 0 - cmd: "start-singleuser.sh" - -# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html -scheduling: - userScheduler: - enabled: true - podPriority: - enabled: true - userPlaceholder: - enabled: false - replicas: 1 - userPods: - nodeAffinity: - matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner - -prePuller: # Pre pulling disable since Notebook image is large - hook: - enabled: false - continuous: - # NOTE: if used with Karpenter, also add user-placeholders - enabled: false - -hub: - config: - GenericOAuthenticator: - oauth_callback_url: ${jupyterdomain} - client_id: ${client_id} - client_secret: ${client_secret} - authorize_url: ${authorize_url} - token_url: ${token_url} - userdata_url: ${userdata_url} - scope: - - openid - - email - username_key: "username" - login_service : "AWS Cognito" - userdata_method: "POST" - JupyterHub: - authenticator_class: generic-oauth diff --git a/ai-ml/jupyterhub/helm-values/jupyterhub-values-dummy-cpu.yaml b/ai-ml/jupyterhub/helm-values/jupyterhub-values-dummy-cpu.yaml deleted file mode 
100755 index 32ebe0902..000000000 --- a/ai-ml/jupyterhub/helm-values/jupyterhub-values-dummy-cpu.yaml +++ /dev/null @@ -1,47 +0,0 @@ -proxy: - https: - enabled: true - type: offload - service: - annotations: - service.beta.kubernetes.io/aws-load-balancer-ssl-cert: ${ssl_cert_arn} - service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https" - service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp" - service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600" - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true' - service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4 - -singleuser: - image: - name: jupyter/base-notebook - tag: latest - storage: - type: "static" - static: - pvcName: "efs-persist" - subPath: "home/{username}" - extraVolumes: - - name: jupyterhub-shared - persistentVolumeClaim: - claimName: efs-persist-shared - extraVolumeMounts: - - name: jupyterhub-shared - mountPath: /home/shared - readOnly: false - serviceAccountName: ${jupyter_single_user_sa_name} - allowPrivilegeEscalation: true # Enable this to execute sudo inside notebook instance - extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account - securityContext: - fsGroup: 100 - extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance - GRANT_SUDO: "yes" - NOTEBOOK_ARGS: "--allow-root" - CHOWN_HOME: "yes" - CHOWN_HOME_OPTS: "-R" - CHOWN_EXTRA: "/home/shared" - uid: 0 - fsGid: 0 - cmd: "start-singleuser.sh" diff --git a/ai-ml/jupyterhub/helm-values/jupyterhub-values-dummy-gpu.yaml b/ai-ml/jupyterhub/helm-values/jupyterhub-values-dummy-gpu.yaml deleted file mode 100755 index 9beb3247b..000000000 --- a/ai-ml/jupyterhub/helm-values/jupyterhub-values-dummy-gpu.yaml +++ /dev/null @@ -1,85 +0,0 @@ -proxy: - https: - enabled: false - type: offload - service: - annotations: - service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp" - service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600" - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true' - service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4 -singleuser: - profileList: # Enable GPU accelerator for every user - - display_name: "GPU Server" - description: "Spawns a notebook server with access to a GPU" - kubespawner_override: - extra_resource_limits: - nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode - # nvidia.com/mig-1g.5gb: 1 # NVIDIA MIG: Use this config with MIG instead - image: - name: cschranz/gpu-jupyter # Base image with GPU drivers available - tag: v1.5_cuda-11.6_ubuntu-20.04_python-only - extraTolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - cpu: # Amount of CPU and memory assgined to each notebook pod - limit: 2 - guarantee: 2 - memory: - limit: 4G - guarantee: 4G - storage: - type: "static" - static: - pvcName: "efs-persist" - subPath: "home/{username}" - extraVolumes: - - name: jupyterhub-shared - persistentVolumeClaim: - claimName: efs-persist-shared - 
extraVolumeMounts: - - name: jupyterhub-shared - mountPath: /home/shared - readOnly: false - serviceAccountName: ${jupyter_single_user_sa_name} - allowPrivilegeEscalation: true - extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account - securityContext: - fsGroup: 100 - extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance - GRANT_SUDO: "yes" - NOTEBOOK_ARGS: "--allow-root" - CHOWN_HOME: "yes" - CHOWN_HOME_OPTS: "-R" - CHOWN_EXTRA: "/home/shared" - uid: 0 - fsGid: 0 - cmd: "start-singleuser.sh" - -# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html -scheduling: - userScheduler: - enabled: true - podPriority: - enabled: true - userPlaceholder: - enabled: false - replicas: 1 - userPods: - nodeAffinity: - matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner - - -prePuller: - hook: - enabled: false - continuous: - # NOTE: if used with Karpenter, also add user-placeholders - enabled: false - -global: - safeToShowValues: false diff --git a/ai-ml/jupyterhub/helm/aws-for-fluentbit/values.yaml b/ai-ml/jupyterhub/helm/aws-for-fluentbit/values.yaml new file mode 100644 index 000000000..6990109a0 --- /dev/null +++ b/ai-ml/jupyterhub/helm/aws-for-fluentbit/values.yaml @@ -0,0 +1,80 @@ +global: + +#hostNetwork and dnsPolicy are critical for enabling large clusters to avoid making calls to the API server +# see this link https://docs.fluentbit.io/manual/pipeline/filters/kubernetes#optional-feature-using-kubelet-to-get-metadata +hostNetwork: true +dnsPolicy: ClusterFirstWithHostNet + +service: + parsersFiles: + - /fluent-bit/parsers/parsers.conf + extraParsers: | + [PARSER] + Name kubernetes + Format regex + Regex ^(?[^_]+)\.(?.+)\.(?[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)\.(?[a-z0-9]{64})-$ + +input: + name: "tail" + enabled: true + tag: "systempods....-" + path: "/var/log/containers/*.log" + db: "/var/log/flb_kube.db" + memBufLimit: 5MB + skipLongLines: "On" + refreshInterval: 10 + extraInputs: | + multiline.parser docker, cri + Tag_Regex (?[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?[^_]+)_(?.+)-(?[a-z0-9]{64})\.log$ + + +# NOTE: extraFilters config for using Kubelet to get the Metadata instead of talking to the API server for large clusters +filter: + name: "kubernetes" + match: "systempods.*" + kubeURL: "https://kubernetes.default.svc.cluster.local:443" + mergeLog: "On" + mergeLogKey: "log_processed" + keepLog: "On" + k8sLoggingParser: "On" + k8sLoggingExclude: "Off" + bufferSize: "0" + extraFilters: | + Kube_Tag_Prefix systempods. + Regex_Parser kubernetes + Labels On + Annotations Off + Use_Kubelet true + Kubelet_Port 10250 + Kube_CA_File /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + Kube_Token_File /var/run/secrets/kubernetes.io/serviceaccount/token + +# CAUTION: Do not use the `cloudwatch` plugin. This Golang plugin is no longer recommended by AWS; use the C plugin (`cloudWatchLogs`) instead for better performance. +# cloudWatch: +# enabled: false + +# This is a new high-performance C plugin for CloudWatch Logs. 
See docs here https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch +cloudWatchLogs: + enabled: true + match: "systempods.*" + region: ${region} + logGroupName: ${cloudwatch_log_group} + autoCreateGroup: false + extraOutputs: | + log_key log + +# Resource config for large clusters +resources: + limits: + cpu: 1000m + memory: 1500Mi + requests: + cpu: 500m + memory: 500Mi + +## Assign a PriorityClassName to pods if set +priorityClassName: system-node-critical + +# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints. +tolerations: + - operator: Exists diff --git a/ai-ml/jupyterhub/helm-values/cluster-autoscaler-values.yaml b/ai-ml/jupyterhub/helm/cluster-autoscaler/values.yaml similarity index 100% rename from ai-ml/jupyterhub/helm-values/cluster-autoscaler-values.yaml rename to ai-ml/jupyterhub/helm/cluster-autoscaler/values.yaml diff --git a/ai-ml/jupyterhub/helm-values/coredns-autoscaler-values.yaml b/ai-ml/jupyterhub/helm/coredns-autoscaler/values.yaml similarity index 100% rename from ai-ml/jupyterhub/helm-values/coredns-autoscaler-values.yaml rename to ai-ml/jupyterhub/helm/coredns-autoscaler/values.yaml diff --git a/ai-ml/jupyterhub/helm/efs/Chart.yaml b/ai-ml/jupyterhub/helm/efs/Chart.yaml new file mode 100644 index 000000000..e69ed7f3d --- /dev/null +++ b/ai-ml/jupyterhub/helm/efs/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +name: efs +description: Helm chart for efs options on the cluster +version: 0.0.1 +appVersion: 0.0.1 diff --git a/ai-ml/jupyterhub/helm/efs/templates/efs-pv.yaml b/ai-ml/jupyterhub/helm/efs/templates/efs-pv.yaml new file mode 100644 index 000000000..c10646f80 --- /dev/null +++ b/ai-ml/jupyterhub/helm/efs/templates/efs-pv.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: {{ .Values.pv.name }} +spec: + capacity: + storage: 123Gi + accessModes: + - ReadWriteMany + nfs: + server: {{ .Values.pv.dnsName }} + path: "/" diff --git a/ai-ml/jupyterhub/helm/efs/templates/efs-pvc.yaml b/ai-ml/jupyterhub/helm/efs/templates/efs-pvc.yaml new file mode 100644 index 000000000..cd0a962d9 --- /dev/null +++ b/ai-ml/jupyterhub/helm/efs/templates/efs-pvc.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Values.pvc.name }} +spec: + accessModes: + - ReadWriteMany + storageClassName: "" + resources: + requests: + storage: 1Gi diff --git a/ai-ml/jupyterhub/helm/efs/values.yaml b/ai-ml/jupyterhub/helm/efs/values.yaml new file mode 100644 index 000000000..703735ddd --- /dev/null +++ b/ai-ml/jupyterhub/helm/efs/values.yaml @@ -0,0 +1,5 @@ +pv: + name: efs-persist + dnsName: +pvc: + name: efs-persist diff --git a/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-cognito.yaml b/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-cognito.yaml new file mode 100755 index 000000000..56e33efe7 --- /dev/null +++ b/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-cognito.yaml @@ -0,0 +1,301 @@ +hub: + db: + pvc: + storage: 50Gi + storageClassName: gp3 + authenticatePrometheus: false + command: ["sh", "-c", "pip install boto3 && jupyterhub --config /usr/local/etc/jupyterhub/jupyterhub_config.py"] + config: + GenericOAuthenticator: + oauth_callback_url: ${jupyterdomain} + client_id: ${client_id} + client_secret: ${client_secret} + authorize_url: ${authorize_url} + token_url: ${token_url} + userdata_url: ${userdata_url} + scope: + - openid + - email + username_key: "username" + login_service : "AWS Cognito" + userdata_method: "POST" + JupyterHub: + authenticator_class: generic-oauth + 
extraConfig: + jupyterhub_config.py: |- + c.KubeSpawner.start_timeout = 1200 + c.Authenticator.enable_auth_state = True + + cognito_config.py: |- + import boto3 + def auth_state_hook(spawner, auth_state): + client_idp = boto3.client('cognito-idp', region_name="${region}") + auth_response = client_idp.initiate_auth( + AuthFlow="REFRESH_TOKEN_AUTH", + AuthParameters={ + "REFRESH_TOKEN": auth_state['refresh_token'], + "SECRET_HASH": "${client_secret}" + }, + ClientId="${client_id}" + ) + id_token = auth_response["AuthenticationResult"]["IdToken"] + client_identity = boto3.client("cognito-identity", region_name="${region}") + identity_response = client_identity.get_id( + IdentityPoolId="${identity_pool_id}", + Logins={ + f"cognito-idp.${region}.amazonaws.com/${user_pool_id}": id_token + } + ) + identity_id = identity_response['IdentityId'] + credentials = client_identity.get_credentials_for_identity( + IdentityId=identity_id, + Logins={ + f"cognito-idp.${region}.amazonaws.com/${user_pool_id}": id_token + } + ) + key = credentials["Credentials"]["AccessKeyId"] + secret = credentials["Credentials"]["SecretKey"] + token = credentials["Credentials"]["SessionToken"] + spawner.environment['AWS_ACCESS_KEY_ID'] = key + spawner.environment['AWS_SECRET_ACCESS_KEY'] = secret + spawner.environment['AWS_SESSION_TOKEN'] = token + + c.Spawner.auth_state_hook = auth_state_hook + +proxy: + https: + enabled: true + type: offload + service: + annotations: + service.beta.kubernetes.io/aws-load-balancer-ssl-cert: ${ssl_cert_arn} + service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https" + service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp" + service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600" + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true' + service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4 + +singleuser: + startTimeout: 1200 # 20 mins to spin up a notebook server for GPU including the image pull + profileList: + - display_name: Data Engineering (CPU) + description: "PySpark Notebooks | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pyspark350: + display_name: "PySpark 3.5.0 + Python 3.11" + default: true + kubespawner_override: + image: jupyter/pyspark-notebook:spark-3.5.0 + pyspark341: + display_name: "PySpark 3.4.1 + Python 3.11" + kubespawner_override: + image: jupyter/pyspark-notebook:spark-3.4.1 + kubespawner_override: + node_selector: + NodePool: default + cpu_guarantee: 2 + mem_guarantee: 8G + cpu_limit: 4 + mem_limit: 8G + cmd: null + # NOTE: + - display_name: Trainium (trn1) + description: "Trainium | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pytorch1131: + display_name: "PyTorch 1.13.1 + torch-neuronx" + default: true + kubespawner_override: + image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest + tflow2101: + display_name: "Tensorflow 2.10.1 + tensorflow-neuronx" + kubespawner_override: + image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest + kubespawner_override: + node_selector: + NodePool: trainium + tolerations: + - key: aws.amazon.com/neuroncore + operator: Exists + effect: NoSchedule + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule + - key: "hub.jupyter.org/dedicated" # According to 
optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html + operator: "Equal" + value: "user" + effect: "NoSchedule" + cpu_guarantee: 2 + mem_guarantee: 10G + cpu_limit: 2 + mem_limit: 10G + extra_resource_limits: + aws.amazon.com/neuron: "1" + cmd: "start-singleuser.sh" + - display_name: Inferentia (inf2) + description: "Inferentia | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pytorch1131: + display_name: "PyTorch + torch-neuronx" + default: true + kubespawner_override: + image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest + tflow2101: + display_name: "Tensorflow + tensorflow-neuronx" + kubespawner_override: + image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest + kubespawner_override: + node_selector: + NodePool: inferentia + hub.jupyter.org/node-purpose: user + tolerations: + - key: aws.amazon.com/neuroncore + operator: Exists + effect: NoSchedule + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule + - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html + operator: "Equal" + value: "user" + effect: "NoSchedule" + cpu_guarantee: 20 + mem_guarantee: 100G + cpu_limit: 20 + mem_limit: 100G + extra_resource_limits: + aws.amazon.com/neuron: "1" + cmd: null + - display_name: Data Science (GPU + Time-Slicing - G5) + default: true + description: "GPU Time-Slicing with Single GPU VMs (G5 2x, 4x, 8x, 16x) | nvidia.com/gpu: 1 | Karpenter AutoScaling" + kubespawner_override: + # namespace: data-team-a + image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only + node_selector: + NodePool: gpu-ts # TIME-SLICING: Use this config with time-slicing mode + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html + operator: "Equal" + value: "user" + effect: "NoSchedule" + extra_resource_limits: + nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode + cpu_limit: 2 + mem_limit: 4G + cpu_guarantee: 2 + mem_guarantee: 4G + cmd: "start-singleuser.sh" + # Karpenter doesn't support requesting resources with MIG slices e.g., nvidia.com/mig-1g.5gb: 1, or nvidia.com/mig-2g.20gb: 1 etc. 
+ # Hence, this profile relies on Managed node groups with GPU MIG enabled + - display_name: Data Science (GPU + MIG on P4d.24xlarge) + description: "GPU MIG with P4d instances | nvidia.com/mig-1g.5gb: 1 | Cluster Autoscaler" + kubespawner_override: + image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only + node_selector: + provisioner: cluster-autoscaler + node.kubernetes.io/instance-type: p4d.24xlarge + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html + operator: "Equal" + value: "user" + effect: "NoSchedule" + extra_resource_guarantees: + nvidia.com/mig-1g.5gb: 1 # or nvidia.com/mig-2g.10gb OR nvidia.com/mig-3g.20gb + # extra_resource_limits: + # nvidia.com/gpu: "8" # TIME-SLICING: Use a slice of GPU using time-slicing mode + cpu_guarantee: 2 + mem_guarantee: 10G + cpu_limit: 2 + mem_limit: 10G + cmd: "start-singleuser.sh" + - display_name: Data Science (GPU - P4d.24xlarge) + description: "GPU with P4d instances | Karpenter Autoscaler" + kubespawner_override: + image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only + node_selector: + node.kubernetes.io/instance-type: p4d.24xlarge + NodePool: gpu + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html + operator: "Equal" + value: "user" + effect: "NoSchedule" + extra_resource_limits: + nvidia.com/gpu: "8" + cpu_guarantee: 2 + mem_guarantee: 10G + cpu_limit: 2 + mem_limit: 10G + cmd: "start-singleuser.sh" + storage: + type: "static" + static: + pvcName: "efs-persist" + subPath: "home/{username}" + extraVolumes: + - name: jupyterhub-shared + persistentVolumeClaim: + claimName: efs-persist-shared + extraVolumeMounts: + - name: jupyterhub-shared + mountPath: /home/shared + readOnly: false + serviceAccountName: ${jupyter_single_user_sa_name} + allowPrivilegeEscalation: true + extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account + securityContext: + fsGroup: 100 + extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance + GRANT_SUDO: "yes" + NOTEBOOK_ARGS: "--allow-root" + CHOWN_HOME: "yes" + CHOWN_HOME_OPTS: "-R" + CHOWN_EXTRA: "/home/shared" + uid: 0 + fsGid: 0 + cmd: null + +# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html +scheduling: + userScheduler: + enabled: true + podPriority: + enabled: true + userPlaceholder: + enabled: false + replicas: 1 + userPods: + nodeAffinity: + matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner + +prePuller: + hook: + enabled: false + continuous: + # NOTE: if used with Karpenter, also add user-placeholders + enabled: false + +global: + safeToShowValues: false diff --git a/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-dummy.yaml b/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-dummy.yaml new file mode 100755 index 000000000..3e67a06f4 --- /dev/null +++ b/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-dummy.yaml @@ -0,0 +1,245 @@ +hub: + db: + pvc: + storage: 50Gi + storageClassName: gp3 + authenticatePrometheus: false + +proxy: + https: + enabled: false + type: offload + service: + type: ClusterIP + # Disabled LoadBalancer type +# 
annotations: +# service.beta.kubernetes.io/aws-load-balancer-ssl-cert: "ssl_cert_arn" +# service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https" +# service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp" +# service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600" +# service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip +# service.beta.kubernetes.io/aws-load-balancer-scheme: internal +# service.beta.kubernetes.io/aws-load-balancer-type: external +# service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true' +# service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4 +singleuser: + startTimeout: 1200 # 20 mins to spin up a notebook server for GPU including the image pull + profileList: + - display_name: Data Engineering (CPU) + description: "PySpark Notebooks | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pyspark350: + display_name: "PySpark 3.5.0 + Python 3.11" + default: true + kubespawner_override: + image: jupyter/pyspark-notebook:spark-3.5.0 + pyspark341: + display_name: "PySpark 3.4.1 + Python 3.11" + kubespawner_override: + image: jupyter/pyspark-notebook:spark-3.4.1 + kubespawner_override: + node_selector: + NodePool: default + cpu_guarantee: 2 + mem_guarantee: 8G + cpu_limit: 4 + mem_limit: 8G + cmd: null + # NOTE: + - display_name: Trainium (trn1) + description: "Trainium | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pytorch1131: + display_name: "PyTorch 1.13.1 + torch-neuronx" + default: true + kubespawner_override: + image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest + tflow2101: + display_name: "Tensorflow 2.10.1 + tensorflow-neuronx" + kubespawner_override: + image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest + kubespawner_override: + node_selector: + NodePool: trainium + tolerations: + - key: aws.amazon.com/neuroncore + operator: Exists + effect: NoSchedule + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule + - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html + operator: "Equal" + value: "user" + effect: "NoSchedule" + cpu_guarantee: 2 + mem_guarantee: 10G + cpu_limit: 2 + mem_limit: 10G + extra_resource_limits: + aws.amazon.com/neuron: "1" + cmd: "start-singleuser.sh" + - display_name: Inferentia (inf2) + description: "Inferentia | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pytorch1131: + display_name: "PyTorch + torch-neuronx" + default: true + kubespawner_override: + image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest + tflow2101: + display_name: "Tensorflow + tensorflow-neuronx" + kubespawner_override: + image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest + kubespawner_override: + node_selector: + NodePool: inferentia + hub.jupyter.org/node-purpose: user + tolerations: + - key: aws.amazon.com/neuroncore + operator: Exists + effect: NoSchedule + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule + - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html + operator: "Equal" + value: "user" + effect: "NoSchedule" + cpu_guarantee: 20 + mem_guarantee: 100G + cpu_limit: 20 + mem_limit: 100G + extra_resource_limits: + aws.amazon.com/neuron: "1" + cmd: null + - display_name: Data Science (GPU + Time-Slicing - G5) + default: true + 
description: "GPU Time-Slicing with Single GPU VMs (G5 2x, 4x, 8x, 16x) | nvidia.com/gpu: 1 | Karpenter AutoScaling" + kubespawner_override: + # namespace: data-team-a + image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only + node_selector: + NodePool: gpu-ts # TIME-SLICING: Use this config with time-slicing mode + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html + operator: "Equal" + value: "user" + effect: "NoSchedule" + extra_resource_limits: + nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode + cpu_limit: 2 + mem_limit: 4G + cpu_guarantee: 2 + mem_guarantee: 4G + cmd: "start-singleuser.sh" + # Karpenter doesn't support requesting resources with MIG slices e.g., nvidia.com/mig-1g.5gb: 1, or nvidia.com/mig-2g.20gb: 1 etc. + # Hence, this profile relies on Managed node groups with GPU MIG enabled + - display_name: Data Science (GPU + MIG on P4d.24xlarge) + description: "GPU MIG with P4d instances | nvidia.com/mig-1g.5gb: 1 | Cluster Autoscaler" + kubespawner_override: + image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only + node_selector: + provisioner: cluster-autoscaler + node.kubernetes.io/instance-type: p4d.24xlarge + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html + operator: "Equal" + value: "user" + effect: "NoSchedule" + extra_resource_guarantees: + nvidia.com/mig-1g.5gb: 1 # or nvidia.com/mig-2g.10gb OR nvidia.com/mig-3g.20gb + # extra_resource_limits: + # nvidia.com/gpu: "8" # TIME-SLICING: Use a slice of GPU using time-slicing mode + cpu_guarantee: 2 + mem_guarantee: 10G + cpu_limit: 2 + mem_limit: 10G + cmd: "start-singleuser.sh" + - display_name: Data Science (GPU - P4d.24xlarge) + description: "GPU with P4d instances | Karpenter Autoscaler" + kubespawner_override: + image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only + node_selector: + node.kubernetes.io/instance-type: p4d.24xlarge + NodePool: gpu + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html + operator: "Equal" + value: "user" + effect: "NoSchedule" + extra_resource_limits: + nvidia.com/gpu: "8" + cpu_guarantee: 2 + mem_guarantee: 10G + cpu_limit: 2 + mem_limit: 10G + cmd: "start-singleuser.sh" + storage: + type: "static" + static: + pvcName: "efs-persist" + subPath: "home/{username}" + extraVolumes: + - name: jupyterhub-shared + persistentVolumeClaim: + claimName: efs-persist-shared + extraVolumeMounts: + - name: jupyterhub-shared + mountPath: /home/shared + readOnly: false + serviceAccountName: ${jupyter_single_user_sa_name} + allowPrivilegeEscalation: true + extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account + securityContext: + fsGroup: 100 + extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance + GRANT_SUDO: "yes" + NOTEBOOK_ARGS: "--allow-root" + CHOWN_HOME: "yes" + CHOWN_HOME_OPTS: "-R" + CHOWN_EXTRA: "/home/shared" + uid: 0 + fsGid: 0 + cmd: null + +# Optimizations configured according to this doc 
https://z2jh.jupyter.org/en/latest/administrator/optimization.html +scheduling: + userScheduler: + enabled: true + podPriority: + enabled: true + userPlaceholder: + enabled: false + replicas: 1 + userPods: + nodeAffinity: + matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner + +prePuller: + hook: + enabled: false + continuous: + # NOTE: if used with Karpenter, also add user-placeholders + enabled: false + +global: + safeToShowValues: false diff --git a/ai-ml/jupyterhub/helm/karpenter-resources/Chart.yaml b/ai-ml/jupyterhub/helm/karpenter-resources/Chart.yaml new file mode 100644 index 000000000..0c3b8474a --- /dev/null +++ b/ai-ml/jupyterhub/helm/karpenter-resources/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +name: karpenter-resources +description: Helm chart for configuring custom resources for Karpenter on the cluster +version: 0.0.1 +appVersion: 0.0.1 diff --git a/ai-ml/jupyterhub/helm/karpenter-resources/templates/node-class.yaml b/ai-ml/jupyterhub/helm/karpenter-resources/templates/node-class.yaml new file mode 100644 index 000000000..4aa617dfa --- /dev/null +++ b/ai-ml/jupyterhub/helm/karpenter-resources/templates/node-class.yaml @@ -0,0 +1,55 @@ +--- +apiVersion: karpenter.k8s.aws/v1beta1 +kind: EC2NodeClass +metadata: + name: {{ .Values.name }} +spec: + amiFamily: {{ .Values.amiFamily }} + subnetSelectorTerms: + - tags: + karpenter.sh/discovery: {{ .Values.clusterName }} + kubernetes.io/role/internal-elb: "1" # Make sure that it will be scheduled on private subs + securityGroupSelectorTerms: + - tags: + Name: {{ .Values.clusterName }}-node + role: {{ .Values.karpenterRole }} + # Optional, propagates tags to underlying EC2 resources + tags: + Name: {{ .Values.name }} + metadataOptions: + httpEndpoint: enabled + httpProtocolIPv6: disabled + httpPutResponseHopLimit: 2 + httpTokens: required + blockDeviceMappings: + {{ if eq .Values.amiFamily "Ubuntu" }} + - deviceName: /dev/sda1 + {{ else }} + - deviceName: /dev/xvda + {{ end }} + ebs: + volumeSize: 200Gi + volumeType: gp3 + encrypted: true + deleteOnTermination: true + detailedMonitoring: true + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + # Configure NVMe volumes in RAID0 configuration + # https://github.com/awslabs/amazon-eks-ami/blob/056e31f8c7477e893424abce468cb32bbcd1f079/files/bootstrap.sh#L35C121-L35C126 + # Mount will be: /mnt/k8s-disks + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- \ No newline at end of file diff --git a/ai-ml/jupyterhub/helm/karpenter-resources/templates/node-pool.yaml b/ai-ml/jupyterhub/helm/karpenter-resources/templates/node-pool.yaml new file mode 100644 index 000000000..51b091b1a --- /dev/null +++ b/ai-ml/jupyterhub/helm/karpenter-resources/templates/node-pool.yaml @@ -0,0 +1,46 @@ +--- +apiVersion: karpenter.sh/v1beta1 +kind: NodePool +metadata: + name: {{ .Values.name }} +spec: + template: + metadata: + labels: + NodePool: {{ .Values.name }} + NodeGroupType: {{ .Values.name }} + hub.jupyter.org/node-purpose: user + spec: + nodeClassRef: + name: {{ .Values.name }} + {{- with .Values.taints }} + taints: + {{- toYaml . 
| nindent 8 }} + {{- end }} + requirements: + - key: "karpenter.k8s.aws/instance-family" + operator: In + {{- with .Values.instanceFamilies }} + values: + {{- toYaml . | nindent 12 }} + {{- end }} + - key: "karpenter.k8s.aws/instance-size" + operator: In + {{- with .Values.instanceSizes }} + values: + {{- toYaml . | nindent 12 }} + {{- end }} + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + limits: + cpu: "1000" + memory: 1000Gi + weight: 10 \ No newline at end of file diff --git a/ai-ml/jupyterhub/helm/karpenter-resources/values.yaml b/ai-ml/jupyterhub/helm/karpenter-resources/values.yaml new file mode 100644 index 000000000..3a3160876 --- /dev/null +++ b/ai-ml/jupyterhub/helm/karpenter-resources/values.yaml @@ -0,0 +1,10 @@ +name: default +clusterName: +karpenterRole: +instanceSizes: ["xlarge", "2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"] +instanceFamilies: ["c5", "m5", "r5"] +taints: + - key: hub.jupyter.org/dedicated + value: "user" + effect: "NoSchedule" +amiFamily: AL2 diff --git a/ai-ml/jupyterhub/helm/kube-prometheus-stack/values.yaml b/ai-ml/jupyterhub/helm/kube-prometheus-stack/values.yaml new file mode 100644 index 000000000..1b13f6dec --- /dev/null +++ b/ai-ml/jupyterhub/helm/kube-prometheus-stack/values.yaml @@ -0,0 +1,80 @@ +prometheus: + prometheusSpec: + resources: + requests: + memory: 4Gi + cpu: 2 + retention: 5h + scrapeInterval: 30s + evaluationInterval: 30s + scrapeTimeout: 10s + storageSpec: + volumeClaimTemplate: + metadata: + name: data + spec: + storageClassName: gp3 + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi + # Scrape Cost metrics for Kubecost and JupyterHub add-ons + additionalScrapeConfigs: + - job_name: kubecost + honor_labels: true + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + dns_sd_configs: + - names: + - kubecost-cost-analyzer.kubecost.svc + type: 'A' + port: 9003 + - job_name: jupyterhub + honor_labels: true + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: /hub/metrics + scheme: http + dns_sd_configs: + - names: + - hub.jupyterhub.svc + type: 'A' + port: 8081 + - job_name: gpu-metrics + scrape_interval: 1m + metrics_path: /metrics + scheme: http + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - gpu-operator + relabel_configs: + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: kubernetes_node + +alertmanager: + enabled: false + +grafana: + enabled: true + defaultDashboardsEnabled: true + resources: + requests: + memory: 4Gi + cpu: 2 + sidecar: + datasources: + alertmanager: + enabled: false + +kube-state-metrics: + metricLabelsAllowlist: + # to select jupyterhub component pods and get the hub usernames + - pods=[app,component,hub.jupyter.org/username] + # allowing all labels is probably fine for nodes, since they don't churn much, unlike pods + - nodes=[*] diff --git a/ai-ml/jupyterhub/helm/kubecost/values.yaml b/ai-ml/jupyterhub/helm/kubecost/values.yaml new file mode 100644 index 000000000..0f9441497 --- /dev/null +++ b/ai-ml/jupyterhub/helm/kubecost/values.yaml @@ -0,0 +1,65 @@ + +# KubeCost WebUI -> kubectl port-forward --namespace kubecost deployment/kubecost-cost-analyzer 9090 + +global: + # pricingCsv: + # enabled: false + # location: + # provider: "AWS" + # region: "us-east-1" + # URI: 
s3://kc-csv-test/pricing_schema.csv # a valid file URI + # csvAccessCredentials: pricing-schema-access-secret + + # This Prometheus setup is reusing the existing Prometheus deployment + # Check for more docs under https://guide.kubecost.com/hc/en-us/articles/4407595941015 + prometheus: + fqdn: http://kube-prometheus-stack-prometheus.kube-prometheus-stack.svc:9090 + enabled: false + +# If you have node-exporter and/or KSM running on your cluster, follow this step to disable the Kubecost included versions. +prometheus: + nodeExporter: + enabled: false + serviceAccounts: + nodeExporter: + create: false + kubeStateMetrics: + enabled: false + +#imageVersion: prod-1.96.0 # commented to use the latest + +kubecostFrontend: + image: public.ecr.aws/kubecost/frontend + resources: + requests: + cpu: "200m" + memory: "512Mi" + +kubecostMetrics: + emitPodAnnotations: true + emitNamespaceAnnotations: true + +kubecostModel: + image: public.ecr.aws/kubecost/cost-model + resources: + requests: + cpu: "500m" + memory: "512Mi" + +# Set this to false if you're bringing your own service account. +#serviceAccount: +# create: false +# name: kubecost-cost-analyzer +# annotations: +# eks.amazonaws.com/role-arn: + +# Define persistence volume for cost-analyzer +persistentVolume: + size: 32Gi + dbSize: 32.0Gi + enabled: true # Note that setting this to false means configurations will be wiped out on pod restart. + storageClass: gp3 + # existingClaim: kubecost-cost-analyzer # a claim in the same namespace as kubecost + +grafana: + enabled: false diff --git a/ai-ml/jupyterhub/helm-values/metrics-server-values.yaml b/ai-ml/jupyterhub/helm/metrics-server/values.yaml similarity index 100% rename from ai-ml/jupyterhub/helm-values/metrics-server-values.yaml rename to ai-ml/jupyterhub/helm/metrics-server/values.yaml diff --git a/ai-ml/jupyterhub/helm-values/nvidia-values.yaml b/ai-ml/jupyterhub/helm/nvidia-gpu-operator/values.yaml similarity index 57% rename from ai-ml/jupyterhub/helm-values/nvidia-values.yaml rename to ai-ml/jupyterhub/helm/nvidia-gpu-operator/values.yaml index 140775867..4010a2d97 100644 --- a/ai-ml/jupyterhub/helm-values/nvidia-values.yaml +++ b/ai-ml/jupyterhub/helm/nvidia-gpu-operator/values.yaml @@ -31,8 +31,12 @@ operator: cpu: 200m memory: 100Mi +# mig.strategy should be set to mixed when MIG mode is not enabled on all GPUs on a node. +# Sets the Multi-Instance GPU (MIG) strategy to “mixed.” This means that the GPU can be partitioned into multiple instances with varying sizes, +# allowing different workloads to run concurrently on the same GPU. mig: - strategy: single + strategy: mixed + # https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/install-precompiled-signed-drivers.html # Currently NVIDIA Operator takes more than 5 mins to make the node GPU ready with all the required drivers. 
@@ -53,11 +57,11 @@ toolkit: devicePlugin: enabled: true config: - name: time-slicing-config-all - default: any + name: time-slicing-config-fine + default: nvidia-a10g create: true data: - any: |- + nvidia-a10g: |- version: v1 flags: migStrategy: none @@ -66,37 +70,67 @@ devicePlugin: resources: - name: nvidia.com/gpu replicas: 4 - + nvidia-a100g: |- + version: v1 + flags: + migStrategy: mixed + sharing: + timeSlicing: + resources: + - name: nvidia.com/gpu + replicas: 8 + - name: nvidia.com/mig-1g.5gb + replicas: 2 + - name: nvidia.com/mig-2g.10gb + replicas: 2 + - name: nvidia.com/mig-3g.20gb + replicas: 3 + - name: nvidia.com/mig-7g.40gb + replicas: 7 + + +# dcgm is a daemonset that runs on each node and collects GPU metrics. dcgm: - enabled: false + enabled: true +# dcgm-exporter is a deployment that runs on the cluster and exposes GPU metrics to Prometheus. dcgmExporter: enabled: true +# gfd is a daemonset that runs on each node and watches for changes to the GPU topology. gfd: enabled: true +# mig-manager is a deployment that runs on the cluster and manages MIG devices. migManager: enabled: true - + env: + - name: WITH_REBOOT + value: "true" # Indicates that the MIG manager can reboot the node if required. This is sometimes necessary when changing MIG configurations. nodeStatusExporter: enabled: false +# gds is a daemonset that runs on each node and exposes GPU metrics to Prometheus. gds: enabled: false +# vpuManager is a deployment that runs on the cluster and manages VPU devices. vgpuManager: enabled: false +# vgpuDevicePlugin is a daemonset that runs on each node and exposes VPU devices to the cluster. vgpuDeviceManager: enabled: true +# vfioManager is a deployment that runs on the cluster and manages VFIO devices. vfioManager: enabled: true +#sandboxDevicePlugin is a daemonset that runs on each node and exposes sandbox devices to the cluster. sandboxDevicePlugin: enabled: true +# nodeFeatureDiscovery is a daemonset that runs on each node and exposes node features to the cluster. node-feature-discovery: enableNodeFeatureApi: true worker: diff --git a/ai-ml/jupyterhub/helm/storageclass/Chart.yaml b/ai-ml/jupyterhub/helm/storageclass/Chart.yaml new file mode 100644 index 000000000..74c9ff0e4 --- /dev/null +++ b/ai-ml/jupyterhub/helm/storageclass/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +name: storage +description: Helm chart for storage options on the cluster +version: 0.0.1 +appVersion: 0.0.1 diff --git a/ai-ml/jupyterhub/helm/storageclass/templates/storage-class-gp3.yaml b/ai-ml/jupyterhub/helm/storageclass/templates/storage-class-gp3.yaml new file mode 100644 index 000000000..2269839e1 --- /dev/null +++ b/ai-ml/jupyterhub/helm/storageclass/templates/storage-class-gp3.yaml @@ -0,0 +1,11 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: gp3 + annotations: + storageclass.kubernetes.io/is-default-class: "true" +provisioner: ebs.csi.aws.com +parameters: + type: gp3 + csi.storage.k8s.io/fstype: ext4 + encrypted: "true" diff --git a/ai-ml/jupyterhub/install.sh b/ai-ml/jupyterhub/install.sh index 73e4e11e5..b87db5117 100755 --- a/ai-ml/jupyterhub/install.sh +++ b/ai-ml/jupyterhub/install.sh @@ -1,30 +1,5 @@ #!/bin/bash -read -p "Enter Jupyter Auth mechanism, accepted values are 'dummy' or 'cognito': " jupyter_auth - -acm_certificate_domain="" -jupyterhub_domain="" - -if [ "$jupyter_auth" == "cognito" ]; then - read -p "Enter domain name with wildcard and ensure ACM certificate is created for this domain name, e.g. 
*.example.com :" acm_certificate_domain - read -p "Enter sub-domain name for jupyterhub to be hosted, e.g. eks.example.com : " jupyterhub_domain -fi - -read -p "Enable GPU on Notebook instances? (y/n) " gpu_notebook - -if [ $gpu_notebook == "y" ]; then - gpu_notebook="gpu" -else - gpu_notebook="cpu" -fi - -# Print all variables above using echo -echo "Jupyter Auth mechanism: $jupyter_auth" -echo "ACM certificate domain: $acm_certificate_domain" -echo "Jupyterhub domain: $jupyterhub_domain" -echo "GPU Notebook: $gpu_notebook" - - echo "Initializing ..." terraform init || echo "\"terraform init\" failed" @@ -32,15 +7,13 @@ terraform init || echo "\"terraform init\" failed" targets=( "module.vpc" "module.eks" - "module.ebs_csi_driver_irsa" - "module.eks_blueprints_addons" ) # Apply modules in sequence for target in "${targets[@]}" do echo "Applying module $target..." - apply_output=$(terraform apply -target="$target" -var="acm_certificate_domain=$acm_certificate_domain" -var="jupyterhub_domain=$jupyterhub_domain" -var="jupyter_hub_auth_mechanism=$jupyter_auth" -var="jupyter_notebook_support=$gpu_notebook" -auto-approve 2>&1 | tee /dev/tty) + apply_output=$(terraform apply -target="$target" -auto-approve 2>&1 | tee /dev/tty) if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then echo "SUCCESS: Terraform apply of $target completed successfully" else @@ -51,7 +24,7 @@ done # Final apply to catch any remaining resources echo "Applying remaining resources..." -apply_output=$(terraform apply -var="acm_certificate_domain=$acm_certificate_domain" -var="jupyterhub_domain=$jupyterhub_domain" -var="jupyter_hub_auth_mechanism=$jupyter_auth" -var="jupyter_notebook_support=$gpu_notebook" -auto-approve 2>&1 | tee /dev/tty) +apply_output=$(terraform apply -auto-approve 2>&1 | tee /dev/tty) if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then echo "SUCCESS: Terraform apply of all modules completed successfully" else diff --git a/ai-ml/jupyterhub/jupyterhub.tf b/ai-ml/jupyterhub/jupyterhub.tf index fb909bfcd..3559d323e 100644 --- a/ai-ml/jupyterhub/jupyterhub.tf +++ b/ai-ml/jupyterhub/jupyterhub.tf @@ -47,50 +47,12 @@ resource "kubernetes_secret_v1" "jupyterhub_single_user" { type = "kubernetes.io/service-account-token" } -resource "kubectl_manifest" "storage_class_gp2" { - force_new = true - yaml_body = < https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html provider "aws" { diff --git a/ai-ml/jupyterhub/variables.tf b/ai-ml/jupyterhub/variables.tf index db64074ff..8e1f8b738 100755 --- a/ai-ml/jupyterhub/variables.tf +++ b/ai-ml/jupyterhub/variables.tf @@ -31,20 +31,13 @@ variable "secondary_cidr_blocks" { type = list(string) } - -# NOTE: You need to use private domain or public domain name with ACM certificate -# This website doc will show you how to create free public domain name with ACM certificate for testing purpose only +# NOTE: You need to use private domain or public domain name with ACM certificate +# Data-on-EKS website docs will show you how to create free public domain name with ACM certificate for testing purpose only # Example of public domain name(..com): eks.jupyter-doeks.dynamic-dns.com variable "jupyter_hub_auth_mechanism" { type = string description = "Allowed values: cognito, dummy" - default = "cognito" -} - -variable "jupyter_notebook_support" { - type = string - description = "Allowed values: cpu, gpu" - default = "cpu" + default = "dummy" } # Domain name is public so make sure you use a unique while deploying, Only 
needed if auth mechanism is set to cognito diff --git a/ai-ml/jupyterhub/versions.tf b/ai-ml/jupyterhub/versions.tf index 85abb2e37..9b6678a5f 100755 --- a/ai-ml/jupyterhub/versions.tf +++ b/ai-ml/jupyterhub/versions.tf @@ -12,15 +12,16 @@ terraform { } helm = { source = "hashicorp/helm" - version = ">= 2.4.1" - } - kubectl = { - source = "gavinbunney/kubectl" - version = ">= 1.14" + version = ">= 2.12.1" } random = { source = "hashicorp/random" version = "3.1.0" # Replace with the appropriate version of the random provider } + + archive = { + source = "hashicorp/archive" + version = "2.4.0" + } } } diff --git a/ai-ml/jupyterhub/vpc.tf b/ai-ml/jupyterhub/vpc.tf index e7e6473ee..59c3da89c 100755 --- a/ai-ml/jupyterhub/vpc.tf +++ b/ai-ml/jupyterhub/vpc.tf @@ -13,6 +13,9 @@ locals { #--------------------------------------------------------------- # VPC #--------------------------------------------------------------- +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. +# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" diff --git a/ai-ml/ray/terraform/main.tf b/ai-ml/ray/terraform/main.tf index f558df0ad..d6e9f5b49 100644 --- a/ai-ml/ray/terraform/main.tf +++ b/ai-ml/ray/terraform/main.tf @@ -92,8 +92,9 @@ module "eks" { source = "terraform-aws-modules/eks/aws" version = "~> 19.15" - cluster_name = local.name - cluster_version = local.cluster_version + cluster_name = local.name + cluster_version = local.cluster_version + #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing. cluster_endpoint_public_access = true vpc_id = module.vpc.vpc_id @@ -192,10 +193,12 @@ module "karpenter" { source = "terraform-aws-modules/eks/aws//modules/karpenter" version = "~> 19.15" - cluster_name = module.eks.cluster_name - irsa_oidc_provider_arn = module.eks.oidc_provider_arn - create_irsa = false # IRSA will be created by the kubernetes-addons module - iam_role_additional_policies = [module.karpenter_policy.arn] + cluster_name = module.eks.cluster_name + irsa_oidc_provider_arn = module.eks.oidc_provider_arn + create_irsa = false # IRSA will be created by the kubernetes-addons module + iam_role_additional_policies = { + additional_policy = module.karpenter_policy.arn + } tags = local.tags } diff --git a/ai-ml/ray/terraform/vpc.tf b/ai-ml/ray/terraform/vpc.tf index 1edfa9ee7..a39e745fb 100644 --- a/ai-ml/ray/terraform/vpc.tf +++ b/ai-ml/ray/terraform/vpc.tf @@ -2,6 +2,9 @@ # VPC #--------------------------------------------------------------- +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. 
+# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" diff --git a/ai-ml/trainium-inferentia/addons.tf b/ai-ml/trainium-inferentia/addons.tf index 4c48825f2..56cccf392 100644 --- a/ai-ml/trainium-inferentia/addons.tf +++ b/ai-ml/trainium-inferentia/addons.tf @@ -216,6 +216,27 @@ module "eks_blueprints_addons" { ], } + #--------------------------------------- + # AWS Load Balancer Controller Add-on + #--------------------------------------- + enable_aws_load_balancer_controller = true + # turn off the mutating webhook for services because we are using + # service.beta.kubernetes.io/aws-load-balancer-type: external + aws_load_balancer_controller = { + set = [{ + name = "enableServiceMutatorWebhook" + value = "false" + }] + } + + #--------------------------------------- + # Ingress Nginx Add-on + #--------------------------------------- + enable_ingress_nginx = true + ingress_nginx = { + values = [templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})] + } + tags = local.tags } @@ -224,12 +245,40 @@ module "eks_blueprints_addons" { #--------------------------------------------------------------- module "eks_data_addons" { source = "aws-ia/eks-data-addons/aws" - version = "~> 1.0" # ensure to update this to the latest/desired version + version = "~> 1.2" # ensure to update this to the latest/desired version oidc_provider_arn = module.eks.oidc_provider_arn enable_aws_neuron_device_plugin = true enable_aws_efa_k8s_device_plugin = true + #--------------------------------------- + # Volcano Scheduler for TorchX + #--------------------------------------- + enable_volcano = true + + #--------------------------------------- + # Kuberay Operator + #--------------------------------------- + enable_kuberay_operator = true + kuberay_operator_helm_config = { + version = "1.0.0-rc.0" + # Enabling Volcano as Batch scheduler for KubeRay Operator + values = [ + <<-EOT + batchScheduler: + enabled: true + EOT + ] + } + + enable_jupyterhub = true + jupyterhub_helm_config = { + values = [ + templatefile("${path.module}/helm-values/jupyterhub-values.yaml", { + jupyter_single_user_sa_name = kubernetes_service_account_v1.jupyterhub_single_user_sa.metadata[0].name + }) + ] + } } #--------------------------------------------------------------- @@ -239,49 +288,16 @@ data "http" "torchx_etcd_yaml" { url = "https://raw.githubusercontent.com/pytorch/torchx/main/resources/etcd.yaml" } -resource "kubectl_manifest" "torchx_etcd" { - yaml_body = <<-YAML - ${data.http.torchx_etcd_yaml.response_body} - YAML - - depends_on = [module.eks.eks_cluster_id] +data "kubectl_file_documents" "torchx_etcd_yaml" { + content = data.http.torchx_etcd_yaml.response_body } -#--------------------------------------------------------------- -# Volcano Schduler for TorchX -# NOTE: This will be replaced with Helm Chart deployment with eks_data_addons -#--------------------------------------------------------------- -data "http" "volcano_development_yaml" { - url = "https://raw.githubusercontent.com/volcano-sh/volcano/master/installer/volcano-development.yaml" -} - -resource "kubectl_manifest" "volcano" { - yaml_body = <<-YAML - ${data.http.volcano_development_yaml.response_body} - YAML - +resource "kubectl_manifest" "torchx_etcd" { + for_each = data.kubectl_file_documents.torchx_etcd_yaml.manifests + yaml_body 
= each.value depends_on = [module.eks.eks_cluster_id] } -#--------------------------------------------------------------- -# Create Volcano Queue once the Volcano add-on is installed -#--------------------------------------------------------------- -resource "kubectl_manifest" "volcano_queue" { - yaml_body = < /etc/apt/sources.list.d/neuron.list && \ + wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - && \ + apt-get update -y && \ + apt-get install aws-neuronx-dkms=2.* aws-neuronx-collectives=2.* aws-neuronx-runtime-lib=2.* aws-neuronx-tools=2.* -y + +# Switch back to jovyan user for Python package installations +USER jovyan + +# Set pip repository pointing to the Neuron repository and install required Python packages +RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com && \ + pip install wget awscli neuronx-cc==2.* torch-neuronx torchvision ipykernel environment_kernels transformers-neuronx sentencepiece transformers && \ + # Install new Jupyter Notebook kernel + python -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)" + +# Add Neuron path to PATH +ENV PATH /opt/aws/neuron/bin:$PATH diff --git a/ai-ml/trainium-inferentia/examples/docker/Dockerfile-tensorflow-neuronx b/ai-ml/trainium-inferentia/examples/docker/Dockerfile-tensorflow-neuronx new file mode 100644 index 000000000..4a18df903 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/docker/Dockerfile-tensorflow-neuronx @@ -0,0 +1,34 @@ +# Base image with Python 3.10 - https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/tensorflow/neuronx/ubuntu/tensorflow-neuronx-ubuntu22.html#setup-tensorflow-neuronx-u22 +FROM jupyter/base-notebook:python-3.10 + +# Maintainer label +LABEL maintainer="DoEKS" + +# Set environment variables to non-interactive (this prevents some prompts) +ENV DEBIAN_FRONTEND=non-interactive + +# Switch to root to add Neuron repo and install necessary packages +USER root + +# Install gnupg and other required packages +RUN apt-get update -y && \ + apt-get install -y gnupg git g++ + +RUN \ + . 
/etc/os-release && \ + echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/neuron.list && \ + wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - && \ + apt-get update -y && \ + apt-get install aws-neuronx-dkms=2.* aws-neuronx-collectives=2.* aws-neuronx-runtime-lib=2.* aws-neuronx-tools=2.* -y + +# Switch back to jovyan user for Python package installations +USER jovyan + +# Set pip repository pointing to the Neuron repository and install required Python packages +RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com && \ + pip install wget awscli neuronx-cc==2.* tensorflow-neuronx ipykernel environment_kernels && \ + # Install new Jupyter Notebook kernel + python -m ipykernel install --user --name aws_neuron_venv_tensorflow --display-name "Python (tensorflow-neuronx)" + +# Add Neuron path to PATH +ENV PATH /opt/aws/neuron/bin:$PATH diff --git a/ai-ml/trainium-inferentia/examples/gradio-ui/README.md b/ai-ml/trainium-inferentia/examples/gradio-ui/README.md new file mode 100644 index 000000000..acf603f56 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/gradio-ui/README.md @@ -0,0 +1,56 @@ +# Steps to Deploy Gradio on Your Mac + +## Pre-requisites +Deploy the `trainium-inferentia` blueprint using this [link](https://awslabs.github.io/data-on-eks/docs/blueprints/ai-ml/trainium) + +## Step 1: Execute Port Forward to the Llama2 Ray Service +First, execute a port forward to the Llama2 Ray Service using kubectl: + +```bash +kubectl port-forward svc/llama2-service 8000:8000 -n llama2 +``` + +## Step 2: Deploy Gradio WebUI Locally + +### 2.1. Create a Virtual Environment +Create a virtual environment for the Gradio application: + +```bash +cd ai-ml/trainium-inferentia/examples/gradio-ui +python3 -m venv .venv +source .venv/bin/activate +``` +### 2.2. Install Gradio WebUI app + +Install all the Gradio WebUI app dependencies with pip + +```bash +pip install gradio requests +``` + +### 2.3. Invoke the WebUI +Run the Gradio WebUI using the following command: + +NOTE: `gradio-app.py` refers to the port forward url. e.g., `service_name = "http://localhost:8000" ` + +```bash +python gradio-app.py +``` + +You should see output similar to the following: +```text +Using cache from '~/data-on-eks/ai-ml/trainium-inferentia/examples/gradio-ui/gradio_cached_examples/16' directory. If method or examples have changed since last caching, delete this folder to clear cache. + +Running on local URL: http://127.0.0.1:7860 + +To create a public link, set `share=True` in `launch()`. +``` + +### 2.4. Access the WebUI from Your Browser +Open your web browser and access the Gradio WebUI by navigating to the following URL: + +http://127.0.0.1:7860 + +![gradio-app-llama2-chat.png](Llama2.png) + +You should now be able to interact with the Gradio application from your local machine. 
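Before wiring up the UI, it can help to confirm that the port-forward from Step 1 is actually serving inference. This optional check is a sketch and not part of the blueprint; it assumes the Ray Serve application exposes the `/infer` route with a `sentence` query parameter, which is what `gradio-app.py` below calls.

```bash
# Optional sanity check against the port-forwarded Ray Serve endpoint.
# Assumes `kubectl port-forward svc/llama2-service 8000:8000 -n llama2` is still running.
curl -G "http://localhost:8000/infer" \
  --data-urlencode "sentence=What is Kubernetes?" \
  --max-time 180
```

If this returns generated text, the Gradio app should work against the same URL.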
diff --git a/ai-ml/trainium-inferentia/examples/gradio-ui/gradio-app-llama2-chat.png b/ai-ml/trainium-inferentia/examples/gradio-ui/gradio-app-llama2-chat.png new file mode 100644 index 000000000..9ffc99e30 Binary files /dev/null and b/ai-ml/trainium-inferentia/examples/gradio-ui/gradio-app-llama2-chat.png differ diff --git a/ai-ml/trainium-inferentia/examples/gradio-ui/gradio-app.py b/ai-ml/trainium-inferentia/examples/gradio-ui/gradio-app.py new file mode 100644 index 000000000..7b8f6f535 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/gradio-ui/gradio-app.py @@ -0,0 +1,58 @@ +import gradio as gr +import requests + +# Constants for model endpoint and service name +model_endpoint = "/infer" +# service_name = "http:///serve" +service_name = "http://localhost:8000" # Replace with your actual service name + + +# Function to generate text +def text_generation(message, history): + prompt = message + + # Create the URL for the inference + url = f"{service_name}{model_endpoint}" + + try: + # Send the request to the model service + response = requests.get(url, params={"sentence": prompt}, timeout=180) + response.raise_for_status() # Raise an exception for HTTP errors + + full_output = response.text + # Removing the original question from the output + answer_only = full_output.replace(prompt, "", 1).strip('["]?\n') + + # Safety filter to remove harmful or inappropriate content + answer_only = filter_harmful_content(answer_only) + return answer_only + except requests.exceptions.RequestException as e: + # Handle any request exceptions (e.g., connection errors) + return f"AI: Error: {str(e)}" + + +# Define the safety filter function (you can implement this as needed) +def filter_harmful_content(text): + # TODO: Implement a safety filter to remove any harmful or inappropriate content from the text + + # For now, simply return the text as-is + return text + + +# Define the Gradio ChatInterface +chat_interface = gr.ChatInterface( + text_generation, + chatbot=gr.Chatbot(line_breaks=True), + textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7), + title="Llama2 AI Chat", + description="Ask me any question", + theme="soft", + examples=["How many languages are in India", "What is Generative AI?"], + cache_examples=True, + retry_btn=None, + undo_btn="Delete Previous", + clear_btn="Clear", +) + +# Launch the ChatInterface +chat_interface.launch() diff --git a/ai-ml/trainium-inferentia/examples/jupyter-notebooks/llama2-chat-inf2.ipynb b/ai-ml/trainium-inferentia/examples/jupyter-notebooks/llama2-chat-inf2.ipynb new file mode 100644 index 000000000..a9b8b3942 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/jupyter-notebooks/llama2-chat-inf2.ipynb @@ -0,0 +1,714 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b2964d61-2f19-491f-96c2-adab2aff08b5", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com\n", + "Collecting transformers-neuronx\n", + " Using cached https://pip.repos.neuron.amazonaws.com/transformers-neuronx/transformers_neuronx-0.7.84-py3-none-any.whl (150 kB)\n", + "Collecting accelerate (from transformers-neuronx)\n", + " Downloading accelerate-0.24.0-py3-none-any.whl (260 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.0/261.0 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", + 
"\u001b[?25hRequirement already satisfied: torch-neuronx in /opt/conda/lib/python3.10/site-packages (from transformers-neuronx) (1.13.1.1.11.0)\n", + "Collecting transformers (from transformers-neuronx)\n", + " Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.7/7.7 MB\u001b[0m \u001b[31m37.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from accelerate->transformers-neuronx) (1.21.6)\n", + "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from accelerate->transformers-neuronx) (23.1)\n", + "Requirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from accelerate->transformers-neuronx) (5.9.5)\n", + "Requirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from accelerate->transformers-neuronx) (6.0)\n", + "Requirement already satisfied: torch>=1.10.0 in /opt/conda/lib/python3.10/site-packages (from accelerate->transformers-neuronx) (1.13.1)\n", + "Collecting huggingface-hub (from accelerate->transformers-neuronx)\n", + " Using cached huggingface_hub-0.18.0-py3-none-any.whl (301 kB)\n", + "Requirement already satisfied: torch-xla==1.13.1+torchneuronb in /opt/conda/lib/python3.10/site-packages (from torch-neuronx->transformers-neuronx) (1.13.1+torchneuronb)\n", + "Requirement already satisfied: libneuronxla==0.5.476 in /opt/conda/lib/python3.10/site-packages (from torch-neuronx->transformers-neuronx) (0.5.476)\n", + "Requirement already satisfied: protobuf<5 in /opt/conda/lib/python3.10/site-packages (from torch-neuronx->transformers-neuronx) (3.20.3)\n", + "Requirement already satisfied: aws-neuronx-runtime-discovery~=2.0 in /opt/conda/lib/python3.10/site-packages (from libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (2.9)\n", + "Requirement already satisfied: neuronx-cc~=2.0 in /opt/conda/lib/python3.10/site-packages (from libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (2.10.0.34+6c8792c6f)\n", + "Requirement already satisfied: boto3~=1.26 in /opt/conda/lib/python3.10/site-packages (from libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (1.28.54)\n", + "Requirement already satisfied: botocore~=1.29 in /opt/conda/lib/python3.10/site-packages (from libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (1.31.54)\n", + "Requirement already satisfied: typing-extensions in /opt/conda/lib/python3.10/site-packages (from torch>=1.10.0->accelerate->transformers-neuronx) (4.6.2)\n", + "Requirement already satisfied: nvidia-cuda-runtime-cu11==11.7.99 in /opt/conda/lib/python3.10/site-packages (from torch>=1.10.0->accelerate->transformers-neuronx) (11.7.99)\n", + "Requirement already satisfied: nvidia-cudnn-cu11==8.5.0.96 in /opt/conda/lib/python3.10/site-packages (from torch>=1.10.0->accelerate->transformers-neuronx) (8.5.0.96)\n", + "Requirement already satisfied: nvidia-cublas-cu11==11.10.3.66 in /opt/conda/lib/python3.10/site-packages (from torch>=1.10.0->accelerate->transformers-neuronx) (11.10.3.66)\n", + "Requirement already satisfied: nvidia-cuda-nvrtc-cu11==11.7.99 in /opt/conda/lib/python3.10/site-packages (from torch>=1.10.0->accelerate->transformers-neuronx) (11.7.99)\n", + "Requirement already satisfied: absl-py>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from 
torch-xla==1.13.1+torchneuronb->torch-neuronx->transformers-neuronx) (2.0.0)\n", + "Requirement already satisfied: cloud-tpu-client>=0.10.0 in /opt/conda/lib/python3.10/site-packages (from torch-xla==1.13.1+torchneuronb->torch-neuronx->transformers-neuronx) (0.10)\n", + "Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from nvidia-cublas-cu11==11.10.3.66->torch>=1.10.0->accelerate->transformers-neuronx) (67.7.2)\n", + "Requirement already satisfied: wheel in /opt/conda/lib/python3.10/site-packages (from nvidia-cublas-cu11==11.10.3.66->torch>=1.10.0->accelerate->transformers-neuronx) (0.40.0)\n", + "Collecting filelock (from transformers->transformers-neuronx)\n", + " Using cached filelock-3.12.4-py3-none-any.whl (11 kB)\n", + "Collecting regex!=2019.12.17 (from transformers->transformers-neuronx)\n", + " Using cached regex-2023.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)\n", + "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from transformers->transformers-neuronx) (2.31.0)\n", + "Collecting tokenizers<0.15,>=0.14 (from transformers->transformers-neuronx)\n", + " Using cached tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)\n", + "Collecting safetensors>=0.3.1 (from transformers->transformers-neuronx)\n", + " Using cached safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", + "Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.10/site-packages (from transformers->transformers-neuronx) (4.65.0)\n", + "Collecting fsspec>=2023.5.0 (from huggingface-hub->accelerate->transformers-neuronx)\n", + " Downloading fsspec-2023.10.0-py3-none-any.whl (166 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m166.4/166.4 kB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting huggingface-hub (from accelerate->transformers-neuronx)\n", + " Using cached huggingface_hub-0.17.3-py3-none-any.whl (295 kB)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->transformers->transformers-neuronx) (3.1.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->transformers->transformers-neuronx) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->transformers->transformers-neuronx) (1.26.16)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->transformers->transformers-neuronx) (2023.5.7)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /opt/conda/lib/python3.10/site-packages (from boto3~=1.26->libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (1.0.1)\n", + "Requirement already satisfied: s3transfer<0.7.0,>=0.6.0 in /opt/conda/lib/python3.10/site-packages (from boto3~=1.26->libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (0.6.2)\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /opt/conda/lib/python3.10/site-packages (from botocore~=1.29->libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (2.8.2)\n", + "Requirement already satisfied: google-api-python-client==1.8.0 in /opt/conda/lib/python3.10/site-packages (from cloud-tpu-client>=0.10.0->torch-xla==1.13.1+torchneuronb->torch-neuronx->transformers-neuronx) (1.8.0)\n", 
+ "Requirement already satisfied: oauth2client in /opt/conda/lib/python3.10/site-packages (from cloud-tpu-client>=0.10.0->torch-xla==1.13.1+torchneuronb->torch-neuronx->transformers-neuronx) (4.1.3)\n", + "Requirement already satisfied: httplib2<1dev,>=0.9.2 in /opt/conda/lib/python3.10/site-packages (from google-api-python-client==1.8.0->cloud-tpu-client>=0.10.0->torch-xla==1.13.1+torchneuronb->torch-neuronx->transformers-neuronx) (0.22.0)\n", + "Requirement already satisfied: google-auth>=1.4.1 in /opt/conda/lib/python3.10/site-packages (from google-api-python-client==1.8.0->cloud-tpu-client>=0.10.0->torch-xla==1.13.1+torchneuronb->torch-neuronx->transformers-neuronx) (2.23.0)\n", + "Requirement already satisfied: google-auth-httplib2>=0.0.3 in /opt/conda/lib/python3.10/site-packages (from google-api-python-client==1.8.0->cloud-tpu-client>=0.10.0->torch-xla==1.13.1+torchneuronb->torch-neuronx->transformers-neuronx) (0.1.1)\n", + "Requirement already satisfied: google-api-core<2dev,>=1.13.0 in /opt/conda/lib/python3.10/site-packages (from google-api-python-client==1.8.0->cloud-tpu-client>=0.10.0->torch-xla==1.13.1+torchneuronb->torch-neuronx->transformers-neuronx) (1.34.0)\n", + "Requirement already satisfied: six<2dev,>=1.6.1 in /opt/conda/lib/python3.10/site-packages (from google-api-python-client==1.8.0->cloud-tpu-client>=0.10.0->torch-xla==1.13.1+torchneuronb->torch-neuronx->transformers-neuronx) (1.16.0)\n", + "Requirement already satisfied: uritemplate<4dev,>=3.0.0 in /opt/conda/lib/python3.10/site-packages (from google-api-python-client==1.8.0->cloud-tpu-client>=0.10.0->torch-xla==1.13.1+torchneuronb->torch-neuronx->transformers-neuronx) (3.0.1)\n", + "Requirement already satisfied: neuronx-hwm==2.10.0.5 in /opt/conda/lib/python3.10/site-packages (from neuronx-cc~=2.0->libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (2.10.0.5+7b1976adf)\n", + "Requirement already satisfied: networkx<=2.6.3 in /opt/conda/lib/python3.10/site-packages (from neuronx-cc~=2.0->libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (2.6.3)\n", + "Requirement already satisfied: scipy<=1.7.3 in /opt/conda/lib/python3.10/site-packages (from neuronx-cc~=2.0->libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (1.7.3)\n", + "Requirement already satisfied: python-daemon>=2.2.4 in /opt/conda/lib/python3.10/site-packages (from neuronx-cc~=2.0->libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (3.0.1)\n", + "Requirement already satisfied: requests-unixsocket>=0.1.5 in /opt/conda/lib/python3.10/site-packages (from neuronx-cc~=2.0->libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (0.3.0)\n", + "Requirement already satisfied: islpy<=2023.1,>2021.1 in /opt/conda/lib/python3.10/site-packages (from neuronx-cc~=2.0->libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (2023.1)\n", + "Requirement already satisfied: pgzip>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from neuronx-cc~=2.0->libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (0.3.5)\n", + "Requirement already satisfied: ec2-metadata<=2.10.0 in /opt/conda/lib/python3.10/site-packages (from neuronx-cc~=2.0->libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (2.10.0)\n", + "Requirement already satisfied: docutils in /opt/conda/lib/python3.10/site-packages (from python-daemon>=2.2.4->neuronx-cc~=2.0->libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (0.16)\n", + "Requirement already satisfied: lockfile>=0.10 in /opt/conda/lib/python3.10/site-packages (from 
python-daemon>=2.2.4->neuronx-cc~=2.0->libneuronxla==0.5.476->torch-neuronx->transformers-neuronx) (0.12.2)\n", + "Requirement already satisfied: pyasn1>=0.1.7 in /opt/conda/lib/python3.10/site-packages (from oauth2client->cloud-tpu-client>=0.10.0->torch-xla==1.13.1+torchneuronb->torch-neuronx->transformers-neuronx) (0.5.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.0.5 in /opt/conda/lib/python3.10/site-packages (from oauth2client->cloud-tpu-client>=0.10.0->torch-xla==1.13.1+torchneuronb->torch-neuronx->transformers-neuronx) (0.3.0)\n", + "Requirement already satisfied: rsa>=3.1.4 in /opt/conda/lib/python3.10/site-packages (from oauth2client->cloud-tpu-client>=0.10.0->torch-xla==1.13.1+torchneuronb->torch-neuronx->transformers-neuronx) (4.7.2)\n", + "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.56.2 in /opt/conda/lib/python3.10/site-packages (from google-api-core<2dev,>=1.13.0->google-api-python-client==1.8.0->cloud-tpu-client>=0.10.0->torch-xla==1.13.1+torchneuronb->torch-neuronx->transformers-neuronx) (1.60.0)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from google-auth>=1.4.1->google-api-python-client==1.8.0->cloud-tpu-client>=0.10.0->torch-xla==1.13.1+torchneuronb->torch-neuronx->transformers-neuronx) (5.3.1)\n", + "Requirement already satisfied: pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2 in /opt/conda/lib/python3.10/site-packages (from httplib2<1dev,>=0.9.2->google-api-python-client==1.8.0->cloud-tpu-client>=0.10.0->torch-xla==1.13.1+torchneuronb->torch-neuronx->transformers-neuronx) (3.1.1)\n", + "Installing collected packages: safetensors, regex, fsspec, filelock, huggingface-hub, tokenizers, transformers, accelerate, transformers-neuronx\n", + "Successfully installed accelerate-0.24.0 filelock-3.12.4 fsspec-2023.10.0 huggingface-hub-0.17.3 regex-2023.10.3 safetensors-0.4.0 tokenizers-0.14.1 transformers-4.34.1 transformers-neuronx-0.7.84\n", + "Collecting sentencepiece\n", + " Using cached sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", + "Installing collected packages: sentencepiece\n", + "Successfully installed sentencepiece-0.1.99\n", + "Collecting jupyter\n", + " Using cached jupyter-1.0.0-py2.py3-none-any.whl (2.7 kB)\n", + "Collecting ipywidgets\n", + " Using cached ipywidgets-8.1.1-py3-none-any.whl (139 kB)\n", + "Requirement already satisfied: notebook in /opt/conda/lib/python3.10/site-packages (from jupyter) (6.5.4)\n", + "Collecting qtconsole (from jupyter)\n", + " Using cached qtconsole-5.4.4-py3-none-any.whl (121 kB)\n", + "Collecting jupyter-console (from jupyter)\n", + " Using cached jupyter_console-6.6.3-py3-none-any.whl (24 kB)\n", + "Requirement already satisfied: nbconvert in /opt/conda/lib/python3.10/site-packages (from jupyter) (7.4.0)\n", + "Requirement already satisfied: ipykernel in /opt/conda/lib/python3.10/site-packages (from jupyter) (6.23.1)\n", + "Requirement already satisfied: comm>=0.1.3 in /opt/conda/lib/python3.10/site-packages (from ipywidgets) (0.1.3)\n", + "Requirement already satisfied: ipython>=6.1.0 in /opt/conda/lib/python3.10/site-packages (from ipywidgets) (8.13.2)\n", + "Requirement already satisfied: traitlets>=4.3.1 in /opt/conda/lib/python3.10/site-packages (from ipywidgets) (5.9.0)\n", + "Collecting widgetsnbextension~=4.0.9 (from ipywidgets)\n", + " Using cached widgetsnbextension-4.0.9-py3-none-any.whl (2.3 MB)\n", + "Collecting jupyterlab-widgets~=3.0.9 (from ipywidgets)\n", + " Using 
cached jupyterlab_widgets-3.0.9-py3-none-any.whl (214 kB)\n", + "Requirement already satisfied: backcall in /opt/conda/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (0.2.0)\n", + "Requirement already satisfied: decorator in /opt/conda/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (5.1.1)\n", + "Requirement already satisfied: jedi>=0.16 in /opt/conda/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (0.18.2)\n", + "Requirement already satisfied: matplotlib-inline in /opt/conda/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (0.1.6)\n", + "Requirement already satisfied: pickleshare in /opt/conda/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (0.7.5)\n", + "Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /opt/conda/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (3.0.38)\n", + "Requirement already satisfied: pygments>=2.4.0 in /opt/conda/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (2.15.1)\n", + "Requirement already satisfied: stack-data in /opt/conda/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (0.6.2)\n", + "Requirement already satisfied: pexpect>4.3 in /opt/conda/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets) (4.8.0)\n", + "Requirement already satisfied: debugpy>=1.6.5 in /opt/conda/lib/python3.10/site-packages (from ipykernel->jupyter) (1.6.7)\n", + "Requirement already satisfied: jupyter-client>=6.1.12 in /opt/conda/lib/python3.10/site-packages (from ipykernel->jupyter) (8.2.0)\n", + "Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in /opt/conda/lib/python3.10/site-packages (from ipykernel->jupyter) (5.3.0)\n", + "Requirement already satisfied: nest-asyncio in /opt/conda/lib/python3.10/site-packages (from ipykernel->jupyter) (1.5.6)\n", + "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from ipykernel->jupyter) (23.1)\n", + "Requirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from ipykernel->jupyter) (5.9.5)\n", + "Requirement already satisfied: pyzmq>=20 in /opt/conda/lib/python3.10/site-packages (from ipykernel->jupyter) (25.0.2)\n", + "Requirement already satisfied: tornado>=6.1 in /opt/conda/lib/python3.10/site-packages (from ipykernel->jupyter) (6.3.2)\n", + "Requirement already satisfied: beautifulsoup4 in /opt/conda/lib/python3.10/site-packages (from nbconvert->jupyter) (4.12.2)\n", + "Requirement already satisfied: bleach in /opt/conda/lib/python3.10/site-packages (from nbconvert->jupyter) (6.0.0)\n", + "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.10/site-packages (from nbconvert->jupyter) (0.7.1)\n", + "Requirement already satisfied: jinja2>=3.0 in /opt/conda/lib/python3.10/site-packages (from nbconvert->jupyter) (3.1.2)\n", + "Requirement already satisfied: jupyterlab-pygments in /opt/conda/lib/python3.10/site-packages (from nbconvert->jupyter) (0.2.2)\n", + "Requirement already satisfied: markupsafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from nbconvert->jupyter) (2.1.2)\n", + "Requirement already satisfied: mistune<3,>=2.0.3 in /opt/conda/lib/python3.10/site-packages (from nbconvert->jupyter) (2.0.5)\n", + "Requirement already satisfied: nbclient>=0.5.0 in /opt/conda/lib/python3.10/site-packages (from nbconvert->jupyter) (0.8.0)\n", + "Requirement already satisfied: nbformat>=5.1 in /opt/conda/lib/python3.10/site-packages (from nbconvert->jupyter) (5.8.0)\n", + "Requirement already satisfied: 
pandocfilters>=1.4.1 in /opt/conda/lib/python3.10/site-packages (from nbconvert->jupyter) (1.5.0)\n", + "Requirement already satisfied: tinycss2 in /opt/conda/lib/python3.10/site-packages (from nbconvert->jupyter) (1.2.1)\n", + "Requirement already satisfied: argon2-cffi in /opt/conda/lib/python3.10/site-packages (from notebook->jupyter) (21.3.0)\n", + "Requirement already satisfied: ipython-genutils in /opt/conda/lib/python3.10/site-packages (from notebook->jupyter) (0.2.0)\n", + "Requirement already satisfied: Send2Trash>=1.8.0 in /opt/conda/lib/python3.10/site-packages (from notebook->jupyter) (1.8.2)\n", + "Requirement already satisfied: terminado>=0.8.3 in /opt/conda/lib/python3.10/site-packages (from notebook->jupyter) (0.17.1)\n", + "Requirement already satisfied: prometheus-client in /opt/conda/lib/python3.10/site-packages (from notebook->jupyter) (0.17.0)\n", + "Requirement already satisfied: nbclassic>=0.4.7 in /opt/conda/lib/python3.10/site-packages (from notebook->jupyter) (1.0.0)\n", + "Collecting qtpy>=2.4.0 (from qtconsole->jupyter)\n", + " Downloading QtPy-2.4.1-py3-none-any.whl (93 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m93.5/93.5 kB\u001b[0m \u001b[31m1.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: parso<0.9.0,>=0.8.0 in /opt/conda/lib/python3.10/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from jupyter-client>=6.1.12->ipykernel->jupyter) (2.8.2)\n", + "Requirement already satisfied: platformdirs>=2.5 in /opt/conda/lib/python3.10/site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel->jupyter) (3.5.1)\n", + "Requirement already satisfied: jupyter-server>=1.8 in /opt/conda/lib/python3.10/site-packages (from nbclassic>=0.4.7->notebook->jupyter) (2.6.0)\n", + "Requirement already satisfied: notebook-shim>=0.2.3 in /opt/conda/lib/python3.10/site-packages (from nbclassic>=0.4.7->notebook->jupyter) (0.2.3)\n", + "Requirement already satisfied: fastjsonschema in /opt/conda/lib/python3.10/site-packages (from nbformat>=5.1->nbconvert->jupyter) (2.17.1)\n", + "Requirement already satisfied: jsonschema>=2.6 in /opt/conda/lib/python3.10/site-packages (from nbformat>=5.1->nbconvert->jupyter) (4.17.3)\n", + "Requirement already satisfied: ptyprocess>=0.5 in /opt/conda/lib/python3.10/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets) (0.7.0)\n", + "Requirement already satisfied: wcwidth in /opt/conda/lib/python3.10/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.1.0->ipywidgets) (0.2.6)\n", + "Requirement already satisfied: argon2-cffi-bindings in /opt/conda/lib/python3.10/site-packages (from argon2-cffi->notebook->jupyter) (21.2.0)\n", + "Requirement already satisfied: soupsieve>1.2 in /opt/conda/lib/python3.10/site-packages (from beautifulsoup4->nbconvert->jupyter) (2.3.2.post1)\n", + "Requirement already satisfied: six>=1.9.0 in /opt/conda/lib/python3.10/site-packages (from bleach->nbconvert->jupyter) (1.16.0)\n", + "Requirement already satisfied: webencodings in /opt/conda/lib/python3.10/site-packages (from bleach->nbconvert->jupyter) (0.5.1)\n", + "Requirement already satisfied: executing>=1.2.0 in /opt/conda/lib/python3.10/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (1.2.0)\n", + "Requirement already satisfied: asttokens>=2.1.0 in /opt/conda/lib/python3.10/site-packages (from 
stack-data->ipython>=6.1.0->ipywidgets) (2.2.1)\n", + "Requirement already satisfied: pure-eval in /opt/conda/lib/python3.10/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (0.2.2)\n", + "Requirement already satisfied: attrs>=17.4.0 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert->jupyter) (23.1.0)\n", + "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert->jupyter) (0.19.3)\n", + "Requirement already satisfied: anyio>=3.1.0 in /opt/conda/lib/python3.10/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook->jupyter) (3.6.2)\n", + "Requirement already satisfied: jupyter-events>=0.6.0 in /opt/conda/lib/python3.10/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook->jupyter) (0.6.3)\n", + "Requirement already satisfied: jupyter-server-terminals in /opt/conda/lib/python3.10/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook->jupyter) (0.4.4)\n", + "Requirement already satisfied: overrides in /opt/conda/lib/python3.10/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook->jupyter) (7.3.1)\n", + "Requirement already satisfied: websocket-client in /opt/conda/lib/python3.10/site-packages (from jupyter-server>=1.8->nbclassic>=0.4.7->notebook->jupyter) (1.5.2)\n", + "Requirement already satisfied: cffi>=1.0.1 in /opt/conda/lib/python3.10/site-packages (from argon2-cffi-bindings->argon2-cffi->notebook->jupyter) (1.15.1)\n", + "Requirement already satisfied: idna>=2.8 in /opt/conda/lib/python3.10/site-packages (from anyio>=3.1.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook->jupyter) (3.4)\n", + "Requirement already satisfied: sniffio>=1.1 in /opt/conda/lib/python3.10/site-packages (from anyio>=3.1.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook->jupyter) (1.3.0)\n", + "Requirement already satisfied: pycparser in /opt/conda/lib/python3.10/site-packages (from cffi>=1.0.1->argon2-cffi-bindings->argon2-cffi->notebook->jupyter) (2.21)\n", + "Requirement already satisfied: python-json-logger>=2.0.4 in /opt/conda/lib/python3.10/site-packages (from jupyter-events>=0.6.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook->jupyter) (2.0.7)\n", + "Requirement already satisfied: pyyaml>=5.3 in /opt/conda/lib/python3.10/site-packages (from jupyter-events>=0.6.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook->jupyter) (6.0)\n", + "Requirement already satisfied: rfc3339-validator in /opt/conda/lib/python3.10/site-packages (from jupyter-events>=0.6.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook->jupyter) (0.1.4)\n", + "Requirement already satisfied: rfc3986-validator>=0.1.1 in /opt/conda/lib/python3.10/site-packages (from jupyter-events>=0.6.0->jupyter-server>=1.8->nbclassic>=0.4.7->notebook->jupyter) (0.1.1)\n", + "Collecting fqdn (from jsonschema>=2.6->nbformat>=5.1->nbconvert->jupyter)\n", + " Using cached fqdn-1.5.1-py3-none-any.whl (9.1 kB)\n", + "Collecting isoduration (from jsonschema>=2.6->nbformat>=5.1->nbconvert->jupyter)\n", + " Using cached isoduration-20.11.0-py3-none-any.whl (11 kB)\n", + "Requirement already satisfied: jsonpointer>1.13 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert->jupyter) (2.0)\n", + "Collecting uri-template (from jsonschema>=2.6->nbformat>=5.1->nbconvert->jupyter)\n", + " Using cached uri_template-1.3.0-py3-none-any.whl (11 kB)\n", + "Collecting webcolors>=1.11 (from 
jsonschema>=2.6->nbformat>=5.1->nbconvert->jupyter)\n", + " Using cached webcolors-1.13-py3-none-any.whl (14 kB)\n", + "Collecting arrow>=0.15.0 (from isoduration->jsonschema>=2.6->nbformat>=5.1->nbconvert->jupyter)\n", + " Using cached arrow-1.3.0-py3-none-any.whl (66 kB)\n", + "Collecting types-python-dateutil>=2.8.10 (from arrow>=0.15.0->isoduration->jsonschema>=2.6->nbformat>=5.1->nbconvert->jupyter)\n", + " Using cached types_python_dateutil-2.8.19.14-py3-none-any.whl (9.4 kB)\n", + "Installing collected packages: types-python-dateutil, widgetsnbextension, webcolors, uri-template, qtpy, jupyterlab-widgets, fqdn, arrow, isoduration, ipywidgets, qtconsole, jupyter-console, jupyter\n", + "Successfully installed arrow-1.3.0 fqdn-1.5.1 ipywidgets-8.1.1 isoduration-20.11.0 jupyter-1.0.0 jupyter-console-6.6.3 jupyterlab-widgets-3.0.9 qtconsole-5.4.4 qtpy-2.4.1 types-python-dateutil-2.8.19.14 uri-template-1.3.0 webcolors-1.13 widgetsnbextension-4.0.9\n", + "Collecting fastapi\n", + " Downloading fastapi-0.104.0-py3-none-any.whl (92 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.9/92.9 kB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting anyio<4.0.0,>=3.7.1 (from fastapi)\n", + " Using cached anyio-3.7.1-py3-none-any.whl (80 kB)\n", + "Collecting pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4 (from fastapi)\n", + " Using cached pydantic-2.4.2-py3-none-any.whl (395 kB)\n", + "Collecting starlette<0.28.0,>=0.27.0 (from fastapi)\n", + " Using cached starlette-0.27.0-py3-none-any.whl (66 kB)\n", + "Collecting typing-extensions>=4.8.0 (from fastapi)\n", + " Using cached typing_extensions-4.8.0-py3-none-any.whl (31 kB)\n", + "Requirement already satisfied: idna>=2.8 in /opt/conda/lib/python3.10/site-packages (from anyio<4.0.0,>=3.7.1->fastapi) (3.4)\n", + "Requirement already satisfied: sniffio>=1.1 in /opt/conda/lib/python3.10/site-packages (from anyio<4.0.0,>=3.7.1->fastapi) (1.3.0)\n", + "Collecting exceptiongroup (from anyio<4.0.0,>=3.7.1->fastapi)\n", + " Using cached exceptiongroup-1.1.3-py3-none-any.whl (14 kB)\n", + "Collecting annotated-types>=0.4.0 (from pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4->fastapi)\n", + " Using cached annotated_types-0.6.0-py3-none-any.whl (12 kB)\n", + "Collecting pydantic-core==2.10.1 (from pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4->fastapi)\n", + " Using cached pydantic_core-2.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)\n", + "Installing collected packages: typing-extensions, exceptiongroup, annotated-types, pydantic-core, anyio, starlette, pydantic, fastapi\n", + " Attempting uninstall: typing-extensions\n", + " Found existing installation: typing_extensions 4.6.2\n", + " Uninstalling typing_extensions-4.6.2:\n", + " Successfully uninstalled typing_extensions-4.6.2\n", + " Attempting uninstall: anyio\n", + " Found existing installation: anyio 3.6.2\n", + " Uninstalling anyio-3.6.2:\n", + " Successfully uninstalled anyio-3.6.2\n", + "Successfully installed annotated-types-0.6.0 anyio-3.7.1 exceptiongroup-1.1.3 fastapi-0.104.0 pydantic-2.4.2 pydantic-core-2.10.1 starlette-0.27.0 typing-extensions-4.8.0\n", + "Collecting ray\n", + " Using cached ray-2.7.1-cp310-cp310-manylinux2014_x86_64.whl (62.4 MB)\n", + "Collecting click>=7.0 (from ray)\n", + " Using cached click-8.1.7-py3-none-any.whl (97 kB)\n", + "Requirement already satisfied: filelock in 
/opt/conda/lib/python3.10/site-packages (from ray) (3.12.4)\n", + "Requirement already satisfied: jsonschema in /opt/conda/lib/python3.10/site-packages (from ray) (4.17.3)\n", + "Collecting msgpack<2.0.0,>=1.0.0 (from ray)\n", + " Using cached msgpack-1.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (530 kB)\n", + "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from ray) (23.1)\n", + "Requirement already satisfied: protobuf!=3.19.5,>=3.15.3 in /opt/conda/lib/python3.10/site-packages (from ray) (3.20.3)\n", + "Requirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from ray) (6.0)\n", + "Collecting aiosignal (from ray)\n", + " Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n", + "Collecting frozenlist (from ray)\n", + " Using cached frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (225 kB)\n", + "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from ray) (2.31.0)\n", + "Requirement already satisfied: numpy>=1.19.3 in /opt/conda/lib/python3.10/site-packages (from ray) (1.21.6)\n", + "Requirement already satisfied: attrs>=17.4.0 in /opt/conda/lib/python3.10/site-packages (from jsonschema->ray) (23.1.0)\n", + "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /opt/conda/lib/python3.10/site-packages (from jsonschema->ray) (0.19.3)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->ray) (3.1.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->ray) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->ray) (1.26.16)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->ray) (2023.5.7)\n", + "Installing collected packages: msgpack, frozenlist, click, aiosignal, ray\n", + "Successfully installed aiosignal-1.3.1 click-8.1.7 frozenlist-1.4.0 msgpack-1.0.7 ray-2.7.1\n", + "Package Version\n", + "----------------------------- -------------------\n", + "absl-py 2.0.0\n", + "accelerate 0.24.0\n", + "aiosignal 1.3.1\n", + "alembic 1.11.1\n", + "annotated-types 0.6.0\n", + "anyio 3.7.1\n", + "argon2-cffi 21.3.0\n", + "argon2-cffi-bindings 21.2.0\n", + "arrow 1.3.0\n", + "asttokens 2.2.1\n", + "async-generator 1.10\n", + "async-lru 2.0.2\n", + "attrs 23.1.0\n", + "aws-neuronx-runtime-discovery 2.9\n", + "awscli 1.29.54\n", + "Babel 2.12.1\n", + "backcall 0.2.0\n", + "backports.functools-lru-cache 1.6.4\n", + "beautifulsoup4 4.12.2\n", + "bleach 6.0.0\n", + "blinker 1.6.2\n", + "boltons 23.0.0\n", + "boto3 1.28.54\n", + "botocore 1.31.54\n", + "cachetools 5.3.1\n", + "certifi 2023.5.7\n", + "certipy 0.1.3\n", + "cffi 1.15.1\n", + "charset-normalizer 3.1.0\n", + "click 8.1.7\n", + "cloud-tpu-client 0.10\n", + "colorama 0.4.4\n", + "comm 0.1.3\n", + "conda 23.3.1\n", + "conda-package-handling 2.0.2\n", + "conda_package_streaming 0.8.0\n", + "cryptography 40.0.2\n", + "debugpy 1.6.7\n", + "decorator 5.1.1\n", + "defusedxml 0.7.1\n", + "docutils 0.16\n", + "ec2-metadata 2.10.0\n", + "entrypoints 0.4\n", + "environment-kernels 1.2.0\n", + "exceptiongroup 1.1.3\n", + "executing 1.2.0\n", + "fastapi 0.104.0\n", + "fastjsonschema 2.17.1\n", + "filelock 3.12.4\n", + "flit_core 3.9.0\n", + "fqdn 1.5.1\n", + "frozenlist 1.4.0\n", + "fsspec 
2023.10.0\n", + "google-api-core 1.34.0\n", + "google-api-python-client 1.8.0\n", + "google-auth 2.23.0\n", + "google-auth-httplib2 0.1.1\n", + "googleapis-common-protos 1.60.0\n", + "greenlet 2.0.2\n", + "httplib2 0.22.0\n", + "huggingface-hub 0.17.3\n", + "idna 3.4\n", + "importlib-metadata 6.6.0\n", + "importlib-resources 5.12.0\n", + "ipykernel 6.23.1\n", + "ipython 8.13.2\n", + "ipython-genutils 0.2.0\n", + "ipywidgets 8.1.1\n", + "islpy 2023.1\n", + "isoduration 20.11.0\n", + "jedi 0.18.2\n", + "Jinja2 3.1.2\n", + "jmespath 1.0.1\n", + "json5 0.9.5\n", + "jsonpatch 1.32\n", + "jsonpointer 2.0\n", + "jsonschema 4.17.3\n", + "jupyter 1.0.0\n", + "jupyter_client 8.2.0\n", + "jupyter-console 6.6.3\n", + "jupyter_core 5.3.0\n", + "jupyter-events 0.6.3\n", + "jupyter-lsp 2.1.0\n", + "jupyter_server 2.6.0\n", + "jupyter_server_terminals 0.4.4\n", + "jupyter-telemetry 0.1.0\n", + "jupyterhub 4.0.0\n", + "jupyterlab 4.0.1\n", + "jupyterlab-pygments 0.2.2\n", + "jupyterlab_server 2.22.1\n", + "jupyterlab-widgets 3.0.9\n", + "libmambapy 1.4.2\n", + "libneuronxla 0.5.476\n", + "lockfile 0.12.2\n", + "Mako 1.2.4\n", + "mamba 1.4.2\n", + "MarkupSafe 2.1.2\n", + "matplotlib-inline 0.1.6\n", + "mistune 2.0.5\n", + "msgpack 1.0.7\n", + "nbclassic 1.0.0\n", + "nbclient 0.8.0\n", + "nbconvert 7.4.0\n", + "nbformat 5.8.0\n", + "nest-asyncio 1.5.6\n", + "networkx 2.6.3\n", + "neuronx-cc 2.10.0.34+6c8792c6f\n", + "neuronx-hwm 2.10.0.5+7b1976adf\n", + "notebook 6.5.4\n", + "notebook_shim 0.2.3\n", + "numpy 1.21.6\n", + "nvidia-cublas-cu11 11.10.3.66\n", + "nvidia-cuda-nvrtc-cu11 11.7.99\n", + "nvidia-cuda-runtime-cu11 11.7.99\n", + "nvidia-cudnn-cu11 8.5.0.96\n", + "oauth2client 4.1.3\n", + "oauthlib 3.2.2\n", + "overrides 7.3.1\n", + "packaging 23.1\n", + "pamela 1.0.0\n", + "pandocfilters 1.5.0\n", + "parso 0.8.3\n", + "pexpect 4.8.0\n", + "pgzip 0.3.5\n", + "pickleshare 0.7.5\n", + "Pillow 10.0.1\n", + "pip 23.1.2\n", + "pkgutil_resolve_name 1.3.10\n", + "platformdirs 3.5.1\n", + "pluggy 1.0.0\n", + "prometheus-client 0.17.0\n", + "prompt-toolkit 3.0.38\n", + "protobuf 3.20.3\n", + "psutil 5.9.5\n", + "ptyprocess 0.7.0\n", + "pure-eval 0.2.2\n", + "pyasn1 0.5.0\n", + "pyasn1-modules 0.3.0\n", + "pycosat 0.6.4\n", + "pycparser 2.21\n", + "pycurl 7.45.1\n", + "pydantic 2.4.2\n", + "pydantic_core 2.10.1\n", + "Pygments 2.15.1\n", + "PyJWT 2.7.0\n", + "pyOpenSSL 23.1.1\n", + "pyparsing 3.1.1\n", + "pyrsistent 0.19.3\n", + "PySocks 1.7.1\n", + "python-daemon 3.0.1\n", + "python-dateutil 2.8.2\n", + "python-json-logger 2.0.7\n", + "pytz 2023.3\n", + "PyYAML 6.0\n", + "pyzmq 25.0.2\n", + "qtconsole 5.4.4\n", + "QtPy 2.4.1\n", + "ray 2.7.1\n", + "regex 2023.10.3\n", + "requests 2.31.0\n", + "requests-unixsocket 0.3.0\n", + "rfc3339-validator 0.1.4\n", + "rfc3986-validator 0.1.1\n", + "rsa 4.7.2\n", + "ruamel.yaml 0.17.29\n", + "ruamel.yaml.clib 0.2.7\n", + "s3transfer 0.6.2\n", + "safetensors 0.4.0\n", + "scipy 1.7.3\n", + "Send2Trash 1.8.2\n", + "sentencepiece 0.1.99\n", + "setuptools 67.7.2\n", + "six 1.16.0\n", + "sniffio 1.3.0\n", + "soupsieve 2.3.2.post1\n", + "SQLAlchemy 2.0.15\n", + "stack-data 0.6.2\n", + "starlette 0.27.0\n", + "terminado 0.17.1\n", + "tinycss2 1.2.1\n", + "tokenizers 0.14.1\n", + "tomli 2.0.1\n", + "toolz 0.12.0\n", + "torch 1.13.1\n", + "torch-neuronx 1.13.1.1.11.0\n", + "torch-xla 1.13.1+torchneuronb\n", + "torchvision 0.14.1\n", + "tornado 6.3.2\n", + "tqdm 4.65.0\n", + "traitlets 5.9.0\n", + "transformers 4.34.1\n", + "transformers-neuronx 0.7.84\n", + "types-python-dateutil 
2.8.19.14\n", + "typing_extensions 4.8.0\n", + "typing-utils 0.1.0\n", + "uri-template 1.3.0\n", + "uritemplate 3.0.1\n", + "urllib3 1.26.16\n", + "wcwidth 0.2.6\n", + "webcolors 1.13\n", + "webencodings 0.5.1\n", + "websocket-client 1.5.2\n", + "wget 3.2\n", + "wheel 0.40.0\n", + "widgetsnbextension 4.0.9\n", + "zipp 3.15.0\n", + "zstandard 0.19.0\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "!pip install --extra-index-url https://pip.repos.neuron.amazonaws.com transformers-neuronx\n", + "!pip install sentencepiece -U\n", + "!pip install --upgrade jupyter ipywidgets\n", + "!pip install fastapi\n", + "!pip install ray\n", + "%pip list" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "10b9b5ca-c66e-4ee1-adfc-5839f5939a0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "aws-neuronx-runtime-discovery 2.9\n", + "libneuronxla 0.5.476\n", + "neuronx-cc 2.10.0.34+6c8792c6f\n", + "neuronx-hwm 2.10.0.5+7b1976adf\n", + "torch-neuronx 1.13.1.1.11.0\n", + "torch-xla 1.13.1+torchneuronb\n", + "transformers-neuronx 0.7.84\n" + ] + } + ], + "source": [ + "!pip list | grep neuron" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4c2b6c93-c142-4827-97d3-007823f32fff", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "87ff0882e2e74b3b8c0f2ebe8d396c70", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/3 [00:00 this is for trn1.32xl. For inf2 instances, adjust 'tp_degree' accordingly\n", + "# I am using inf2.24x hence changed the tp_degree to 12\n", + "neuron_model = LlamaForSampling.from_pretrained(model_split_name, batch_size=1, tp_degree=12, amp='f16')\n", + "neuron_model.to_neuron()\n", + "\n", + "# construct a tokenizer and encode prompt text\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "11b1c47c-9393-498f-a2e8-7925e49df2e5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generated sequences [' What is data parallelism and explain with more details?\\n\\nData parallelism is a programming strategy used in parallel computing, where a single process is divided into smaller sub-processes, each of which operates on a separate portion of the data. This allows for the processing of large datasets in parallel, leading to increased performance and scalability.\\n\\nIn traditional parallel processing, a single process is divided into multiple threads or tasks, each of which operates on a subset of the data. However, this can lead to contention and synchronization issues, as each thread or task may need to access the same data.\\n\\nData parallelism avoids these issues by ensuring that each sub-process operates on a separate portion of the data, minimizing the need for synchronization and contention. This makes it particularly useful for processing large datasets that do not fit in the memory of a single processor.\\n\\nThere are several key benefits to using data parallelism:\\n\\n1. Scalability: Data parallelism allows for the parallel processing of large datasets, leading to increased performance and scalability.\\n2. Flexibility: Data parallelism can be applied to a wide range of data types and processing tasks, making it a flexible programming strategy.\\n3. 
Efficiency: By minimizing the need for synchronization and contention, data parallelism can lead to more efficient processing of large datasets.\\n4. Cost-effectiveness: Data parallelism can be more cost-effective than traditional parallel processing strategies, as it does not require the use of specialized hardware or software.\\n\\nThere are several challenges associated with data parallelism:\\n\\n1. Data size and distribution: Ensuring that the data is evenly distributed across the sub-processes can be challenging, particularly for large datasets.\\n2. Communication overhead: Synchronizing the sub-processes can lead to communication overhead, which can impact performance.\\n3. Load balancing: Ensuring that each sub-process is evenly loaded can be a challenge, particularly if the data is not evenly distributed.\\n4. Data consistency: Ensuring that the data remains consistent across the sub-processes can be a challenge, particularly if the data is being updated simultaneously.\\n\\nTo overcome these challenges, several techniques can be used, such as:\\n\\n1. Data partitioning: Partitioning the data into smaller chunks can help to ensure that the data is evenly distributed across the sub-processes.\\n2. Data locality: Ensuring that the data is located on the same node or partition as the compute resources can help to reduce communication overhead.\\n3. Load balancing algorithms: Algorithms such as randomized block-size scheduling can help to ensure that each sub-process is evenly loaded.\\n4. Data consistency protocols: Protocols such as distributed locking or versioning can help to ensure that the data remains consistent across the sub-processes.\\n\\nExamples of data parallelism can be found in various fields, such as:\\n\\n1. Distributed databases: Distributed databases such as Apache Cassandra and Amazon DynamoDB use data parallelism to process large amounts of data in parallel.\\n2. Machine learning: Machine learning algorithms such as Apache Spark and TensorFlow use data parallelism to train models on large datasets.\\n3. Scientific simulations: Scientific simulations such as weather modeling and molecular dynamics use data parallelism to process large amounts of data in parallel.\\n4. 
Data processing: Data processing tasks such as data integration and data warehousing use data parallelism to process large amounts of data in parallel.'] in 19.84203338623047 seconds\n", + "\n" + ] + } + ], + "source": [ + "prompt = \"What is data parallelism and explain with more details\"\n", + "input_ids = tokenizer.encode(prompt, return_tensors=\"pt\")\n", + "\n", + "# run inference with top-k sampling\n", + "with torch.inference_mode():\n", + " start = time.time()\n", + " generated_sequences = neuron_model.sample(input_ids, sequence_length=2048, top_k=50)\n", + " elapsed = time.time() - start\n", + "\n", + "generated_sequences = [tokenizer.decode(seq) for seq in generated_sequences]\n", + "print(f'generated sequences {generated_sequences} in {elapsed} seconds')\n", + "print()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d76b37e-e9b3-41a9-8b40-d3adb8ea8f92", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ai-ml/trainium-inferentia/examples/ray-serve/llama2-inf2/Dockerfile b/ai-ml/trainium-inferentia/examples/ray-serve/llama2-inf2/Dockerfile new file mode 100644 index 000000000..bbd8a93a4 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/ray-serve/llama2-inf2/Dockerfile @@ -0,0 +1,34 @@ +# docker buildx build --platform=linux/amd64 -t ray-serve-llama2:latest . +# https://hub.docker.com/layers/rayproject/ray-ml/2.7.1-py310-gpu/images/sha256-f84ecfc82d255ff9e23b8e40343a95655ec8e23a009633a183769edac6277186?context=explore +FROM rayproject/ray:2.7.1-py310 + +# Maintainer label +LABEL maintainer="DoEKS" + +# Set environment variables to non-interactive (this prevents some prompts) +ENV DEBIAN_FRONTEND=non-interactive + +# Switch to root to add Neuron repo and install necessary packages +USER root + +# Set up the Neuron repository and install Neuron packages +RUN . 
/etc/os-release && \ + sudo echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/neuron.list && \ + sudo wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - && \ + sudo apt-get update -y && \ + sudo apt-get install aws-neuronx-dkms aws-neuronx-collectives=2.* aws-neuronx-runtime-lib=2.* aws-neuronx-tools=2.* -y && \ + sudo apt-get clean + +# Switch back to a non-root user for the subsequent commands +USER $USER + +# Set pip repository pointing to the Neuron repository and install required Python packages +RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com && \ + pip install wget awscli regex neuronx-cc==2.* torch-neuronx torchvision transformers-neuronx sentencepiece transformers + +# Add Neuron path to PATH +ENV PATH /opt/aws/neuron/bin:$PATH + +WORKDIR /serve_app + +COPY ray_serve_llama2.py /serve_app/ray_serve_llama2.py diff --git a/ai-ml/trainium-inferentia/examples/ray-serve/llama2-inf2/README.md b/ai-ml/trainium-inferentia/examples/ray-serve/llama2-inf2/README.md new file mode 100644 index 000000000..3497864c4 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/ray-serve/llama2-inf2/README.md @@ -0,0 +1,69 @@ +# How to deploy Llama2 on Inferentia2 and EKS + +## Pre-requisites +Deploy the `trainium-inferentia` blueprint using this [link](https://awslabs.github.io/data-on-eks/docs/blueprints/ai-ml/trainium) + +## Step 1: Deploy RayServe Cluster + +To deploy the RayServe cluster with the `Llama2-13B` LLM on an `inf2.48xlarge` instance, run the following command: + +**IMPORTANT NOTE: RAY MODEL DEPLOYMENT CAN TAKE UP TO 8 TO 10 MINUTES** + +```bash +cd data-on-eks/ai-ml/trainium-inferentia/examples/ray-serve/llama2-inf2 +kubectl apply -f ray-service-llama2.yaml +``` + +This will deploy a RayServe cluster with a Ray head pod and one `inf2.48xlarge` worker instance. The `Llama2-13B` LLM will be loaded on the worker and will be available to serve inference requests. + +Once the RayServe cluster is deployed, you can start sending inference requests to it using the following steps: + +Get the NLB DNS name of the RayServe cluster by running the following command: + +```bash +kubectl get ingress llama2-ingress -n llama2 +``` + +Now, you can access the Ray Dashboard from the URL below, replacing `<NLB_DNS_NAME>` with the DNS name returned by the previous command: + + http://<NLB_DNS_NAME>/dashboard/#/serve + +## Step 2: Test the Llama2 Model + +To test the Llama2 model, append a query to the end of the URL below. This sends a GET request and returns the model's response: + + http://<NLB_DNS_NAME>/serve/infer?sentence=what is data parallelism and tensor parallelisma and the diffrences + + +You will see an output like this in your browser: + +```text +[ +"what is data parallelism and tensor parallelisma and the diffrences between them? + +Data parallelism and tensor parallelism are both techniques used to speed up machine learning training on large datasets using multiple GPUs or other parallel processing units. However, there are some key differences between them: + +Data parallelism: + +* In data parallelism, the same model is split across multiple devices (e.g., GPUs or Machines), and each device processes a portion of the input data. +* Each device computes the gradients for its own portion of the data and sends them back to the host, which aggregates them to update the model parameters.
+* Data parallelism is useful for training large models on large datasets, as it allows the model to be partitioned across multiple devices and processed in parallel. +* Popular deep learning frameworks such as TensorFlow, PyTorch, and Keras support data parallelism. + +Tensor parallelism: + +* In tensor parallelism, multiple devices (e.g., GPUs or TPUs) are used to compute multiple tensors (i.e., matrices) in parallel. +* Each device computes a subset of the tensor operations, and the results are combined to form the final output. +* Tensor parallelism is useful for training large models on large datasets, as it allows the model to be computed in parallel across multiple devices. +* Popular deep learning frameworks such as TensorFlow and PyTorch support tensor parallelism. + +Key differences: + +* Data parallelism is focused on dividing the input data across multiple devices, while tensor parallelism is focused on dividing the computational operations across multiple devices. +* In data parallelism, each device processes a portion of the input data, while in tensor parallelism, each device computes a subset of the tensor operations. +* Data parallelism is typically used for training large models on large datasets, while tensor parallelism is typically used for training large models on large datasets with a large number of parameters. + +In summary, data parallelism is a technique for speeding up machine learning training by dividing the input data across multiple devices, while tensor parallelism is a technique for speeding up machine learning training by dividing the computational operations across multiple devices. Both techniques are useful for training large models on large datasets, but they have different focuses and are used in different situations." 
+] +``` diff --git a/ai-ml/trainium-inferentia/examples/ray-serve/llama2-inf2/ray-service-llama2.yaml b/ai-ml/trainium-inferentia/examples/ray-serve/llama2-inf2/ray-service-llama2.yaml new file mode 100644 index 000000000..9fc6926e9 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/ray-serve/llama2-inf2/ray-service-llama2.yaml @@ -0,0 +1,135 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: llama2 + +--- +apiVersion: ray.io/v1alpha1 +kind: RayService +metadata: + name: llama2-service + namespace: llama2 +spec: + serviceUnhealthySecondThreshold: 900 + deploymentUnhealthySecondThreshold: 300 + serveConfig: + importPath: ray_serve_llama2:entrypoint # Specify the correct path to your Python script + runtimeEnv: | + env_vars: {"MODEL_ID": "NousResearch/Llama-2-13b-chat-hf"} # Replace with the appropriate model ID + + rayClusterConfig: + rayVersion: '2.7.1' + headGroupSpec: + serviceType: NodePort + headService: + metadata: + name: llama2-service + namespace: llama2 + rayStartParams: + dashboard-host: '0.0.0.0' + template: + spec: + containers: + - name: ray-head + image: public.ecr.aws/data-on-eks/ray-serve-inf2-llama2:latest # Image created using the Dockerfile attached in the folder + imagePullPolicy: Always # Ensure the image is always pulled when updated + lifecycle: + preStop: + exec: + command: [ "/bin/sh","-c","ray stop" ] + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + resources: + limits: + cpu: 4 + memory: 20Gi + requests: + cpu: 4 + memory: 20Gi + nodeSelector: + provisioner: default + workload: rayhead + volumes: + - name: ray-logs + emptyDir: {} + + workerGroupSpecs: + - groupName: inf2-worker-group + replicas: 1 + minReplicas: 1 + maxReplicas: 1 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: public.ecr.aws/data-on-eks/ray-serve-inf2-llama2:latest + imagePullPolicy: Always # Ensure the image is always pulled when updated + lifecycle: + preStop: + exec: + command: [ "/bin/sh","-c","ray stop" ] + resources: + limits: + cpu: "180" + memory: "700G" + aws.amazon.com/neuron: "12" + requests: + cpu: "180" + memory: "700G" + aws.amazon.com/neuron: "12" + nodeSelector: + karpenter.sh/provisioner-name: inferentia-inf2 + tolerations: + - key: aws.amazon.com/neuroncore + operator: Exists + effect: NoSchedule + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule + - key: "hub.jupyter.org/dedicated" + operator: "Equal" + value: "user" + effect: "NoSchedule" + + +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: llama2-ingress + namespace: llama2 + annotations: + nginx.ingress.kubernetes.io/rewrite-target: "/$1" +spec: + ingressClassName: nginx + rules: + - http: + paths: + # Ray Dashboard + - path: /dashboard/(.*) + pathType: ImplementationSpecific + backend: + service: + name: llama2-service + port: + number: 8265 + # Ray Serve + - path: /serve/(.*) + pathType: ImplementationSpecific + backend: + service: + name: llama2-service + port: + number: 8000 diff --git a/ai-ml/trainium-inferentia/examples/ray-serve/llama2-inf2/ray_serve_llama2.py b/ai-ml/trainium-inferentia/examples/ray-serve/llama2-inf2/ray_serve_llama2.py new file mode 100644 index 000000000..424120751 --- /dev/null +++ b/ai-ml/trainium-inferentia/examples/ray-serve/llama2-inf2/ray_serve_llama2.py @@ -0,0 +1,78 @@ +from fastapi import FastAPI +from ray import serve 
+import torch +import os +from transformers import AutoTokenizer, AutoModelForCausalLM + +app = FastAPI() + +# Define the Llama model and related parameters +llm_model = "NousResearch/Llama-2-13b-chat-hf" +llm_model_split = "llama-2-13b-chat-hf-split" +neuron_cores = 24 # inf2.24xlarge 6 Neurons (12 Neuron cores) and inf2.48xlarge 12 Neurons (24 Neuron cores) + + +# Define the APIIngress class responsible for handling inference requests +@serve.deployment(num_replicas=1) +@serve.ingress(app) +class APIIngress: + def __init__(self, llama_model_handle): + self.handle = llama_model_handle + + # Define an endpoint for inference + @app.get("/infer") + async def infer(self, sentence: str): + # Asynchronously perform inference using the provided sentence + ref = await self.handle.infer.remote(sentence) + # Await the result of the asynchronous inference and return it + result = await ref + return result + + +# Define the LlamaModel class responsible for managing the Llama language model +# Increase the number of replicas for the LlamaModel deployment. +# This will allow Ray Serve to handle more concurrent requests. +@serve.deployment( + ray_actor_options={ + "resources": {"neuron_cores": neuron_cores}, + "runtime_env": {"env_vars": {"NEURON_CC_FLAGS": "-O1"}}, + }, + autoscaling_config={"min_replicas": 1, "max_replicas": 2}, +) +class LlamaModel: + def __init__(self): + from transformers_neuronx.llama.model import LlamaForSampling + from transformers_neuronx.module import save_pretrained_split + + # Check if the model split exists locally, and if not, download it + if not os.path.exists(llm_model_split): + print(f"Saving model split for {llm_model} to local path {llm_model_split}") + self.model = AutoModelForCausalLM.from_pretrained(llm_model) + save_pretrained_split(self.model, llm_model_split) + else: + print(f"Using existing model split {llm_model_split}") + + print(f"Loading and compiling model {llm_model_split} for Neuron") + # Load and compile the Neuron-optimized Llama model + self.neuron_model = LlamaForSampling.from_pretrained(llm_model_split, + batch_size=1, + tp_degree=neuron_cores, + amp='f16') + self.neuron_model.to_neuron() + self.tokenizer = AutoTokenizer.from_pretrained(llm_model) + + # Define the method for performing inference with the Llama model + def infer(self, sentence: str): + # Tokenize the input sentence and encode it + input_ids = self.tokenizer.encode(sentence, return_tensors="pt") + # Perform inference with Neuron-optimized model + with torch.inference_mode(): + generated_sequences = self.neuron_model.sample(input_ids, + sequence_length=2048, + top_k=50) + # Decode the generated sequences and return the results + return [self.tokenizer.decode(seq, skip_special_tokens=True) for seq in generated_sequences] + + +# Create an entry point for the FastAPI application +entrypoint = APIIngress.bind(LlamaModel.bind()) diff --git a/ai-ml/trainium-inferentia/helm-values/ingress-nginx-values.yaml b/ai-ml/trainium-inferentia/helm-values/ingress-nginx-values.yaml new file mode 100644 index 000000000..22e48c7f9 --- /dev/null +++ b/ai-ml/trainium-inferentia/helm-values/ingress-nginx-values.yaml @@ -0,0 +1,11 @@ +controller: + service: + externalTrafficPolicy: "Local" + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing # Private Load Balancer can only be 
accessed within the VPC + targetPorts: + http: http + https: http diff --git a/ai-ml/trainium-inferentia/helm-values/jupyterhub-values.yaml b/ai-ml/trainium-inferentia/helm-values/jupyterhub-values.yaml new file mode 100644 index 000000000..1e726401c --- /dev/null +++ b/ai-ml/trainium-inferentia/helm-values/jupyterhub-values.yaml @@ -0,0 +1,139 @@ +hub: + extraConfig: + jupyterhub_config.py: |- + c.KubeSpawner.start_timeout = 1200 + +proxy: + https: + enabled: false + type: offload + service: + type: ClusterIP +singleuser: + startTimeout: 1200 # 20 mins to spin up a notebook server for GPU inlcuding the image pull + profileList: + - display_name: Trainium (trn1) + description: "Trainium | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pytorch1131: + display_name: "PyTorch 1.13.1 + torch-neuronx" + default: true + kubespawner_override: + image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest + tflow2101: + display_name: "Tensorflow 2.10.1 + tensorflow-neuronx" + kubespawner_override: + image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest + kubespawner_override: + node_selector: + karpenter.sh/provisioner-name: trainium-trn1 + hub.jupyter.org/node-purpose: user + tolerations: + - key: aws.amazon.com/neuroncore + operator: Exists + effect: NoSchedule + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule + - key: "hub.jupyter.org/dedicated" + operator: "Equal" + value: "user" + effect: "NoSchedule" + # trn1.32xlarge | 16 Neurons (32 cores) | 512 GB Accelerator memory | 128 vCPus and 512 GiB + cpu_guarantee: 100 + mem_guarantee: 450G + cpu_limit: 120 + mem_limit: 500G + extra_resource_limits: + aws.amazon.com/neuron: "16" + cmd: "start-singleuser.sh" + - display_name: Inferentia (inf2) + description: "Inferentia | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pytorch1131: + display_name: "PyTorch 1.13.1 + torch-neuronx" + default: true + kubespawner_override: + image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest + tflow2101: + display_name: "Tensorflow 2.10.1 + tensorflow-neuronx" + kubespawner_override: + image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest + kubespawner_override: + node_selector: + karpenter.sh/provisioner-name: inferentia-inf2 + hub.jupyter.org/node-purpose: user + tolerations: + - key: aws.amazon.com/neuroncore + operator: Exists + effect: NoSchedule + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule + - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html + operator: "Equal" + value: "user" + effect: "NoSchedule" + cpu_guarantee: 90 # 96 vCPU for inf2.24x large + mem_guarantee: 300G # 384Gib for inf2.24x large + cpu_limit: 90 + mem_limit: 300G + extra_resource_limits: + aws.amazon.com/neuron: "6" # 12 NeuronCores , 384 GB Memory, vCPU 192, Mem 768 GB + cmd: null + storage: + type: "static" + static: + pvcName: "efs-persist" + subPath: "home/{username}" + extraVolumes: + - name: jupyterhub-shared + persistentVolumeClaim: + claimName: efs-persist-shared + extraVolumeMounts: + - name: jupyterhub-shared + mountPath: /home/shared + readOnly: false + serviceAccountName: ${jupyter_single_user_sa_name} + allowPrivilegeEscalation: true + extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account + securityContext: + fsGroup: 100 + extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance 
+ GRANT_SUDO: "yes" + NOTEBOOK_ARGS: "--allow-root" + CHOWN_HOME: "yes" + CHOWN_HOME_OPTS: "-R" + CHOWN_EXTRA: "/home/shared" + uid: 0 + fsGid: 0 + cmd: null + +# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html +scheduling: + userScheduler: + enabled: true + podPriority: + enabled: true + userPlaceholder: + enabled: false + replicas: 1 + userPods: + nodeAffinity: + matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner + +prePuller: + hook: + enabled: false + continuous: + # NOTE: if used with Karpenter, also add user-placeholders + enabled: false + +global: + safeToShowValues: false diff --git a/ai-ml/trainium-inferentia/install.sh b/ai-ml/trainium-inferentia/install.sh index 0e7b7166d..b87db5117 100755 --- a/ai-ml/trainium-inferentia/install.sh +++ b/ai-ml/trainium-inferentia/install.sh @@ -7,7 +7,6 @@ terraform init || echo "\"terraform init\" failed" targets=( "module.vpc" "module.eks" - "module.eks_blueprints_addons" ) # Apply modules in sequence diff --git a/ai-ml/trainium-inferentia/jupyterhub.tf b/ai-ml/trainium-inferentia/jupyterhub.tf new file mode 100644 index 000000000..916c3dfb2 --- /dev/null +++ b/ai-ml/trainium-inferentia/jupyterhub.tf @@ -0,0 +1,161 @@ +#----------------------------------------------------------------------------------------- +# JupyterHub Single User IRSA, maybe that block could be incorporated in add-on registry +#----------------------------------------------------------------------------------------- +resource "kubernetes_namespace" "jupyterhub" { + metadata { + name = "jupyterhub" + } +} + +module "jupyterhub_single_user_irsa" { + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + + role_name = "${module.eks.cluster_name}-jupyterhub-single-user-sa" + + role_policy_arns = { + policy = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" # Policy needs to be defined based in what you need to give access to your notebook instances. 
+ } + + oidc_providers = { + main = { + provider_arn = module.eks.oidc_provider_arn + namespace_service_accounts = ["${kubernetes_namespace.jupyterhub.metadata[0].name}:jupyterhub-single-user"] + } + } +} + +resource "kubernetes_service_account_v1" "jupyterhub_single_user_sa" { + metadata { + name = "${module.eks.cluster_name}-jupyterhub-single-user" + namespace = kubernetes_namespace.jupyterhub.metadata[0].name + annotations = { "eks.amazonaws.com/role-arn" : module.jupyterhub_single_user_irsa.iam_role_arn } + } + + automount_service_account_token = true +} + +resource "kubernetes_secret_v1" "jupyterhub_single_user" { + metadata { + name = "${module.eks.cluster_name}-jupyterhub-single-user-secret" + namespace = kubernetes_namespace.jupyterhub.metadata[0].name + annotations = { + "kubernetes.io/service-account.name" = kubernetes_service_account_v1.jupyterhub_single_user_sa.metadata[0].name + "kubernetes.io/service-account.namespace" = kubernetes_namespace.jupyterhub.metadata[0].name + } + } + + type = "kubernetes.io/service-account-token" +} + +#--------------------------------------------------------------- +# EFS Filesystem for private volumes per user +# This will be replaced with Dynamic EFS provision using EFS CSI Driver +#--------------------------------------------------------------- +resource "aws_efs_file_system" "efs" { + creation_token = "efs-jupyter-single-user" + encrypted = true + + tags = local.tags +} + +resource "aws_efs_mount_target" "efs_mt" { + count = length(compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? subnet_id : null])) + + file_system_id = aws_efs_file_system.efs.id + subnet_id = element(compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? subnet_id : null]), count.index) + security_groups = [aws_security_group.efs.id] +} + +resource "aws_security_group" "efs" { + name = "${local.name}-efs" + description = "Allow inbound NFS traffic from private subnets of the VPC" + vpc_id = module.vpc.vpc_id + + ingress { + description = "Allow NFS 2049/tcp" + cidr_blocks = module.vpc.vpc_secondary_cidr_blocks + from_port = 2049 + to_port = 2049 + protocol = "tcp" + } + + tags = local.tags +} + +resource "kubectl_manifest" "pv" { + yaml_body = < ## Requirements @@ -45,7 +45,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ |------|-------------|------|---------|:--------:| | [create\_iam\_service\_linked\_role\_es](#input\_create\_iam\_service\_linked\_role\_es) | Whether to create `AWSServiceRoleForAmazonOpensearchService` service-linked role. Set it to `false` if the role already exists | `bool` | `true` | no | | [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.26"` | no | -| [enable\_vpc\_endpoints](#input\_enable\_vpc\_endpoints) | Enable VPC Endpoints | `string` | `false` | no | +| [enable\_vpc\_endpoints](#input\_enable\_vpc\_endpoints) | Enable VPC Endpoints | `bool` | `false` | no | | [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"datahub-on-eks"` | no | | [private\_subnets](#input\_private\_subnets) | Private Subnets CIDRs. 32766 Subnet1 and 16382 Subnet2 IPs per Subnet | `list(string)` |
["10.1.0.0/17", "10.1.128.0/18"] | no |
| [public\_subnets](#input\_public\_subnets) | Public Subnets CIDRs. 62 IPs per Subnet | `list(string)` | ["10.1.255.128/26", "10.1.255.192/26"]
| no | diff --git a/analytics/terraform/datahub-on-eks/datahub-addon/main.tf b/analytics/terraform/datahub-on-eks/datahub-addon/main.tf index f073877ba..6e19910ac 100644 --- a/analytics/terraform/datahub-on-eks/datahub-addon/main.tf +++ b/analytics/terraform/datahub-on-eks/datahub-addon/main.tf @@ -80,6 +80,45 @@ resource "kubernetes_secret" "datahub_rds_secret" { } } +resource "random_password" "auth_secrets" { + length = 32 + special = false + min_upper = 0 + min_lower = 1 + min_numeric = 1 +} + +resource "random_password" "auth_secrets_key" { + length = 32 + special = false + min_upper = 0 + min_lower = 1 + min_numeric = 1 +} + +resource "random_password" "auth_secrets_salt" { + length = 32 + special = false + min_upper = 0 + min_lower = 1 + min_numeric = 1 +} + +resource "kubernetes_secret" "datahub_auth_secrets" { + depends_on = [kubernetes_namespace.datahub] + metadata { + name = "datahub-auth-secrets" + namespace = local.datahub_namespace + } + + data = { + system_client_secret = random_password.auth_secrets.result + token_service_signing_key = random_password.auth_secrets_key.result + token_service_salt = random_password.auth_secrets_salt.result + } + +} + resource "helm_release" "prereq" { depends_on = [module.prereq] @@ -133,7 +172,7 @@ resource "helm_release" "prereq" { } resource "helm_release" "datahub" { - depends_on = [kubernetes_secret.datahub_es_secret, kubernetes_secret.datahub_rds_secret, helm_release.prereq] + depends_on = [kubernetes_secret.datahub_es_secret, kubernetes_secret.datahub_rds_secret, kubernetes_secret.datahub_auth_secrets, helm_release.prereq] name = try(var.datahub_helm_config["name"], local.datahub_name) repository = try(var.datahub_helm_config["repository"], local.datahub_repository) diff --git a/analytics/terraform/datahub-on-eks/datahub-addon/values/datahub_values.yaml b/analytics/terraform/datahub-on-eks/datahub-addon/values/datahub_values.yaml index a3d1234db..1d6c69df1 100644 --- a/analytics/terraform/datahub-on-eks/datahub-addon/values/datahub_values.yaml +++ b/analytics/terraform/datahub-on-eks/datahub-addon/values/datahub_values.yaml @@ -14,7 +14,7 @@ datahub-frontend: enabled: true annotations: kubernetes.io/ingress.class: alb - alb.ingress.kubernetes.io/scheme: internet-facing + alb.ingress.kubernetes.io/scheme: internal # Private Load Balancer can only be accessed within the VPC alb.ingress.kubernetes.io/target-type: instance alb.ingress.kubernetes.io/inbound-cidrs: 0.0.0.0/0 alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}]' @@ -50,6 +50,7 @@ elasticsearchSetupJob: enabled: true image: repository: linkedin/datahub-elasticsearch-setup + # tag: v0.8.44 podSecurityContext: fsGroup: 1000 securityContext: @@ -110,6 +111,12 @@ datahubUpgrade: # - name: my-image-name # image: my-image # imagePullPolicy: Always + annotations: + # This is what defines this resource as a hook. Without this line, the + # job is considered part of the release. + helm.sh/hook: pre-install,pre-upgrade + helm.sh/hook-weight: "-2" + helm.sh/hook-delete-policy: before-hook-creation cleanupJob: # Add extra sidecar containers to job pod extraSidecars: [] @@ -153,6 +160,12 @@ datahubSystemUpdate: # - name: my-image-name # image: my-image # imagePullPolicy: Always + annotations: + # This is what defines this resource as a hook. Without this line, the + # job is considered part of the release. 
+ helm.sh/hook: pre-install,pre-upgrade + helm.sh/hook-weight: "-4" + helm.sh/hook-delete-policy: before-hook-creation # prometheus-kafka-exporter: # enabled: false @@ -238,13 +251,10 @@ global: # secretKey: postgres-password datahub: + version: v0.10.4 - # force older version until the OpenSearch issue with latest version is resolved - version: v0.8.45 - - # disable systemUpdate due to bug in creating auth-secrets systemUpdate: - enabled: false + enabled: true gms: port: "8080" @@ -267,14 +277,14 @@ global: managed_ingestion: enabled: true - defaultCliVersion: "0.10.3" + defaultCliVersion: "0.10.4" metadata_service_authentication: enabled: true systemClientId: "__datahub_system" systemClientSecret: secretRef: "datahub-auth-secrets" - secretKey: "token_service_signing_key" + secretKey: "system_client_secret" tokenService: signingKey: secretRef: "datahub-auth-secrets" @@ -282,10 +292,14 @@ global: salt: secretRef: "datahub-auth-secrets" secretKey: "token_service_salt" - # Set to false if you'd like to provide your own auth secrets + # Set to false to use existing auth secrets provisionSecrets: - enabled: true - autoGenerate: true + enabled: false + autoGenerate: false + annotations: + # added to force secret generation before system update + helm.sh/hook: "pre-install,pre-upgrade" + helm.sh/hook-weight: "-6" # Only specify if autoGenerate set to false # secretValues: # secret: diff --git a/analytics/terraform/datahub-on-eks/variables.tf b/analytics/terraform/datahub-on-eks/variables.tf index 5fa1f2453..2889cace8 100644 --- a/analytics/terraform/datahub-on-eks/variables.tf +++ b/analytics/terraform/datahub-on-eks/variables.tf @@ -31,7 +31,7 @@ variable "vpc_cidr" { variable "enable_vpc_endpoints" { description = "Enable VPC Endpoints" default = false - type = string + type = bool } # Only two Subnets for with low IP range for internet access diff --git a/analytics/terraform/datahub-on-eks/vpc.tf b/analytics/terraform/datahub-on-eks/vpc.tf index 604297cd0..422d40394 100644 --- a/analytics/terraform/datahub-on-eks/vpc.tf +++ b/analytics/terraform/datahub-on-eks/vpc.tf @@ -1,6 +1,9 @@ #--------------------------------------------------------------- # Supporting Network Resources #--------------------------------------------------------------- +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. +# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" diff --git a/analytics/terraform/emr-eks-ack/README.md b/analytics/terraform/emr-eks-ack/README.md index 539dbf253..3c25c4395 100644 --- a/analytics/terraform/emr-eks-ack/README.md +++ b/analytics/terraform/emr-eks-ack/README.md @@ -1,7 +1,7 @@ # EMR on EKS with ACK Controller and Crossplane This pattern is used to deploy the EKS Cluster with EMR on EKS ACK Controllers and Crossplane. -Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/amazon-emr-on-eks/emr-eks-ack) to deploy this pattern and run sample tests. +Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/blueprints/amazon-emr-on-eks/emr-eks-ack) to deploy this pattern and run sample tests. 
## Requirements diff --git a/analytics/terraform/emr-eks-ack/vpc.tf b/analytics/terraform/emr-eks-ack/vpc.tf index 16ec547d7..ab607bb1b 100644 --- a/analytics/terraform/emr-eks-ack/vpc.tf +++ b/analytics/terraform/emr-eks-ack/vpc.tf @@ -1,6 +1,9 @@ #--------------------------------------------------------------- # VPC and Subnets #--------------------------------------------------------------- +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. +# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" diff --git a/analytics/terraform/emr-eks-fargate/README.md b/analytics/terraform/emr-eks-fargate/README.md index eea6b2675..d0dbbbf4d 100644 --- a/analytics/terraform/emr-eks-fargate/README.md +++ b/analytics/terraform/emr-eks-fargate/README.md @@ -1,6 +1,6 @@ # EMR on EKS with Apache YuniKorn Batch Scheduler -Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/amazon-emr-on-eks/emr-eks-fargate) to deploy this pattern and run sample tests. +Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/blueprints/amazon-emr-on-eks/emr-eks-fargate) to deploy this pattern and run sample tests. ## Requirements diff --git a/analytics/terraform/emr-eks-fargate/main.tf b/analytics/terraform/emr-eks-fargate/main.tf index 7d659dc9c..c0282389c 100644 --- a/analytics/terraform/emr-eks-fargate/main.tf +++ b/analytics/terraform/emr-eks-fargate/main.tf @@ -43,8 +43,9 @@ module "eks" { source = "terraform-aws-modules/eks/aws" version = "~> 19.15" - cluster_name = local.name - cluster_version = var.eks_cluster_version + cluster_name = local.name + cluster_version = var.eks_cluster_version + #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing. cluster_endpoint_public_access = true vpc_id = module.vpc.vpc_id diff --git a/analytics/terraform/emr-eks-fargate/vpc.tf b/analytics/terraform/emr-eks-fargate/vpc.tf index 1bc0a6c0f..bc7e385aa 100644 --- a/analytics/terraform/emr-eks-fargate/vpc.tf +++ b/analytics/terraform/emr-eks-fargate/vpc.tf @@ -1,6 +1,9 @@ #--------------------------------------------------------------- # Supporting Network Resources #--------------------------------------------------------------- +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. 
+# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" diff --git a/analytics/terraform/emr-eks-karpenter/README.md b/analytics/terraform/emr-eks-karpenter/README.md index 3cb7d0ae7..e82035e8b 100644 --- a/analytics/terraform/emr-eks-karpenter/README.md +++ b/analytics/terraform/emr-eks-karpenter/README.md @@ -1,6 +1,6 @@ # Scaling EMR on EKS Spark Jobs with Karpenter Autoscaler -Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/amazon-emr-on-eks/emr-eks-karpenter) to deploy this pattern and run sample tests. +Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/blueprints/amazon-emr-on-eks/emr-eks-karpenter) to deploy this pattern and run sample tests. ## Requirements @@ -55,6 +55,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | [aws_secretsmanager_secret_version.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret_version) | resource | | [aws_security_group.fsx](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource | | [kubectl_manifest.karpenter_provisioner](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | +| [kubectl_manifest.spark_monitor](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [kubectl_manifest.static_pv](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [kubectl_manifest.storage_class](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [kubernetes_cluster_role.spark_role](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role) | resource | @@ -74,6 +75,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source | | [aws_secretsmanager_secret_version.admin_password_version](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret_version) | data source | | [kubectl_path_documents.karpenter_provisioners](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/data-sources/path_documents) | data source | +| [kubectl_path_documents.spark_monitor](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/data-sources/path_documents) | data source | ## Inputs @@ -84,7 +86,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | [enable\_emr\_spark\_operator](#input\_enable\_emr\_spark\_operator) | Enable the Spark Operator to submit jobs with EMR Runtime | `bool` | `false` | no | | [enable\_fsx\_for\_lustre](#input\_enable\_fsx\_for\_lustre) | Deploys fsx for lustre addon, storage class and static FSx for Lustre filesystem for EMR | `bool` | `false` | no | | [enable\_kubecost](#input\_enable\_kubecost) | Enable Kubecost add-on | `bool` | `true` | no | -| [enable\_vpc\_endpoints](#input\_enable\_vpc\_endpoints) | Enable VPC Endpoints | `string` | `false` | no | +| [enable\_vpc\_endpoints](#input\_enable\_vpc\_endpoints) | Enable VPC Endpoints | `bool` 
| `false` | no | | [enable\_yunikorn](#input\_enable\_yunikorn) | Enable Apache YuniKorn Scheduler | `bool` | `false` | no | | [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"emr-eks-karpenter"` | no | | [region](#input\_region) | Region | `string` | `"us-west-2"` | no | diff --git a/analytics/terraform/emr-eks-karpenter/addons.tf b/analytics/terraform/emr-eks-karpenter/addons.tf index a44da2852..0ce629704 100644 --- a/analytics/terraform/emr-eks-karpenter/addons.tf +++ b/analytics/terraform/emr-eks-karpenter/addons.tf @@ -86,6 +86,11 @@ module "eks_blueprints_addons" { #--------------------------------------- enable_karpenter = true karpenter_enable_spot_termination = true + karpenter_node = { + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } karpenter = { repository_username = data.aws_ecrpublic_authorization_token.token.user_name repository_password = data.aws_ecrpublic_authorization_token.token.password @@ -174,6 +179,16 @@ module "eks_blueprints_addons" { tags = local.tags } +data "kubectl_path_documents" "spark_monitor" { + pattern = "${path.module}/emr-grafana-dashboard/spark-monitor.yaml" +} + +resource "kubectl_manifest" "spark_monitor" { + for_each = toset(data.kubectl_path_documents.spark_monitor.documents) + yaml_body = each.value + + depends_on = [module.eks_blueprints_addons] +} #--------------------------------------------------------------- # Data on EKS Kubernetes Addons #--------------------------------------------------------------- diff --git a/analytics/terraform/emr-eks-karpenter/emr-grafana-dashboard/emr-eks-grafana-dashboard.json b/analytics/terraform/emr-eks-karpenter/emr-grafana-dashboard/emr-eks-grafana-dashboard.json index ecac6cd8e..191b5e429 100644 --- a/analytics/terraform/emr-eks-karpenter/emr-grafana-dashboard/emr-eks-grafana-dashboard.json +++ b/analytics/terraform/emr-eks-karpenter/emr-grafana-dashboard/emr-eks-grafana-dashboard.json @@ -1,4 +1,59 @@ { + "__inputs": [ + { + "name": "PrometheusDataSource", + "label": "Your Data Source", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": [], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.4.7" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph (old)", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "state-timeline", + "name": "State timeline", + "version": "" + }, + { + "type": "panel", + "id": "text", + "name": "Text", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], "annotations": { "list": [ { @@ -21,16 +76,15 @@ "description": "Monitors spark and EKS metrics using Prometheus in Kubernetes", "editable": true, "fiscalYearStartMonth": 0, - "gnetId": 15435, + "gnetId": 18387, "graphTooltip": 1, - "id": 3, - "iteration": 1645997690519, + "id": null, + "iteration": 1680137406399, "links": [], "liveNow": false, "panels": [ { - "collapsed": true, - "datasource": null, + "collapsed": false, "gridPos": { "h": 1, "w": 24, @@ -38,1695 +92,1695 @@ "y": 0 }, "id": 150, - "panels": [ - { - "datasource": "${data_source}", - "description": "The phase state timeline of all job pods (driver, executor(s) and job-runner)\n\nIf this graph looks 
convoluted, try selecting less executors as it can only show so many pods at a time", - "fieldConfig": { - "defaults": { - "color": { - "mode": "fixed" - }, - "custom": { - "fillOpacity": 50, - "lineWidth": 1 - }, - "mappings": [ - { - "options": { - "1": { - "color": "gray", - "index": 0, - "text": "Unknown" - }, - "2": { - "color": "blue", - "index": 1, - "text": "Running" - }, - "3": { - "color": "yellow", - "index": 2, - "text": "Pending" - }, - "4": { - "color": "green", - "index": 3, - "text": "Succeeded" - }, - "5": { - "color": "red", - "index": 4, - "text": "Failed" - } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "purple", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 24, - "x": 0, - "y": 1 + "panels": [], + "title": "Pod State Timelines", + "type": "row" + }, + { + "datasource": { + "uid": "${PrometheusDataSource}" + }, + "description": "The phase state timeline of all job pods (driver, executor(s) and job-runner)\n\nIf this graph looks convoluted, try selecting less executors as it can only show so many pods at a time", + "fieldConfig": { + "defaults": { + "color": { + "mode": "fixed" }, - "id": 152, - "options": { - "alignValue": "left", - "legend": { - "displayMode": "list", - "placement": "right" - }, - "mergeValues": true, - "rowHeight": 0.7, - "showValue": "never", - "tooltip": { - "mode": "single" - } + "custom": { + "fillOpacity": 50, + "lineWidth": 1, + "spanNulls": false }, - "targets": [ - { - "exemplar": true, - "expr": "(\nscalar(kube_pod_status_phase{pod=\"$Pod\", phase=\"Failed\"}*5) + \nscalar(kube_pod_status_phase{pod=\"$Pod\", phase=\"Pending\"}*3) + \nscalar(kube_pod_status_phase{pod=\"$Pod\", phase=\"Running\"}*2) + \nscalar(kube_pod_status_phase{pod=\"$Pod\", phase=\"Succeeded\"}*4) + scalar(kube_pod_status_phase{pod=\"$Pod\", phase=\"Unknown\"}*1)\n)", - "hide": false, - "interval": "", - "legendFormat": "Driver", - "refId": "Driver" - }, - { - "exemplar": true, - "expr": "label_replace(\n(\n(\n(sum(kube_pod_status_phase{pod=~\"$executor\",phase=\"Pending\"}*3)by(pod)) + (sum(kube_pod_status_phase{pod=~\"$executor\",phase=\"Running\"}*2)by(pod)) +\n(sum(kube_pod_status_phase{pod=~\"$executor\",phase=\"Succeeded\"}*4)by(pod)) +\n(sum(kube_pod_status_phase{pod=~\"$executor\",phase=\"Unknown\"}*1)by(pod)) +\n(sum(kube_pod_status_phase{pod=~\"$executor\",phase=\"Failed\"}*5)by(pod))\n)\nand on (pod)\nkube_pod_info{created_by_name=\"$Pod\"}\n),\"executor_id\",\"$1\",\"pod\", \".*-exec-(.*)\"\n)\n>0\n", - "hide": false, - "interval": "", - "legendFormat": "Executor: {{executor_id}}", - "refId": "Executors" - }, - { - "exemplar": true, - "expr": "scalar(kube_pod_status_phase{pod=\"$job_runner_pod\", phase=\"Failed\"}*5) + \nscalar(kube_pod_status_phase{pod=\"$job_runner_pod\", phase=\"Pending\"}*3) + \nscalar(kube_pod_status_phase{pod=\"$job_runner_pod\", phase=\"Running\"}*2) + \nscalar(kube_pod_status_phase{pod=\"$job_runner_pod\", phase=\"Succeeded\"}*4) + scalar(kube_pod_status_phase{pod=\"$job_runner_pod\", phase=\"Unknown\"}*1)", - "hide": false, - "interval": "", - "legendFormat": "JobRunner", - "refId": "A" - } - ], - "title": "Phase State Timeline", - "transformations": [ + "mappings": [ { - "id": "filterByValue", "options": { - "filters": [ - { - "config": { - "id": "isNull", - "options": {} - }, - "fieldName": "Driver" - } - ], - "match": "any", - "type": "exclude" - } + "1": { + "color": "gray", + "index": 0, + "text": "Unknown" + }, + "2": { + "color": 
"blue", + "index": 1, + "text": "Running" + }, + "3": { + "color": "yellow", + "index": 2, + "text": "Pending" + }, + "4": { + "color": "green", + "index": 3, + "text": "Succeeded" + }, + "5": { + "color": "red", + "index": 4, + "text": "Failed" + } + }, + "type": "value" } ], - "type": "state-timeline" - } - ], - "title": "Pod State Timelines", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + } + }, + "overrides": [] + }, "gridPos": { - "h": 1, + "h": 10, "w": 24, "x": 0, "y": 1 }, - "id": 146, - "panels": [ + "id": 152, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "right" + }, + "mergeValues": true, + "rowHeight": 0.7, + "showValue": "never", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ { - "datasource": "${data_source}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "fillOpacity": 70, - "lineWidth": 0 - }, - "decimals": 0, - "mappings": [], - "noValue": "Not Running", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 0, - "y": 2 - }, - "id": 139, - "options": { - "alignValue": "left", - "legend": { - "displayMode": "hidden", - "placement": "bottom" - }, - "mergeValues": true, - "rowHeight": 0.9, - "showValue": "never", - "tooltip": { - "mode": "single" - } - }, - "targets": [ - { - "exemplar": true, - "expr": "(count(spark_info{emr_containers_amazonaws_com_virtual_cluster_id=\"$vc_id\"}>0)by(emr_containers_amazonaws_com_job_id))>0", - "interval": "", - "legendFormat": "Job: {{emr_containers_amazonaws_com_job_id}}", - "refId": "A" - } - ], - "title": "Jobs in Virtual Cluster", - "type": "state-timeline" + "exemplar": true, + "expr": "(\nscalar(kube_pod_status_phase{pod=\"$Pod\", container!=\"\", phase=\"Failed\"}*5) + \nscalar(kube_pod_status_phase{pod=\"$Pod\",container!=\"\", phase=\"Pending\"}*3) + \nscalar(kube_pod_status_phase{pod=\"$Pod\",container!=\"\", phase=\"Running\"}*2) + \nscalar(kube_pod_status_phase{pod=\"$Pod\",container!=\"\", phase=\"Succeeded\"}*4) + scalar(kube_pod_status_phase{pod=\"$Pod\",container!=\"\", phase=\"Unknown\"}*1)\n)", + "hide": false, + "interval": "", + "legendFormat": "Driver", + "refId": "Driver" }, { - "datasource": "${data_source}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "fillOpacity": 70, - "lineWidth": 0 - }, - "mappings": [], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byFrameRefID", - "options": "EKS" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] - }, + "exemplar": true, + "expr": "label_replace(\n(\n(\n(sum(kube_pod_status_phase{pod=~\"$executor\",container!=\"\",phase=\"Pending\"}*3)by(pod)) + (sum(kube_pod_status_phase{pod=~\"$executor\",container!=\"\",phase=\"Running\"}*2)by(pod)) +\n(sum(kube_pod_status_phase{pod=~\"$executor\",container!=\"\",phase=\"Succeeded\"}*4)by(pod)) +\n(sum(kube_pod_status_phase{pod=~\"$executor\",container!=\"\",phase=\"Unknown\"}*1)by(pod)) +\n(sum(kube_pod_status_phase{pod=~\"$executor\",container!=\"\",phase=\"Failed\"}*5)by(pod))\n)\nand on 
(pod)\nkube_pod_info{created_by_name=\"$Pod\"}\n),\"executor_id\",\"$1\",\"pod\", \".*-exec-(.*)\"\n)\n>0\n", + "hide": false, + "interval": "", + "legendFormat": "Executor: {{executor_id}}", + "refId": "Executors" + }, + { + "exemplar": true, + "expr": "scalar(kube_pod_status_phase{pod=\"$job_runner_pod\", container!=\"\", phase=\"Failed\"}*5) + \nscalar(kube_pod_status_phase{pod=\"$job_runner_pod\",container!=\"\", phase=\"Pending\"}*3) + \nscalar(kube_pod_status_phase{pod=\"$job_runner_pod\", container!=\"\", phase=\"Running\"}*2) + \nscalar(kube_pod_status_phase{pod=\"$job_runner_pod\",container!=\"\", phase=\"Succeeded\"}*4) + scalar(kube_pod_status_phase{pod=\"$job_runner_pod\", container!=\"\", phase=\"Unknown\"}*1)", + "hide": false, + "interval": "", + "legendFormat": "JobRunner", + "refId": "JobRunner" + } + ], + "title": "Phase State Timeline", + "transformations": [ + { + "id": "filterByValue", + "options": { + "filters": [ { - "matcher": { - "id": "byFrameRefID", - "options": "Spark" + "config": { + "id": "isNull", + "options": {} }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "rgba(110, 255, 48, 1)", - "mode": "fixed" - } - } - ] + "fieldName": "Driver" } - ] - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 2 - }, - "id": 115, - "options": { - "alignValue": "left", - "legend": { - "displayMode": "hidden", - "placement": "bottom" - }, - "mergeValues": true, - "rowHeight": 0.9, - "showValue": "never", - "tooltip": { - "mode": "single" - } - }, - "targets": [ - { - "exemplar": true, - "expr": "kube_pod_info{created_by_name=\"$job_id-spark-defaults\"}", - "hide": false, - "interval": "", - "legendFormat": "EKS Metrics", - "refId": "EKS" - }, - { - "exemplar": false, - "expr": "spark_info{emr_containers_amazonaws_com_job_id=\"$job_id\"}", - "hide": false, - "interval": "", - "legendFormat": "Spark Metrics", - "refId": "Spark" - }, - { - "exemplar": true, - "expr": "(count({name=~\".*$job_id.*\"}))+count({pod=\"$Pod\"})", - "hide": true, - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "title": "Metric Ingestion", - "type": "state-timeline" + ], + "match": "any", + "type": "exclude" + } } ], - "title": "Metric Ingestion", - "type": "row" + "type": "state-timeline" }, { - "collapsed": true, - "datasource": null, + "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 2 + "y": 11 }, - "id": 42, - "panels": [ - { - "datasource": null, - "editable": true, - "error": false, - "gridPos": { - "h": 3, - "w": 4, - "x": 0, - "y": 3 + "id": 146, + "panels": [], + "title": "Metric Ingestion", + "type": "row" + }, + { + "datasource": { + "uid": "${PrometheusDataSource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - "height": "1px", - "id": 34, - "isNew": true, - "links": [], - "options": { - "content": "# $Pod_ip", - "mode": "markdown" + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": false }, - "pluginVersion": "8.2.5", - "repeat": "Pod", - "repeatDirection": "v", - "style": { - "font-size": "72pt" + "decimals": 0, + "mappings": [], + "noValue": "Not Running", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] }, - "title": "Driver IP Address", - "transparent": true, - "type": "text" + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 139, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "hidden", + "placement": "bottom" }, + "mergeValues": 
true, + "rowHeight": 0.9, + "showValue": "never", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ { - "cacheTimeout": null, - "datasource": "${data_source}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" + "exemplar": true, + "expr": "(count(spark_info{emr_containers_amazonaws_com_virtual_cluster_id=\"$vc_id\"}>0)by(emr_containers_amazonaws_com_job_id))>0", + "interval": "", + "legendFormat": "Job: {{emr_containers_amazonaws_com_job_id}}", + "refId": "A" + } + ], + "title": "Jobs in Virtual Cluster", + "type": "state-timeline" + }, + { + "datasource": { + "uid": "${PrometheusDataSource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "noValue": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "EKS" }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 10, - "y": 3 + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] }, - "id": 98, - "interval": null, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + { + "matcher": { + "id": "byFrameRefID", + "options": "Spark" }, - "text": {}, - "textMode": "auto" + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "rgba(110, 255, 48, 1)", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 115, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "table", + "placement": "right" + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "never", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum(kube_pod_info{created_by_name=\"$Pod\",container!=\"\"}) by (created_by_name)", + "hide": false, + "interval": "", + "legendFormat": "Executor Progress", + "refId": "EKS" + }, + { + "exemplar": false, + "expr": "spark_info{emr_containers_amazonaws_com_job_id=\"$job_id\"}", + "hide": false, + "interval": "", + "legendFormat": "Driver Progress", + "refId": "Spark" + }, + { + "exemplar": true, + "expr": "(count({name=~\".*$job_id.*\"}))+count({pod=\"$Pod\"})", + "hide": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Metric Ingestion", + "type": "state-timeline" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 42, + "panels": [], + "title": "Driver Info", + "type": "row" + }, + { + "editable": true, + "error": false, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 19 + }, + "height": "1px", + "id": 34, + "isNew": true, + "links": [], + "options": { + "content": "# $Pod_ip", + "mode": "markdown" + }, + "pluginVersion": "8.4.7", + "repeat": "Pod", + "repeatDirection": "v", + "style": { + "font-size": "72pt" + }, + "title": "Driver IP Address", + 
"transparent": true, + "type": "text" + }, + { + "datasource": { + "uid": "${PrometheusDataSource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" }, - "pluginVersion": "8.2.5", - "targets": [ - { - "exemplar": true, - "expr": "max(kube_pod_container_resource_limits{container=\"spark-kubernetes-driver\",pod=\"$Pod\",resource=\"memory\"})", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 40 - }, + "mappings": [ { - "exemplar": true, - "expr": "kube_pod_container_resource_requests{container=\"spark-kubernetes-driver\",pod=\"$Pod\",resource=\"memory\"}", - "hide": true, - "interval": "", - "legendFormat": "", - "refId": "B" + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" } ], - "title": "Requested Driver Memory", - "transparent": true, - "type": "stat" - }, - { - "cacheTimeout": null, - "datasource": "${data_source}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 20, - "y": 3 - }, - "id": 99, - "interval": null, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" + { + "color": "red", + "value": 80 + } + ] }, - "pluginVersion": "8.2.5", - "targets": [ - { - "exemplar": true, - "expr": "max(kube_pod_container_resource_requests{container=\"spark-kubernetes-driver\",pod=\"$Pod\",resource=\"cpu\"})", - "hide": false, - "interval": "", - "legendFormat": "", - "refId": "B" - }, - { - "exemplar": true, - "expr": "kube_pod_container_resource_limits{container=\"spark-kubernetes-driver\",pod=\"$Pod\",resource=\"cpu\"}", - "format": "time_series", - "hide": true, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 40 - } + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 10, + "y": 19 + }, + "id": 98, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" ], - "title": "Requested Driver Cores", - "transparent": true, - "type": "stat" + "fields": "", + "values": false }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.4.7", + "targets": [ { - "datasource": "${data_source}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "fillOpacity": 70, - "lineWidth": 0 + "exemplar": true, + "expr": "max(kube_pod_container_resource_limits{container=\"spark-kubernetes-driver\",pod=\"$Pod\",resource=\"memory\"})", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 40 + }, + { + "exemplar": true, + "expr": 
"kube_pod_container_resource_requests{container=\"spark-kubernetes-driver\",pod=\"$Pod\",resource=\"memory\"}", + "hide": true, + "interval": "", + "legendFormat": "", + "refId": "B" + } + ], + "title": "Requested Driver Memory", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "uid": "${PrometheusDataSource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [ + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ { - "matcher": { - "id": "byName", - "options": "Running" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] + "color": "green", + "value": null }, { - "matcher": { - "id": "byName", - "options": "Succeeded" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } - } - ] - }, + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + "y": 19 + }, + "id": 99, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.4.7", + "targets": [ + { + "exemplar": true, + "expr": "max(kube_pod_container_resource_requests{container=\"spark-kubernetes-driver\",pod=\"$Pod\",resource=\"cpu\"})", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "B" + }, + { + "exemplar": true, + "expr": "kube_pod_container_resource_limits{container=\"spark-kubernetes-driver\",pod=\"$Pod\",resource=\"cpu\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 40 + } + ], + "title": "Requested Driver Cores", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "uid": "${PrometheusDataSource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "matcher": { - "id": "byName", - "options": "Failed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] + "color": "green", + "value": null }, { - "matcher": { - "id": "byName", - "options": "Pending" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "yellow", - "mode": "fixed" - } - } - ] - }, + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Running" + }, + "properties": [ { - "matcher": { - "id": "byName", - "options": "Unknown" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "purple", - "mode": "fixed" - } - } - ] + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } } ] }, - "gridPos": { - "h": 5, - "w": 24, - "x": 0, - "y": 6 + { + "matcher": { + "id": "byName", + "options": "Succeeded" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] }, - 
"id": 95, - "options": { - "alignValue": "left", - "legend": { - "displayMode": "hidden", - "placement": "bottom" + { + "matcher": { + "id": "byName", + "options": "Failed" }, - "mergeValues": true, - "rowHeight": 0.9, - "showValue": "never", - "tooltip": { - "mode": "single" - } + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] }, - "pluginVersion": "8.0.5", - "targets": [ - { - "exemplar": true, - "expr": "kube_pod_status_phase{pod=\"$Pod\"}>0", - "interval": "", - "legendFormat": "{{phase}}", - "refId": "A" - } - ], - "title": "Driver Status", - "transparent": true, - "type": "state-timeline" + { + "matcher": { + "id": "byName", + "options": "Pending" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unknown" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 95, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "hidden", + "placement": "bottom" + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "never", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.0.5", + "targets": [ + { + "exemplar": true, + "expr": "kube_pod_status_phase{pod=\"$Pod\",container!=\"\"}>0", + "interval": "", + "legendFormat": "{{phase}}", + "refId": "A" } ], - "title": "Driver Info", - "type": "row" + "title": "Driver Status", + "transparent": true, + "type": "state-timeline" }, { - "collapsed": true, - "datasource": null, + "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 3 + "y": 27 }, "id": 45, - "panels": [ - { - "datasource": "${data_source}", - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "rgba(217, 119, 192, 1)", - "mode": "fixed" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 100, - "gradientMode": "opacity", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "panels": [], + "title": "Driver CPU / Memory usage", + "type": "row" + }, + { + "datasource": { + "uid": "${PrometheusDataSource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgba(217, 119, 192, 1)", + "mode": "fixed" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 100, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 
8, - "w": 12, - "x": 0, - "y": 4 - }, - "id": 25, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "max", - "last" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } + { + "color": "red", + "value": 80 + } + ] }, - "pluginVersion": "8.0.5", - "repeat": "Pod", - "repeatDirection": "v", - "targets": [ - { - "exemplar": true, - "expr": "sum (container_memory_working_set_bytes{pod=\"$Pod\"})", - "format": "time_series", - "hide": true, - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "$Pod", - "metric": "container_memory_usage:sort_desc", - "refId": "A", - "step": 10 - }, - { - "exemplar": true, - "expr": "(\navg(container_memory_working_set_bytes{pod=\"$Pod\",container!=\"\",container!=\"POD\"}) /\navg(container_spec_memory_limit_bytes{pod=\"$Pod\",container!=\"\",container!=\"POD\"})\n)", - "hide": false, - "interval": "", - "legendFormat": "Memory", - "refId": "B" - } + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 25, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "last" ], - "timeFrom": null, - "timeShift": null, - "title": "Driver Memory Usage", - "type": "timeseries" + "displayMode": "table", + "placement": "bottom" }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.0.5", + "repeat": "Pod", + "repeatDirection": "v", + "targets": [ { - "datasource": "${data_source}", - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "blue", - "mode": "fixed" - }, - "custom": { - "axisLabel": "CPU Time", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 74, - "gradientMode": "opacity", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 - }, - "id": 17, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "max", - "last" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.0.5", - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"^$Pod$\"} [5m])) by (pod) ", - "format": "time_series", - "hide": false, - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "Used CPU : {{pod}}", - "metric": "container_cpu", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(container_cpu_usage_seconds_total{pod=~\"^$Pod$\", container_name!=\"POD\"}) by (container_name)", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "refId": "B" - }, - { - "expr": "sum(container_spec_cpu_shares{pod=\"$Pod\", container_name!=\"POD\"}) by (container_name) / 1000", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "legendFormat": "Requested CPU", - "refId": "C" - }, - { - "exemplar": true, - "expr": "", - "hide": false, - "interval": "", - "legendFormat": "", - "refId": "D" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Driver 
CPU usage", - "type": "timeseries" + "exemplar": true, + "expr": "sum (container_memory_working_set_bytes{pod=\"$Pod\",container!=\"\"})", + "format": "time_series", + "hide": true, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Pod", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + }, + { + "exemplar": true, + "expr": "(\navg(container_memory_working_set_bytes{pod=\"$Pod\",container!=\"\",container!=\"POD\"}) /\navg(kube_pod_container_resource_limits{pod=\"$Pod\",container!=\"\",container!=\"POD\",resource=\"memory\"})\n)", + "hide": false, + "interval": "", + "legendFormat": "Memory", + "refId": "B" } ], - "title": "Driver CPU / Memory usage", - "type": "row" + "title": "Driver Memory Usage", + "type": "timeseries" }, { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 4 + "datasource": { + "uid": "${PrometheusDataSource}" }, - "id": 46, - "panels": [ - { - "datasource": "${data_source}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "max": 1, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "custom": { + "axisLabel": "CPU Time", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 74, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 5 - }, - "id": 28, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "max", - "last" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } + { + "color": "red", + "value": 80 + } + ] }, - "pluginVersion": "8.0.5", - "targets": [ - { - "exemplar": true, - "expr": "sum (container_memory_working_set_bytes{pod!=\"\",pod=~\"$executor\",container!=\"POD\",container!=\"\"} and on (pod) kube_pod_info{created_by_name=\"$Pod\"}) by (pod)", - "format": "time_series", - "hide": true, - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "{{pod}} ", - "metric": "container_memory_usage:sort_desc", - "refId": "A", - "step": 10 - }, - { - "exemplar": true, - "expr": "avg(\n(avg(container_memory_working_set_bytes{pod!=\"\",pod=~\"$executor\",container!=\"POD\",container!=\"\"})by (pod)) / on (pod)\n(avg(container_spec_memory_limit_bytes{pod!=\"\",pod=~\"$executor\",container!=\"POD\",container!=\"\"})by (pod))\nand 
on (pod) kube_pod_info{created_by_name=\"$Pod\"}\n) by (pod)", - "hide": false, - "interval": "", - "legendFormat": "", - "refId": "B" - } + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 17, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "last" ], - "timeFrom": null, - "timeShift": null, - "title": "Executor : Memory Usage", - "type": "timeseries" + "displayMode": "table", + "placement": "bottom" }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.0.5", + "targets": [ { - "datasource": "${data_source}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "CPU Time", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "stepAfter", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=\"$Pod\",container!=\"\"} [5m])) by (pod) / 1000 * 100", + "format": "time_series", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Used CPU", + "metric": "container_cpu", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(container_cpu_usage_seconds_total{pod=\"$Pod\", container_name!=\"POD\",container!=\"\"}) by (container_name)", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "refId": "B" + }, + { + "expr": "sum(container_spec_cpu_shares{pod=\"$Pod\", container_name!=\"POD\",container!=\"\"}) by (container_name) / 1000", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "legendFormat": "Requested CPU", + "refId": "C" + }, + { + "exemplar": true, + "expr": "", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "D" + } + ], + "title": "Driver CPU usage", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 46, + "panels": [], + "title": "Executor CPU /Memory Usage", + "type": "row" + }, + { + "datasource": { + "uid": "${PrometheusDataSource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 5 + { + "color": "red", + "value": 80 + } + ] }, - "id": 20, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "max", - 
"last" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 37 + }, + "id": 28, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "last" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.0.5", + "targets": [ + { + "exemplar": true, + "expr": "sum (container_memory_working_set_bytes{pod!=\"\",pod=~\"$executor\",container!=\"POD\",container!=\"\"} and on (pod) kube_pod_info{created_by_name=\"$Pod\"}) by (pod)", + "format": "time_series", + "hide": true, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{pod}} ", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + }, + { + "exemplar": true, + "expr": "avg(\n(avg(container_memory_working_set_bytes{pod!=\"\",pod=~\"$executor\",container!=\"POD\",container!=\"\"})by (pod)) / (avg(kube_pod_container_resource_limits{pod!=\"\",pod=~\"$executor\",container!=\"POD\",container!=\"\",resource=\"memory\"})by (pod))\nand on (pod) kube_pod_info{created_by_name=\"$Pod\"}\n) by (pod)", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "B" + } + ], + "title": "Executor : Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "uid": "${PrometheusDataSource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "CPU Time", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "stepAfter", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] }, - "pluginVersion": "8.0.5", - "repeat": null, - "repeatDirection": "h", - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$executor\",pod!=\"\"} [4m]) and on (pod) kube_pod_info{created_by_name=\"$Pod\"}) by (pod) ", - "format": "time_series", - "hide": false, - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "{{ pod }}", - "metric": "container_cpu", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(container_cpu_usage_seconds_total{pod=~\"$executor\", container_name!=\"POD\"}) by (pod) / 1000", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "legendFormat": "{{ pod }}", - "refId": "B" - }, - { - "expr": "sum(container_spec_cpu_shares{pod=~\"$executor\", container_name!=\"POD\"}) by (pod) / 1000", - "format": "time_series", - "hide": true, - "instant": false, - "intervalFactor": 1, - "legendFormat": "Requested {{pod}}", - "refId": "C" - } + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 37 + }, + "id": 20, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "last" ], - "timeFrom": null, - "timeShift": null, - "title": "Executor : CPU usage", - "type": "timeseries" + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": 
"none" + } + }, + "pluginVersion": "8.0.5", + "repeatDirection": "h", + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$executor\",pod!=\"\"} [4m]) and on (pod) kube_pod_info{created_by_name=\"$Pod\"}) by (pod) / 1000 * 100", + "format": "time_series", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ pod }}", + "metric": "container_cpu", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(container_cpu_usage_seconds_total{pod=~\"$executor\", container_name!=\"POD\"}) by (pod) / 1000", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "legendFormat": "{{ pod }}", + "refId": "B" + }, + { + "expr": "sum(container_spec_cpu_shares{pod=~\"$executor\", container_name!=\"POD\"}) by (pod) / 1000", + "format": "time_series", + "hide": true, + "instant": false, + "intervalFactor": 1, + "legendFormat": "Requested {{pod}}", + "refId": "C" } ], - "repeat": null, - "title": "Executor CPU /Memory Usage", - "type": "row" + "title": "Executor : CPU usage", + "type": "timeseries" }, { - "collapsed": true, - "datasource": null, + "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 5 + "y": 44 }, "id": 71, - "panels": [ - { - "datasource": "${data_source}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "decbytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 9, - "x": 0, - "y": 6 - }, - "id": 69, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "pluginVersion": "8.0.5", - "targets": [ - { - "expr": "sum(rate(container_network_receive_bytes_total{pod=~'$Pod'}[2m])) by (pod)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ pod }} Received", - "refId": "A" - }, - { - "expr": "sum(rate(container_network_transmit_bytes_total{pod=~'$Pod'}[2m])) by (pod)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ pod }} Transmitted", - "refId": "B" - }, - { - "expr": "", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "refId": "C" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Driver Network I/O", - "type": "timeseries" - }, - { - "datasource": "${data_source}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { 
- "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "decbytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 9, - "y": 6 - }, - "id": 72, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.0.5", - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(container_network_transmit_bytes_total{pod=~'$executor',kubernetes_io_hostname=~\"$Node\"}[2m]) and on (pod) kube_pod_info{created_by_name=\"$Pod\"}) by (pod,kubernetes_io_hostname)", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ pod }} {{kubernetes_io_hostname}} Transmitted", - "refId": "B" - }, - { - "expr": "", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "refId": "C" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Executor Network Transmitted", - "type": "timeseries" - }, - { - "datasource": "${data_source}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "decbytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 7, - "x": 17, - "y": 6 - }, - "id": 73, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.0.5", - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(container_network_receive_bytes_total{pod=~'$executor',pod!=\"pod\",kubernetes_io_hostname=~\"$Node\"}[2m]) and on (pod) kube_pod_info{created_by_name=\"$Pod\"}) by (pod,kubernetes_io_hostname)", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ pod }} {{kubernetes_io_hostname}} Received", - "refId": "A" - }, - { - "expr": "", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "legendFormat": "", - "refId": "B" - }, - { - "expr": "", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "refId": "C" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Executor Network Received", - "type": "timeseries" - } - ], + "panels": [], "title": "Network IO", "type": "row" }, { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 6 + "datasource": { + "uid": "${PrometheusDataSource}" }, - "id": 82, - "panels": [ - { - "datasource": "${data_source}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": 
false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 7 - }, - "id": 93, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] }, - "pluginVersion": "8.0.5", - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(container_fs_reads_bytes_total{pod=~\"$Pod\"}[2m])) by (pod,device)", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{pod }} - {{device}} Read", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(rate(container_fs_reads_bytes_total{pod=~\"$executor\"}[2m]) and on (pod) kube_pod_info{created_by_name=\"$Pod\"}) by (pod,device)", - "hide": false, - "interval": "", - "legendFormat": "{{pod }} - {{device}} Read", - "refId": "B" - } + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 9, + "x": 0, + "y": 45 + }, + "id": 69, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "max" ], - "timeFrom": null, - "timeShift": null, - "title": "Disk Read Bytes", - "type": "timeseries" + "displayMode": "table", + "placement": "bottom" }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.0.5", + "targets": [ { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${data_source}", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 7 - }, - "hiddenSeries": false, - "id": 92, - "legend": { - "avg": true, - "current": false, - "max": true, - "min": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true + "expr": "sum(rate(container_network_receive_bytes_total{pod=\"$Pod\"}[2m])) by (pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ pod }} Received", + "refId": "A" + }, + { + "expr": "sum(rate(container_network_transmit_bytes_total{pod=\"$Pod\"}[2m])) by (pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ pod }} Transmitted", + "refId": "B" + }, + { + "expr": "", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "refId": "C" + } + ], + 
"title": "Driver Network I/O", + "type": "timeseries" + }, + { + "datasource": { + "uid": "${PrometheusDataSource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] }, - "percentage": false, - "pluginVersion": "8.2.5", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(container_fs_writes_bytes_total{pod=~\"$executor\"}[2m]) and on (pod) kube_pod_info{created_by_name=\"$Pod\"}) by (pod,device)", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{pod }} - {{device}} Write", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(rate(container_fs_writes_bytes_total{pod=~\"$Pod\"}[2m])) by (pod,device)", - "hide": false, - "interval": "", - "legendFormat": "", - "refId": "B" - } + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 9, + "y": 45 + }, + "id": 72, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "max" ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Disk Writes Bytes", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.0.5", + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(container_network_transmit_bytes_total{pod=~'$executor',node=~\"$Node\"}[2m]) and on (pod,node) kube_pod_info{created_by_name=\"$Pod\"}) by (pod,node)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ pod }} {{node}} Transmitted", + "refId": "B" + }, + { + "expr": "", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "refId": "C" + } + ], + "title": "Executor Network Transmitted", + "type": "timeseries" + }, + { + "datasource": { + "uid": "${PrometheusDataSource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] }, 
- "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 17, + "y": 45 + }, + "id": 73, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "max" ], - "yaxis": { - "align": false, - "alignLevel": null - } + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.0.5", + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(container_network_receive_bytes_total{pod=~'$executor',pod!=\"POD\",node=~\"$Node\"}[2m]) and on (pod,node) kube_pod_info{created_by_name=\"$Pod\"}) by (pod,node)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ pod }} {{node}} Received", + "refId": "A" + }, + { + "expr": "", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "B" + }, + { + "expr": "", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "refId": "C" } ], + "title": "Executor Network Received", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 53 + }, + "id": 82, + "panels": [], "title": "Disk IO", "type": "row" }, + { + "datasource": { + "uid": "${PrometheusDataSource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 54 + }, + "id": 93, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.0.5", + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(container_fs_reads_bytes_total{pod=~\"$Pod\"}[2m])) by (pod,device)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{pod }} - {{device}} Read", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(rate(container_fs_reads_bytes_total{pod=~\"$executor\"}[2m]) and on (pod) kube_pod_info{created_by_name=\"$Pod\"}) by (pod,device)", + "hide": false, + "interval": "", + "legendFormat": "{{pod }} - {{device}} Read", + "refId": "B" + } + ], + "title": "Disk Read Bytes", + "type": "timeseries" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "${PrometheusDataSource}" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 54 + }, + "hiddenSeries": false, + "id": 92, + "legend": { + "avg": true, + "current": false, + "max": true, 
+ "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.4.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(container_fs_writes_bytes_total{pod=~\"$executor\"}[2m]) and on (pod) kube_pod_info{created_by_name=\"$Pod\"}) by (pod,device)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{pod }} - {{device}} Write", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(rate(container_fs_writes_bytes_total{pod=~\"$Pod\"}[2m])) by (pod,device)", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Disk Writes Bytes", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, { "collapsed": true, - "datasource": null, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 7 + "y": 63 }, "id": 110, "panels": [ { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "fieldConfig": { "defaults": { "color": { @@ -1766,8 +1820,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1803,17 +1856,17 @@ "targets": [ { "exemplar": true, - "expr": "((1-(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)\n/node_memory_MemTotal_bytes)\n\nand on (node) \nlabel_replace(kube_pod_info{node!=\"\",created_by_name=~\"$Pod\"},\"node\",\"$1\",\"node\", \"(.+)\"))", + "expr": "(1 - label_replace(((avg_over_time(node_memory_MemFree_bytes{container!=\"\"}[5m]) + avg_over_time(node_memory_Cached_bytes{container!=\"\"}[5m]) + avg_over_time(node_memory_Buffers_bytes{container!=\"\"}[5m])) / avg_over_time(node_memory_MemTotal_bytes{container!=\"\"}[5m])),\"host_ip\", \"$1\", \"instance\", \"(.*):.*\") and on (host_ip) kube_pod_info{created_by_name=\"$Pod\",container!=\"\"})", "interval": "", - "legendFormat": "{{node}} - Executor", + "legendFormat": "Node: {{host_ip}} - Executor", "refId": "A" }, { "exemplar": true, - "expr": "(1-(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)\n/node_memory_MemTotal_bytes)\nand on (node) {node=\"$driver_node\"}", + "expr": "(1 - label_replace(((avg_over_time(node_memory_MemFree_bytes{container!=\"\"}[5m]) + avg_over_time(node_memory_Cached_bytes{container!=\"\"}[5m]) + avg_over_time(node_memory_Buffers_bytes{container!=\"\"}[5m])) / avg_over_time(node_memory_MemTotal_bytes{container!=\"\"}[5m])),\"host_ip\", \"$1\",\"instance\", \"(.*):.*\") and on (host_ip) kube_pod_info{pod=\"$Pod\",container!=\"\"})", "hide": false, "interval": "", - "legendFormat": "{{node}} - Driver", + "legendFormat": "Node: {{host_ip}} - Driver", "refId": "C" } ], @@ -1822,7 +1875,9 @@ "type": "timeseries" }, { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "fieldConfig": { "defaults": { "color": { @@ -1861,8 +1916,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - 
"value": null + "color": "green" }, { "color": "red", @@ -1898,17 +1952,17 @@ "targets": [ { "exemplar": true, - "expr": "avg(1 - sum(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]) ) by (cpu, node)) by (node)\nand on (node) \nlabel_replace(kube_pod_info{node!=\"\",created_by_name=~\"$Pod\"},\"node\",\"$1\",\"node\", \"(.+)\")", + "expr": "avg(1 - label_replace(sum(rate(node_cpu_seconds_total{mode=\"idle\",container!=\"\"}[5m]) ) by (cpu, instance), \"host_ip\", \"$1\",\"instance\", \"(.*):.*\")) by (host_ip) and on (host_ip) kube_pod_info{created_by_name=\"$Pod\"}", "interval": "", - "legendFormat": "{{node}} - Executor", + "legendFormat": "Node: {{host_ip}} - Executor", "refId": "A" }, { "exemplar": true, - "expr": "sum(avg(1 - sum(rate(node_cpu_seconds_total{mode=\"idle\",node=\"$driver_node\"}[5m]) ) by (cpu, node)) by (node))by(node)", + "expr": "avg(1 - label_replace(sum(rate(node_cpu_seconds_total{mode=\"idle\",container!=\"\"}[5m]) ) by (cpu, instance), \"host_ip\", \"$1\",\"instance\", \"(.*):.*\")) by (host_ip) and on (host_ip) kube_pod_info{pod=\"$Pod\"}", "hide": false, "interval": "", - "legendFormat": "{{node}} - Driver", + "legendFormat": "Node: {{host_ip}} - Driver", "refId": "B" } ], @@ -1916,7 +1970,9 @@ "type": "timeseries" }, { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "description": "", "fieldConfig": { "defaults": { @@ -1956,8 +2012,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1994,31 +2049,31 @@ "targets": [ { "exemplar": true, - "expr": "sort_desc(\nsum(\nrate(node_network_transmit_bytes_total[5m]) \nand on (node) \nlabel_replace(kube_pod_info{node!=\"\",created_by_name=~\"$Pod\"},\"node\",\"$1\",\"node\", \"(.+)\")\n)\nby (node)\n)", + "expr": "sort_desc(label_replace(sum(rate(node_network_transmit_bytes_total{container!=\"\"}[5m])) by (instance),\"host_ip\", \"$1\",\"instance\", \"(.*):.*\") and on (host_ip) kube_pod_info{created_by_name=\"$Pod\"})", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "Node: {{node}} - Executor", + "legendFormat": "Node: {{host_ip}} - Executor", "refId": "C" }, { "exemplar": true, - "expr": "sort_desc(\nsum(\nrate(node_network_transmit_bytes_total{node=\"$driver_node\"}[5m]))by (node)\n)", + "expr": "sort_desc(label_replace(sum(rate(node_network_transmit_bytes_total{container!=\"\"}[5m])) by (instance),\"host_ip\", \"$1\",\"instance\", \"(.*):.*\") and on (host_ip) kube_pod_info{pod=\"$Pod\"})", "hide": false, "interval": "", - "legendFormat": "Node: {{node}} - Driver", + "legendFormat": "Node: {{host_ip}} - Driver", "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "Node Network Transmitted", "transformations": [], "type": "timeseries" }, { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "description": "", "fieldConfig": { "defaults": { @@ -2058,8 +2113,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2096,31 +2150,31 @@ "targets": [ { "exemplar": true, - "expr": "sort_desc(\nsum(\nrate(node_network_receive_bytes_total[5m]) \nand on (node) \nlabel_replace(kube_pod_info{node!=\"\",created_by_name=~\"$Pod\",created_by_name!~\"$job_id-spark-defaults\"},\"node\",\"$1\",\"node\", \"(.+)\")\n)\nby (node)\n)", + "expr": "sort_desc(label_replace(sum(rate(node_network_receive_bytes_total{container!=\"\"}[5m])) by (instance),\"host_ip\", 
\"$1\",\"instance\", \"(.*):.*\") and on (host_ip) kube_pod_info{created_by_name=\"$Pod\"})", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "Node: {{node}} - Executor", + "legendFormat": "Node: {{host_ip}} - Executor", "refId": "C" }, { "exemplar": true, - "expr": "sort_desc(\nsum(\nrate(node_network_receive_bytes_total{node=\"$driver_node\"}[5m]))by (node)\n)", + "expr": "sort_desc(label_replace(sum(rate(node_network_receive_bytes_total{container!=\"\"}[5m])) by (instance),\"host_ip\", \"$1\",\"instance\", \"(.*):.*\") and on (host_ip) kube_pod_info{pod=\"$Pod\"})", "hide": false, "interval": "", - "legendFormat": "Node: {{node}} - Driver", + "legendFormat": "Node: {{host_ip}} - Driver", "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "Node Network Received", "transformations": [], "type": "timeseries" }, { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "fieldConfig": { "defaults": { "color": { @@ -2160,8 +2214,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2197,14 +2250,14 @@ "targets": [ { "exemplar": true, - "expr": "(count(kube_pod_info{created_by_name=\"$Pod\",node!=\"\"})by(node))", + "expr": "(count(kube_pod_info{created_by_name=\"$Pod\",node!=\"\",container!=\"\"})by(node))", "interval": "", "legendFormat": "Node: {{node}} - Executor", "refId": "A" }, { "exemplar": true, - "expr": "(count(kube_pod_info{pod=\"$Pod\",node!=\"\"})by(node))", + "expr": "(count(kube_pod_info{pod=\"$Pod\",node!=\"\",container!=\"\"})by(node))", "hide": false, "interval": "", "legendFormat": "Node: {{node}} - Driver", @@ -2215,7 +2268,9 @@ "type": "timeseries" }, { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "fieldConfig": { "defaults": { "color": { @@ -2254,8 +2309,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2289,17 +2343,17 @@ "targets": [ { "exemplar": true, - "expr": "(1-avg(node_filesystem_free_bytes/node_filesystem_size_bytes)by(node))\nand on (node) \nlabel_replace(kube_pod_info{node!=\"\",created_by_name=\"$Pod\"},\"node\",\"$1\",\"node\", \"(.+)\")", + "expr": "label_replace((1-avg(node_filesystem_free_bytes{container!=\"\"}/node_filesystem_size_bytes{container!=\"\"})by(instance)),\"host_ip\", \"$1\", \"instance\", \"(.*):.*\") and on (host_ip) kube_pod_info{created_by_name=\"$Pod\",container!=\"\"}", "interval": "", - "legendFormat": "Node: {{node}} - Executor", + "legendFormat": "Node: {{host_ip}} - Executor", "refId": "A" }, { "exemplar": true, - "expr": "(sum(node_filesystem_size_bytes{node=\"$driver_node\"} - node_filesystem_free_bytes{node=\"$driver_node\"})by(node))", + "expr": "label_replace((1-avg(node_filesystem_free_bytes{container!=\"\"}/node_filesystem_size_bytes{container!=\"\"}) by(instance)),\"host_ip\", \"$1\", \"instance\", \"(.*):.*\") and on (host_ip) kube_pod_info{pod=\"$Pod\",container!=\"\"}", "hide": true, "interval": "", - "legendFormat": "Node: {{node}} - Driver", + "legendFormat": "Node: {{host_ip}} - Driver", "refId": "B" } ], @@ -2307,7 +2361,9 @@ "type": "timeseries" }, { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "fieldConfig": { "defaults": { "color": { @@ -2347,8 +2403,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2382,17 +2437,17 @@ 
"targets": [ { "exemplar": true, - "expr": "(sum(rate(node_disk_read_bytes_total[5m]))by(node)) and on (node) \nlabel_replace(kube_pod_info{node!=\"\",created_by_name=\"$Pod\"},\"node\",\"$1\",\"node\", \"(.+)\")", + "expr": "label_replace(sum(rate(node_disk_read_bytes_total{container!=\"\"}[5m]))by(instance),\"host_ip\",\"$1\",\"instance\", \"(.+):.*\") and on (host_ip) \nkube_pod_info{created_by_name=\"$Pod\",container!=\"\"}", "interval": "", - "legendFormat": "Node: {{node}} - Executor", + "legendFormat": "Node: {{host_ip}} - Executor", "refId": "A" }, { "exemplar": true, - "expr": "(sum(rate(node_disk_read_bytes_total{node=\"$driver_node\"}[5m]))by(node))", + "expr": "label_replace(sum(rate(node_disk_read_bytes_total{container!=\"\"}[5m]))by(instance),\"host_ip\",\"$1\",\"instance\", \"(.+):.*\") and on (host_ip) \nkube_pod_info{pod=\"$Pod\",container!=\"\"}", "hide": false, "interval": "", - "legendFormat": "Node: {{node}} - Driver", + "legendFormat": "Node: {{host_ip}} - Driver", "refId": "B" } ], @@ -2400,7 +2455,9 @@ "type": "timeseries" }, { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "fieldConfig": { "defaults": { "color": { @@ -2440,8 +2497,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2475,17 +2531,17 @@ "targets": [ { "exemplar": true, - "expr": "(sum(rate(node_disk_written_bytes_total[5m]))by(node)) and on (node) \nlabel_replace(kube_pod_info{node!=\"\",created_by_name=\"$Pod\"},\"node\",\"$1\",\"node\", \"(.+)\")", + "expr": "label_replace(sum(rate(node_disk_written_bytes_total{container!=\"\"}[5m]))by(instance),\"host_ip\",\"$1\",\"instance\", \"(.+):.*\") and on (host_ip) \nkube_pod_info{created_by_name=\"$Pod\",container!=\"\"}", "interval": "", - "legendFormat": "Node: {{node}} - Executor", + "legendFormat": "Node: {{host_ip}} - Executor", "refId": "A" }, { "exemplar": true, - "expr": "(sum(rate(node_disk_written_bytes_total{node=\"$driver_node\"}[5m]))by(node))", + "expr": "label_replace(sum(rate(node_disk_written_bytes_total{container!=\"\"}[5m]))by(instance),\"host_ip\",\"$1\",\"instance\", \"(.+):.*\") and on (host_ip) \nkube_pod_info{pod=\"$Pod\",container!=\"\"}", "hide": false, "interval": "", - "legendFormat": "Node: {{node}} - Driver", + "legendFormat": "Node: {{host_ip}} - Driver", "refId": "B" } ], @@ -2498,17 +2554,18 @@ }, { "collapsed": true, - "datasource": null, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 8 + "y": 64 }, "id": 132, "panels": [ { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "fieldConfig": { "defaults": { "color": { @@ -2548,8 +2605,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2669,7 +2725,9 @@ "type": "timeseries" }, { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "fieldConfig": { "defaults": { "color": { @@ -2708,8 +2766,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2752,7 +2809,9 @@ "type": "timeseries" }, { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "fieldConfig": { "defaults": { "color": { @@ -2791,8 +2850,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2835,7 +2893,9 @@ "type": "timeseries" }, { - "datasource": "${data_source}", + "datasource": { + "uid": 
"${PrometheusDataSource}" + }, "fieldConfig": { "defaults": { "color": { @@ -2874,8 +2934,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2918,7 +2977,9 @@ "type": "timeseries" }, { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "fieldConfig": { "defaults": { "color": { @@ -2957,8 +3018,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3022,17 +3082,18 @@ }, { "collapsed": true, - "datasource": null, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 9 + "y": 65 }, "id": 89, "panels": [ { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "fieldConfig": { "defaults": { "color": { @@ -3071,8 +3132,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3119,22 +3179,22 @@ }, { "exemplar": true, - "expr": "sum(rate(container_network_transmit_bytes_total{pod=~'.*shuffle.*',kubernetes_io_hostname=~\"$Node\"}[2m])) by (pod,kubernetes_io_hostname)", + "expr": "sum(rate(container_network_transmit_bytes_total{pod=~'.*shuffle.*',node=~\"$Node\"}[2m])) by (pod,node)", "format": "time_series", "hide": true, "interval": "", "intervalFactor": 1, - "legendFormat": "{{ pod }} {{kubernetes_io_hostname}} Transmitted", + "legendFormat": "{{ pod }} {{node}} Transmitted", "refId": "B" } ], - "timeFrom": null, - "timeShift": null, "title": "Spark Shuffle Data Written", "type": "timeseries" }, { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "fieldConfig": { "defaults": { "color": { @@ -3173,8 +3233,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3221,12 +3280,12 @@ }, { "exemplar": true, - "expr": "sum(rate(container_network_read_bytes_total{pod=~'.*shuffle.*',kubernetes_io_hostname=~\"$Node\"}[2m])) by (pod,kubernetes_io_hostname)", + "expr": "sum(rate(container_network_read_bytes_total{pod=~'.*shuffle.*',node=~\"$Node\"}[2m])) by (pod,node)", "format": "time_series", "hide": true, "interval": "", "intervalFactor": 1, - "legendFormat": "{{ pod }} {{kubernetes_io_hostname}} Received", + "legendFormat": "{{ pod }} {{node}} Received", "refId": "A" }, { @@ -3237,8 +3296,6 @@ "refId": "C" } ], - "timeFrom": null, - "timeShift": null, "title": "Spark Shuffle Data Read", "type": "timeseries" } @@ -3248,17 +3305,18 @@ }, { "collapsed": true, - "datasource": null, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 10 + "y": 66 }, "id": 49, "panels": [ { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "fieldConfig": { "defaults": { "color": { @@ -3297,8 +3355,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3336,7 +3393,7 @@ "targets": [ { "exemplar": true, - "expr": "metrics_executor_JVMHeapMemory_bytes{pod=\"$Pod\"}", + "expr": "sum(metrics_executor_JVMHeapMemory_bytes{pod=\"$Pod\"})by(executor_id)", "format": "time_series", "interval": "10s", "intervalFactor": 1, @@ -3346,23 +3403,23 @@ "step": 10 }, { - "expr": "jvm_memory_bytes_used{kubernetes_pod=~\"^$executor$\",area=\"heap\"}", + "expr": "jvm_memory_bytes_used{pod=~\"^$executor$\",area=\"heap\"}", "format": "time_series", "interval": "10s", "intervalFactor": 1, - "legendFormat": " {{ kubernetes_pod }} ", + "legendFormat": " {{ pod }} ", "metric": "network", "refId": 
"B", "step": 10 } ], - "timeFrom": null, - "timeShift": null, "title": "JVM Heap Memory", "type": "timeseries" }, { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "fieldConfig": { "defaults": { "color": { @@ -3401,8 +3458,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3450,19 +3506,17 @@ "step": 10 }, { - "expr": "jvm_memory_bytes_used{kubernetes_pod=~\"$executor\",area=\"nonheap\"}", + "expr": "jvm_memory_bytes_used{pod=~\"$executor\",area=\"nonheap\"}", "format": "time_series", "hide": true, "interval": "10s", "intervalFactor": 1, - "legendFormat": "{{kubernetes_pod }}", + "legendFormat": "{{pod}}", "metric": "network", "refId": "B", "step": 10 } ], - "timeFrom": null, - "timeShift": null, "title": "JVM Off Heap Memory", "type": "timeseries" } @@ -3472,12 +3526,11 @@ }, { "collapsed": true, - "datasource": null, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 11 + "y": 67 }, "id": 76, "panels": [ @@ -3486,7 +3539,9 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "decimals": 0, "fill": 1, "fillGradient": 0, @@ -3526,7 +3581,7 @@ "targets": [ { "exemplar": true, - "expr": "count(kube_pod_info{created_by_name=~\"$Pod\"})", + "expr": "count(kube_pod_info{created_by_name=~\"$Pod\",container!=\"\"})", "format": "time_series", "instant": false, "interval": "", @@ -3536,9 +3591,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Executors Spawned", "tooltip": { "shared": true, @@ -3547,37 +3600,30 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { - "datasource": "${data_source}", + "datasource": { + "uid": "${PrometheusDataSource}" + }, "fieldConfig": { "defaults": { "color": { @@ -3616,8 +3662,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3669,8 +3714,6 @@ "refId": "B" } ], - "timeFrom": null, - "timeShift": null, "title": "Packets Dropped : Network", "type": "timeseries" } @@ -3680,7 +3723,7 @@ } ], "refresh": false, - "schemaVersion": 32, + "schemaVersion": 35, "style": "dark", "tags": [ "kubernetes", @@ -3695,11 +3738,9 @@ { "current": { "selected": false, - "text": "Prometheus ws-a292e34c-530a-4f85-b97c-75921297873e", - "value": "Prometheus ws-a292e34c-530a-4f85-b97c-75921297873e" + "text": "Your Data Source", + "value": "Your Data Source" }, - "description": null, - "error": null, "hide": 0, "includeAll": false, "label": "Data Source", @@ -3715,19 +3756,12 @@ }, { "allValue": ".*", - "current": { - "selected": false, - "text": [ - "All" - ], - "value": [ - "$__all" - ] + "current": {}, + "datasource": { + "uid": "${PrometheusDataSource}" }, - "datasource": "${data_source}", "definition": "label_values(kube_node_info{},cluster_arn)", "description": "The Cluster ARN", - "error": null, "hide": 0, "includeAll": true, "label": "Cluster ARN", @@ -3745,16 +3779,11 @@ "type": "query" }, { - "allValue": null, - "current": { - "selected": false, - "text": "i95ic4j9doq34jh4gvf4tunb1", - "value": "i95ic4j9doq34jh4gvf4tunb1" + "current": 
{}, + "datasource": { + "uid": "${PrometheusDataSource}" }, - "datasource": "${data_source}", "definition": "spark_info{cluster_arn=~\"$c_arn\"}", - "description": null, - "error": null, "hide": 0, "includeAll": false, "label": "Virtual Cluster ID", @@ -3772,16 +3801,11 @@ "type": "query" }, { - "allValue": null, - "current": { - "selected": false, - "text": "00000002vps80mvoniq", - "value": "00000002vps80mvoniq" + "current": {}, + "datasource": { + "uid": "${PrometheusDataSource}" }, - "datasource": "${data_source}", "definition": "spark_info{}", - "description": null, - "error": null, "hide": 0, "includeAll": false, "label": "Job ID", @@ -3800,15 +3824,11 @@ }, { "allValue": ".*", - "current": { - "selected": false, - "text": "None", - "value": "" + "current": {}, + "datasource": { + "uid": "${PrometheusDataSource}" }, - "datasource": "${data_source}", "definition": "spark_info{emr_containers_amazonaws_com_job_id=\"$job_id\",emr_containers_amazonaws_com_virtual_cluster_id=\"$vc_id\"}", - "description": null, - "error": null, "hide": 0, "includeAll": false, "label": "Driver ID", @@ -3830,19 +3850,11 @@ }, { "allValue": ".*", - "current": { - "selected": false, - "text": [ - "All" - ], - "value": [ - "$__all" - ] + "current": {}, + "datasource": { + "uid": "${PrometheusDataSource}" }, - "datasource": "${data_source}", "definition": "kube_pod_info{created_by_name=\"$Pod\"}", - "description": null, - "error": null, "hide": 0, "includeAll": true, "label": "Executor ID", @@ -3864,18 +3876,13 @@ }, { "allValue": ".*", - "current": { - "selected": false, - "text": "All", - "value": "$__all" + "current": {}, + "datasource": { + "uid": "${PrometheusDataSource}" }, - "datasource": "${data_source}", "definition": "kube_pod_info{created_by_name=\"$Pod\"}", - "description": null, - "error": null, "hide": 0, "includeAll": true, - "label": null, "multi": false, "name": "Node", "options": [], @@ -3893,19 +3900,13 @@ "useTags": false }, { - "allValue": null, - "current": { - "selected": false, - "text": "None", - "value": "" + "current": {}, + "datasource": { + "uid": "${PrometheusDataSource}" }, - "datasource": "${data_source}", "definition": "kube_pod_info{pod=\"$Pod\"}", - "description": null, - "error": null, "hide": 2, "includeAll": false, - "label": null, "multi": false, "name": "Pod_ip", "options": [], @@ -3923,28 +3924,18 @@ "useTags": false }, { - "allValue": null, - "current": { - "selected": false, - "text": [ - "None" - ], - "value": [ - "" - ] + "current": {}, + "datasource": { + "uid": "${PrometheusDataSource}" }, - "datasource": "${data_source}", - "definition": "kube_pod_status_phase{pod=~\"$Pod\"}", - "description": null, - "error": null, + "definition": "kube_pod_status_phase{pod=~\"$Pod\",container!=\"\"}", "hide": 2, "includeAll": false, - "label": null, "multi": true, "name": "phase", "options": [], "query": { - "query": "kube_pod_status_phase{pod=~\"$Pod\"}", + "query": "kube_pod_status_phase{pod=~\"$Pod\",container!=\"\"}", "refId": "StandardVariableQuery" }, "refresh": 2, @@ -3956,24 +3947,18 @@ "type": "query" }, { - "allValue": null, - "current": { - "selected": false, - "text": "None", - "value": "" + "current": {}, + "datasource": { + "uid": "${PrometheusDataSource}" }, - "datasource": "${data_source}", - "definition": "kube_pod_info{pod=\"$Pod\",node!=\"\"}", - "description": null, - "error": null, + "definition": "kube_pod_info{pod=\"$Pod\"}", "hide": 2, "includeAll": false, - "label": null, "multi": false, "name": "driver_node", "options": [], "query": { - "query": 
"kube_pod_info{pod=\"$Pod\",node!=\"\"}", + "query": "kube_pod_info{pod=\"$Pod\"}", "refId": "StandardVariableQuery" }, "refresh": 2, @@ -3983,16 +3968,11 @@ "type": "query" }, { - "allValue": null, - "current": { - "selected": false, - "text": "prometheus", - "value": "prometheus" + "current": {}, + "datasource": { + "uid": "${PrometheusDataSource}" }, - "datasource": "${data_source}", "definition": "kube_namespace_labels", - "description": null, - "error": null, "hide": 2, "includeAll": false, "label": "Namespace", @@ -4010,19 +3990,13 @@ "type": "query" }, { - "allValue": null, - "current": { - "selected": false, - "text": "3.1.1", - "value": "3.1.1" + "current": {}, + "datasource": { + "uid": "${PrometheusDataSource}" }, - "datasource": "${data_source}", "definition": "spark_info{}", - "description": null, - "error": null, "hide": 2, "includeAll": false, - "label": null, "multi": false, "name": "spark_version", "options": [], @@ -4037,19 +4011,13 @@ "type": "query" }, { - "allValue": null, - "current": { - "selected": false, - "text": "emr-data-team-a", - "value": "emr-data-team-a" + "current": {}, + "datasource": { + "uid": "${PrometheusDataSource}" }, - "datasource": "${data_source}", "definition": "spark_info{}", - "description": null, - "error": null, "hide": 2, "includeAll": false, - "label": null, "multi": false, "name": "driver_ns", "options": [], @@ -4064,19 +4032,13 @@ "type": "query" }, { - "allValue": null, - "current": { - "selected": false, - "text": "00000002vps80mvoniq-bgjlb", - "value": "00000002vps80mvoniq-bgjlb" + "current": {}, + "datasource": { + "uid": "${PrometheusDataSource}" }, - "datasource": "${data_source}", "definition": "label_values(kube_pod_info{created_by_kind=\"Job\",created_by_name=\"$job_id\"},pod)", - "description": null, - "error": null, "hide": 2, "includeAll": false, - "label": null, "multi": false, "name": "job_runner_pod", "options": [], @@ -4093,8 +4055,8 @@ ] }, "time": { - "from": "now-6h", - "to": "now" + "from": "2023-02-17T08:00:00.000Z", + "to": "2023-02-23T07:59:59.000Z" }, "timepicker": { "refresh_intervals": [ @@ -4124,5 +4086,6 @@ "timezone": "browser", "title": "Spark - EMR On EKS", "uid": "E1-VXaK7z5", - "version": 3 + "version": 2, + "weekStart": "" } diff --git a/analytics/terraform/emr-eks-karpenter/emr-grafana-dashboard/spark-monitor.yaml b/analytics/terraform/emr-eks-karpenter/emr-grafana-dashboard/spark-monitor.yaml new file mode 100644 index 000000000..e9f87412a --- /dev/null +++ b/analytics/terraform/emr-eks-karpenter/emr-grafana-dashboard/spark-monitor.yaml @@ -0,0 +1,36 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + labels: + prometheus: "true" + name: spark-driver-monitoring +spec: + jobLabel: spark-driver-monitoring + namespaceSelector: + matchNames: [emr-data-team-a,emr-data-team-b] + podMetricsEndpoints: + - port: "web-ui" + path: /metrics/executors/prometheus/ + selector: + matchLabels: + spark-role: driver + emr-containers.amazonaws.com/resource.type: job.run + + +---- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + prometheus: "true" + name: spark-service-monitoring +spec: + namespaceSelector: + matchNames: [emr-data-team-a,emr-data-team-b] + endpoints: + - port: "web-ui" + path: /metrics/driver/prometheus/ + selector: + matchLabels: + spark_role: driver + emr-containers.amazonaws.com/resource.type: job.run diff --git a/analytics/terraform/emr-eks-karpenter/examples/benchmark/tpcds-benchmark-1t.yaml 
b/analytics/terraform/emr-eks-karpenter/examples/benchmark/tpcds-benchmark-1t.yaml new file mode 100644 index 000000000..e3e89c729 --- /dev/null +++ b/analytics/terraform/emr-eks-karpenter/examples/benchmark/tpcds-benchmark-1t.yaml @@ -0,0 +1,114 @@ +# NOTE: This example requires the following prerequisites before executing the jobs +# 1. Ensure spark-team-a name space exists +# 2. replace with your bucket name +# 3. Ensure you run "analytics/spark-k8s-operator/spark-samples/tpcds-benchmark-data-generation-1t.yaml" which generates 3 TB input data + +--- +apiVersion: "sparkoperator.k8s.io/v1beta2" +kind: SparkApplication +metadata: + name: tpcds-benchmark-3tb + namespace: spark-team-a +spec: + type: Scala + mode: cluster + image: public.ecr.aws/data-on-eks/emr-on-eks-benchmark:3.1.2 + imagePullPolicy: IfNotPresent + sparkVersion: 3.1.2 + mainClass: com.amazonaws.eks.tpcds.BenchmarkSQL + mainApplicationFile: local:///opt/spark/examples/jars/eks-spark-benchmark-assembly-1.0.jar + arguments: + # TPC-DS data location + - "s3://blogpost-sparkoneks-us-east-1/blog/BLOG_TPCDS-TEST-3T-partitioned" + # results location + - "s3:///TPCDS-TEST-3T-RESULT" + # Path to kit in the docker image + - "/opt/tpcds-kit/tools" + # Data Format + - "parquet" + # Scale factor (in GB) + - "3000" # changed from 3000 to 100gb for demo + # Number of iterations + - "1" + # Optimize queries with hive tables + - "false" + # Filter queries, will run all if empty - "q98-v2.4,q99-v2.4,ss_max-v2.4,q95-v2.4" + - "" + # Logging set to WARN + - "true" + sparkConf: + "spark.network.timeout": "2000s" + "spark.executor.heartbeatInterval": "300s" + # AQE + "spark.sql.adaptive.enabled": "true" + "spark.sql.adaptive.localShuffleReader.enabled": "true" + "spark.sql.adaptive.coalescePartitions.enabled": "true" + "spark.sql.adaptive.skewJoin.enabled": "true" + # "spark.sql.adaptive.logLevel": "WARN" + # IRSA for S3 connection + "spark.kubernetes.executor.podNamePrefix": "benchmark-exec" + "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider" + "spark.hadoop.fs.s3.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem" + "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2" + "spark.executor.defaultJavaOptions": "-verbose:gc -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70" + # Keep pods in a single AZ + # "spark.kubernetes.node.selector.topology.kubernetes.io/zone": "us-west-1b" + # "spark.kubernetes.node.selector.eks.amazonaws.com/capacityType": "ON_DEMAND" + # ----------------------------------------------------- + # This block is very critical when you get errors like + # Exception in thread \"main\" io.fabric8.kubernetes.client.KubernetesClientException: An error has occurred + # Caused by: java.net.SocketTimeoutException: timeout + # spark.kubernetes.local.dirs.tmpfs: "true" # More details here https://spark.apache.org/docs/latest/running-on-kubernetes.html#using-ram-for-local-storage + spark.kubernetes.submission.connectionTimeout: "120000" # milliseconds + spark.kubernetes.submission.requestTimeout: "120000" + spark.kubernetes.driver.connectionTimeout: "120000" + spark.kubernetes.driver.requestTimeout: "120000" + # spark.kubernetes.allocation.batch.size: "20" # default 5 but adjust according to your cluster size + # ----------------------------------------------------- + volumes: + - name: spark-local-dir-1 + hostPath: + path: /local1 + driver: + volumeMounts: + - name: spark-local-dir-1 + mountPath: /ossdata1 + readOnly: false + initContainers: + - name: 
volume-permission + image: public.ecr.aws/y4g4v0z7/busybox + command: ['sh', '-c', 'mkdir /ossdata1; chown -R 1000:1000 /ossdata1'] + volumeMounts: + - name: spark-local-dir-1 + mountPath: /ossdata1 + cores: 4 + coreLimit: "4.1" + memory: "5g" + memoryOverhead: "1000" + serviceAccount: spark-team-a + nodeSelector: + provisioner: spark-compute-optimized + executor: + volumeMounts: + - name: spark-local-dir-1 + mountPath: /ossdata1 + readOnly: false + initContainers: + - name: volume-permission + image: public.ecr.aws/y4g4v0z7/busybox + command: ['sh', '-c', 'mkdir /ossdata1; chown -R 1000:1000 /ossdata1'] + volumeMounts: + - name: spark-local-dir-1 + mountPath: /ossdata1 + cores: 4 + coreLimit: "4.3" + memory: "6g" + memoryOverhead: "2g" + # 8 executors per node + instances: 47 # changed from 47 to 20 for demo + serviceAccount: spark-team-a + nodeSelector: + provisioner: spark-compute-optimized + + restartPolicy: + type: Never diff --git a/analytics/terraform/emr-eks-karpenter/examples/benchmark/tpcds-benchmark-data-generation-1t.yaml b/analytics/terraform/emr-eks-karpenter/examples/benchmark/tpcds-benchmark-data-generation-1t.yaml new file mode 100644 index 000000000..4b85df7e8 --- /dev/null +++ b/analytics/terraform/emr-eks-karpenter/examples/benchmark/tpcds-benchmark-data-generation-1t.yaml @@ -0,0 +1,109 @@ +# NOTE: This example requires the following prerequisites before executing the jobs +# 1. Ensure spark-team-a name space exists +# 2. replace with your bucket name + +--- +apiVersion: "sparkoperator.k8s.io/v1beta2" +kind: SparkApplication +metadata: + name: tpcds-data-generation-3t + namespace: spark-team-a +spec: + type: Scala + mode: cluster + image: public.ecr.aws/data-on-eks/emr-on-eks-benchmark:3.1.2 + imagePullPolicy: IfNotPresent + sparkVersion: 3.1.2 + mainClass: com.amazonaws.eks.tpcds.DataGeneration + mainApplicationFile: local:///opt/spark/examples/jars/eks-spark-benchmark-assembly-1.0.jar + arguments: + # TPC-DS data location + - "s3a:///TPCDS-TEST-3T" + # Path to kit in the docker image + - "/opt/tpcds-kit/tools" + # Data Format + - "parquet" + # Scale factor (in GB) + - "3000" + # Generate data num partitions + - "200" + # Create the partitioned fact tables + - "true" + # Shuffle to get partitions coalesced into single files. 
+ - "true" + # Logging set to WARN + - "true" + sparkConf: + "spark.network.timeout": "2000s" + "spark.executor.heartbeatInterval": "300s" + "spark.kubernetes.memoryOverheadFactor": "0.3" + "spark.sql.files.maxRecordsPerFile": "30000000" + "spark.serializer": "org.apache.spark.serializer.KryoSerializer" + # "spark.local.dir": "/data1" + + # S3 settings + "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider" + "spark.hadoop.fs.s3a.fast.upload": "true" + "spark.hadoop.fs.s3a.path.style.access": "true" + "spark.hadoop.fs.s3.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem" + "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2" + "spark.kubernetes.executor.podNamePrefix": "oss-data-gen" + "spark.executor.defaultJavaOptions": "-verbose:gc -XX:+UseG1GC" + "spark.driver.defaultJavaOptions": "-XX:+UseG1GC" + # ----------------------------------------------------- + # This block is very critical when you get errors like + # Exception in thread \"main\" io.fabric8.kubernetes.client.KubernetesClientException: An error has occurred + # Caused by: java.net.SocketTimeoutException: timeout + # spark.kubernetes.local.dirs.tmpfs: "true" + spark.kubernetes.submission.connectionTimeout: "60000000" + spark.kubernetes.submission.requestTimeout: "60000000" + spark.kubernetes.driver.connectionTimeout: "60000000" + spark.kubernetes.driver.requestTimeout: "60000000" + # spark.kubernetes.allocation.batch.size: "20" # default 5 but adjust according to your cluster size + # ----------------------------------------------------- + + restartPolicy: + type: Never + volumes: + - name: spark-local-dir-1 + hostPath: + path: /local1 + driver: + volumeMounts: + - name: spark-local-dir-1 + mountPath: /ossdata1 + readOnly: false + initContainers: + - name: volume-permission + image: public.ecr.aws/y4g4v0z7/busybox + command: ['sh', '-c', 'mkdir /ossdata1; chown -R 1000:1000 /ossdata1'] + volumeMounts: + - name: spark-local-dir-1 + mountPath: /ossdata1 + cores: 10 + coreLimit: "10.1" + memory: "10g" + serviceAccount: spark-team-a + nodeSelector: + provisioner: spark-compute-optimized + + executor: + volumeMounts: + - name: spark-local-dir-1 + mountPath: /ossdata1 + readOnly: false + initContainers: + - name: volume-permission + image: public.ecr.aws/y4g4v0z7/busybox + command: ['sh', '-c', 'mkdir /ossdata1; chown -R 1000:1000 /ossdata1'] + volumeMounts: + - name: spark-local-dir-1 + mountPath: /ossdata1 + cores: 11 + coreLimit: "11.1" + memory: "15g" + # 3 executors per node 9 nodes + instances: 26 + serviceAccount: spark-team-a + nodeSelector: + provisioner: spark-compute-optimized diff --git a/analytics/terraform/emr-eks-karpenter/examples/fsx-for-lustre/fsx-dynamic-pvc-shuffle-storage/driver-pod-template.yaml b/analytics/terraform/emr-eks-karpenter/examples/fsx-for-lustre/fsx-dynamic-pvc-shuffle-storage/driver-pod-template.yaml index 1d4a43e47..76a5b9825 100644 --- a/analytics/terraform/emr-eks-karpenter/examples/fsx-for-lustre/fsx-dynamic-pvc-shuffle-storage/driver-pod-template.yaml +++ b/analytics/terraform/emr-eks-karpenter/examples/fsx-for-lustre/fsx-dynamic-pvc-shuffle-storage/driver-pod-template.yaml @@ -27,7 +27,7 @@ spec: volumeMounts: - name: spark-local-dir-1 mountPath: /dynamic # FSx Scratch 1 filesystem for executors scratch space - command: ["sh", "-c", "chmod 777 /dynamic", "chown -hR +999:+1000 /dynamic"] + command: ["sh", "-c", "chmod 744 /dynamic", "chown -hR +999:+1000 /dynamic"] tolerations: - key: "spark-compute-optimized" diff --git 
a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/driver-pod-template.yaml b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/driver-pod-template.yaml index 5d63bcd61..e31b03a26 100755 --- a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/driver-pod-template.yaml +++ b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/driver-pod-template.yaml @@ -11,8 +11,7 @@ spec: type: Directory nodeSelector: - NodeGroupType: "SparkComputeOptimized" -# topology.kubernetes.io/zone: "us-west-2a" + provisioner: spark-compute-optimized containers: - name: spark-kubernetes-driver # Don't change this name. EMR on EKS looking for this name @@ -21,7 +20,7 @@ spec: mountPath: /data1 readOnly: false - tolerations: - - key: "spark-compute-optimized" - operator: "Exists" - effect: "NoSchedule" + # tolerations: + # - key: "spark-compute-optimized" + # operator: "Exists" + # effect: "NoSchedule" diff --git a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/execute_emr_eks_job.sh b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/execute_emr_eks_job.sh index 9420770cd..888f7ece2 100755 --- a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/execute_emr_eks_job.sh +++ b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/execute_emr_eks_job.sh @@ -1,16 +1,22 @@ #!/bin/bash read -p "Enter EMR Virtual Cluster AWS Region: " AWS_REGION -read -p "Enter the EMR Virtual Cluster ID: " EMR_VIRTUAL_CLUSTER_ID -read -p "Enter the EMR Execution Role ARN: " EMR_EXECUTION_ROLE_ARN -read -p "Enter the CloudWatch Log Group name: " CLOUDWATCH_LOG_GROUP +# read -p "Enter the EMR Virtual Cluster ID: " EMR_VIRTUAL_CLUSTER_ID +# read -p "Enter the EMR Execution Role ARN: " EMR_EXECUTION_ROLE_ARN +# read -p "Enter the CloudWatch Log Group name: " CLOUDWATCH_LOG_GROUP read -p "Enter the S3 Bucket for storing PySpark Scripts, Pod Templates and Input data. For e.g., s3://: " S3_BUCKET +cp ../../../terraform.tfstate . +EMR_VIRTUAL_CLUSTER_ID=$(terraform output -json emr_on_eks | jq -r '."data-team-a".virtual_cluster_id') +EMR_EXECUTION_ROLE_ARN=$(terraform output -json emr_on_eks | jq -r '."data-team-a".job_execution_role_arn') +CLOUDWATCH_LOG_GROUP=$(terraform output -json emr_on_eks | jq -r '."data-team-a".cloudwatch_log_group_name') +rm terraform.tfstate + #-------------------------------------------- # DEFAULT VARIABLES CAN BE MODIFIED #-------------------------------------------- JOB_NAME='taxidata' -EMR_EKS_RELEASE_LABEL="emr-6.8.0-latest" # Spark 3.2.1 +EMR_EKS_RELEASE_LABEL="emr-6.10.0-latest" # Spark 3.3.1 SPARK_JOB_S3_PATH="${S3_BUCKET}/${EMR_VIRTUAL_CLUSTER_ID}/${JOB_NAME}" SCRIPTS_S3_PATH="${SPARK_JOB_S3_PATH}/scripts" @@ -20,6 +26,7 @@ OUTPUT_DATA_S3_PATH="${SPARK_JOB_S3_PATH}/output" #-------------------------------------------- # Copy PySpark Scripts, Pod Templates and Input data to S3 bucket #-------------------------------------------- +echo ${SCRIPTS_S3_PATH} aws s3 sync "./" ${SCRIPTS_S3_PATH} #-------------------------------------------- @@ -32,7 +39,7 @@ mkdir -p "../input" wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "../input/yellow_tripdata_2022-0.parquet" # Making duplicate copies to increase the size of the data. 
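The pod-template changes above switch the driver and executor pods from a managed-node-group selector (`NodeGroupType`) to the Karpenter provisioner label, and comment out the tolerations because the provisioner's taint is removed later in this diff. A condensed, hedged sketch of an EMR on EKS driver pod template following that pattern (the hostPath path and selector value mirror the other examples in this diff and are assumptions to adjust for your cluster):

```yaml
apiVersion: v1
kind: Pod
spec:
  volumes:
    - name: spark-local-dir-1
      hostPath:
        path: /local1          # NVMe-backed mount created by the provisioner user data (per repo examples)
        type: Directory
  nodeSelector:
    provisioner: spark-compute-optimized   # must match the Karpenter provisioner label
  containers:
    - name: spark-kubernetes-driver        # EMR on EKS expects exactly this container name
      volumeMounts:
        - name: spark-local-dir-1
          mountPath: /data1
          readOnly: false
  # tolerations are only required if the provisioner still taints its nodes, e.g.:
  # tolerations:
  #   - key: "spark-compute-optimized"
  #     operator: "Exists"
  #     effect: "NoSchedule"
```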
-max=10 +max=100 for (( i=1; i <= $max; ++i )) do cp -rf "../input/yellow_tripdata_2022-0.parquet" "../input/yellow_tripdata_2022-${i}.parquet" @@ -57,7 +64,7 @@ aws emr-containers start-job-run \ "entryPointArguments": ["'"$INPUT_DATA_S3_PATH"'", "'"$OUTPUT_DATA_S3_PATH"'" ], - "sparkSubmitParameters": "--conf spark.executor.instances=2" + "sparkSubmitParameters": "--conf spark.executor.instances=10" } }' \ --configuration-overrides '{ @@ -72,8 +79,12 @@ aws emr-containers start-job-run \ "spark.kubernetes.driver.podTemplateFile":"'"$SCRIPTS_S3_PATH"'/driver-pod-template.yaml", "spark.kubernetes.executor.podTemplateFile":"'"$SCRIPTS_S3_PATH"'/executor-pod-template.yaml", "spark.local.dir":"/data1", - + "spark.kubernetes.submission.connectionTimeout": "60000000", + "spark.kubernetes.submission.requestTimeout": "60000000", + "spark.kubernetes.driver.connectionTimeout": "60000000", + "spark.kubernetes.driver.requestTimeout": "60000000", "spark.kubernetes.executor.podNamePrefix":"'"$JOB_NAME"'", + "spark.metrics.appStatusSource.enabled":"true", "spark.ui.prometheus.enabled":"true", "spark.executor.processTreeMetrics.enabled":"true", "spark.kubernetes.driver.annotation.prometheus.io/scrape":"true", diff --git a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/executor-pod-template.yaml b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/executor-pod-template.yaml index 58cbc01db..13dfb5ed2 100755 --- a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/executor-pod-template.yaml +++ b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-compute-provisioner/executor-pod-template.yaml @@ -12,7 +12,7 @@ spec: type: Directory nodeSelector: - NodeGroupType: "SparkComputeOptimized" + provisioner: spark-compute-optimized # topology.kubernetes.io/zone: "us-west-2a" containers: @@ -22,7 +22,7 @@ spec: mountPath: /data1 readOnly: false - tolerations: - - key: "spark-compute-optimized" - operator: "Exists" - effect: "NoSchedule" + # tolerations: + # - key: "spark-compute-optimized" + # operator: "Exists" + # effect: "NoSchedule" diff --git a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-graviton-memory-provisioner/execute_emr_eks_job.sh b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-graviton-memory-provisioner/execute_emr_eks_job.sh index 9420770cd..46b9248db 100755 --- a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-graviton-memory-provisioner/execute_emr_eks_job.sh +++ b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-graviton-memory-provisioner/execute_emr_eks_job.sh @@ -74,6 +74,7 @@ aws emr-containers start-job-run \ "spark.local.dir":"/data1", "spark.kubernetes.executor.podNamePrefix":"'"$JOB_NAME"'", + "spark.metrics.appStatusSource.enabled":"true", "spark.ui.prometheus.enabled":"true", "spark.executor.processTreeMetrics.enabled":"true", "spark.kubernetes.driver.annotation.prometheus.io/scrape":"true", diff --git a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-memory-provisioner/execute_emr_eks_job.sh b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-memory-provisioner/execute_emr_eks_job.sh index 9420770cd..46b9248db 100755 --- a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-memory-provisioner/execute_emr_eks_job.sh +++ b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-memory-provisioner/execute_emr_eks_job.sh @@ -74,6 +74,7 
@@ aws emr-containers start-job-run \ "spark.local.dir":"/data1", "spark.kubernetes.executor.podNamePrefix":"'"$JOB_NAME"'", + "spark.metrics.appStatusSource.enabled":"true", "spark.ui.prometheus.enabled":"true", "spark.executor.processTreeMetrics.enabled":"true", "spark.kubernetes.driver.annotation.prometheus.io/scrape":"true", diff --git a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-yunikorn-gangscheduling/execute_emr_eks_job.sh b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-yunikorn-gangscheduling/execute_emr_eks_job.sh index 9420770cd..46b9248db 100755 --- a/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-yunikorn-gangscheduling/execute_emr_eks_job.sh +++ b/analytics/terraform/emr-eks-karpenter/examples/nvme-ssd/karpenter-yunikorn-gangscheduling/execute_emr_eks_job.sh @@ -74,6 +74,7 @@ aws emr-containers start-job-run \ "spark.local.dir":"/data1", "spark.kubernetes.executor.podNamePrefix":"'"$JOB_NAME"'", + "spark.metrics.appStatusSource.enabled":"true", "spark.ui.prometheus.enabled":"true", "spark.executor.processTreeMetrics.enabled":"true", "spark.kubernetes.driver.annotation.prometheus.io/scrape":"true", diff --git a/analytics/terraform/emr-eks-karpenter/helm-values/kube-prometheus-amp-enable.yaml b/analytics/terraform/emr-eks-karpenter/helm-values/kube-prometheus-amp-enable.yaml index b54e77987..065ef52f6 100644 --- a/analytics/terraform/emr-eks-karpenter/helm-values/kube-prometheus-amp-enable.yaml +++ b/analytics/terraform/emr-eks-karpenter/helm-values/kube-prometheus-amp-enable.yaml @@ -5,6 +5,8 @@ prometheus: annotations: eks.amazonaws.com/role-arn: ${amp_irsa} prometheusSpec: + serviceMonitorSelectorNilUsesHelmValues: false + podMonitorSelectorNilUsesHelmValues: false remoteWrite: - url: ${amp_remotewrite_url} sigv4: @@ -39,10 +41,6 @@ prometheus: - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] action: keep regex: true - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] - action: replace - target_label: __scheme__ - regex: (https?) 
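The relabel rules added to these scrape configs expose the pod and host IPs as `pod_ip` and `instance` labels. The dashboard panels earlier in this diff rely on the same idea: node-exporter's `instance` label (`<node-ip>:<port>`) is rewritten into a `host_ip` label so it can be intersected with `kube_pod_info`, which already carries `host_ip`. A minimal sketch of that join written as a Prometheus recording rule (the rule name is illustrative and not part of this change):

```yaml
groups:
  - name: spark-node-disk-io
    rules:
      - record: node:disk_read_bytes:rate5m_by_host_ip
        expr: |
          label_replace(
            sum(rate(node_disk_read_bytes_total[5m])) by (instance),
            "host_ip", "$1", "instance", "(.+):.*"
          )
          and on (host_ip)
          kube_pod_info
```

In the dashboard the right-hand side is additionally filtered with `created_by_name="$Pod"`, so only the nodes running the selected job's executors survive the intersection.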
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ @@ -59,10 +57,7 @@ prometheus: target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_service_name] action: replace - target_label: kubernetes_name - - source_labels: [__meta_kubernetes_service_name] - action: drop - regex: 'node-exporter' + target_label: service - job_name: 'kubernetes-pods' kubernetes_sd_configs: - role: pod @@ -86,7 +81,13 @@ prometheus: target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_pod_name] action: replace - target_label: kubernetes_pod_name + target_label: pod + - source_labels: [__meta_kubernetes_pod_ip] + action: replace + target_label: pod_ip + - source_labels: [__meta_kubernetes_pod_host_ip] + action: replace + target_label: instance - job_name: kubecost honor_labels: true scrape_interval: 1m @@ -113,6 +114,7 @@ alertmanager: enabled: false grafana: + enabled: true additionalDataSources: - name: amazon-managed-prometheus type: prometheus @@ -122,7 +124,7 @@ grafana: jsonData: sigV4Auth: true sigV4Region: ${region} - enabled: true + sigV4AuthType: "default" defaultDashboardsEnabled: true # Adding Amazon Managed Prometheus datasource to Grafana config serviceAccount: diff --git a/analytics/terraform/emr-eks-karpenter/helm-values/kube-prometheus.yaml b/analytics/terraform/emr-eks-karpenter/helm-values/kube-prometheus.yaml index fcea9cb17..6ccc48e1f 100644 --- a/analytics/terraform/emr-eks-karpenter/helm-values/kube-prometheus.yaml +++ b/analytics/terraform/emr-eks-karpenter/helm-values/kube-prometheus.yaml @@ -1,5 +1,7 @@ prometheus: prometheusSpec: + serviceMonitorSelectorNilUsesHelmValues: false + podMonitorSelectorNilUsesHelmValues: false retention: 5h scrapeInterval: 30s evaluationInterval: 30s @@ -17,6 +19,60 @@ prometheus: storage: 50Gi # Scrape Cost metrics for Kubecost and Yunikorn add-ons additionalScrapeConfigs: + - job_name: 'kubernetes-service-endpoints' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: service + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + - source_labels: 
[__meta_kubernetes_pod_ip] + action: replace + target_label: pod_ip + - source_labels: [__meta_kubernetes_pod_host_ip] + action: replace + target_label: instance - job_name: kubecost honor_labels: true scrape_interval: 1m diff --git a/analytics/terraform/emr-eks-karpenter/helm-values/spark-operator-values.yaml b/analytics/terraform/emr-eks-karpenter/helm-values/spark-operator-values.yaml new file mode 100644 index 000000000..be0aa86c1 --- /dev/null +++ b/analytics/terraform/emr-eks-karpenter/helm-values/spark-operator-values.yaml @@ -0,0 +1,42 @@ +replicaCount: 1 + +webhook: + # -- Enable webhook server + enable: true + # -- Webhook service port + port: 8080 + +# -- Set this if running spark jobs in a different namespace than the operator +#sparkJobNamespace: "spark-team-a" + +# -- Operator concurrency, higher values might increase memory usage +controllerThreads: 10 + +# resources -- Pod resource requests and limits +# Note, that each job submission will spawn a JVM within the Spark Operator Pod using "/usr/local/openjdk-11/bin/java -Xmx128m". +# Kubernetes may kill these Java processes at will to enforce resource limits. When that happens, you will see the following error: +# 'failed to run spark-submit for SparkApplication [...]: signal: killed' - when this happens, you may want to increase memory limits. +resources: + limits: + cpu: 200m + memory: 1Gi + requests: + cpu: 100m + memory: 512Mi + +batchScheduler: + # -- Enable batch scheduler for spark jobs scheduling. If enabled, users can specify batch scheduler name in spark application + enable: true + +#------------------------------------ +# THIS WILL CREATE SERVICE AND INGRESS OBJECT FOR EACH SPARK APPLICATION +#------------------------------------ +#uiService: +## # -- Enable UI service creation for Spark application +# enable: true +### -- Ingress URL format. +### Requires the UI service to be enabled by setting `uiService.enable` to true. +### 1/ Enable ingressUrlFormat to create an Ingress object for each Spark Job submitted using Spark Operator +### 2/ This setup also requires ingres-nginx to be deployed with NLB as LB with IP based routing. +### 3. 
Enter the NLB DNS name or enter Custom Domain name from route53 below which points to the NLB +#ingressUrlFormat: '/{{$appName}}' diff --git a/analytics/terraform/emr-eks-karpenter/install.sh b/analytics/terraform/emr-eks-karpenter/install.sh index 3a863f993..b87db5117 100755 --- a/analytics/terraform/emr-eks-karpenter/install.sh +++ b/analytics/terraform/emr-eks-karpenter/install.sh @@ -7,10 +7,6 @@ terraform init || echo "\"terraform init\" failed" targets=( "module.vpc" "module.eks" - "module.ebs_csi_driver_irsa" - "module.eks_blueprints_addons" - "module.eks_data_addons" - "module.emr_containers" ) # Apply modules in sequence diff --git a/analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-compute-optimized-provisioner.yaml b/analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-compute-optimized-provisioner.yaml index 7cc754c23..b3af9c0a4 100644 --- a/analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-compute-optimized-provisioner.yaml +++ b/analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-compute-optimized-provisioner.yaml @@ -17,23 +17,18 @@ spec: values: ["spot", "on-demand"] - key: "node.kubernetes.io/instance-type" #If not included, all instance types are considered operator: In - values: ["c5d.large","c5d.xlarge","c5d.2xlarge","c5d.4xlarge","c5d.9xlarge"] # 1 NVMe disk + values: ["c5d.xlarge","c5d.2xlarge","c5d.4xlarge","c5d.9xlarge"] # 1 NVMe disk - key: "kubernetes.io/arch" operator: In values: ["amd64"] limits: resources: - cpu: 1000 + cpu: 2000 providerRef: name: spark-compute-optimized labels: type: karpenter provisioner: spark-compute-optimized - NodeGroupType: SparkComputeOptimized - taints: - - key: spark-compute-optimized - value: 'true' - effect: NoSchedule ttlSecondsAfterEmpty: 120 # optional, but never scales down if not set --- @@ -99,7 +94,8 @@ spec: mkdir -p /local1 echo $${TARGET_DEV} /local1 xfs defaults,noatime 1 2 >> /etc/fstab mount -a - /usr/bin/chown -hR +999:+1000 /local1 + # NOTE: Update permissions on folder according to your needs and specific user group. This is just an example. + chmod 777 -R /local* fi --BOUNDARY-- diff --git a/analytics/terraform/emr-eks-karpenter/main.tf b/analytics/terraform/emr-eks-karpenter/main.tf index 78d4b34cf..7dc9afbd4 100644 --- a/analytics/terraform/emr-eks-karpenter/main.tf +++ b/analytics/terraform/emr-eks-karpenter/main.tf @@ -75,6 +75,7 @@ module "eks" { cluster_version = var.eks_cluster_version # if true, Your cluster API server is accessible from the internet. You can, optionally, limit the CIDR blocks that can access the public endpoint. + #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing. 
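The provisioner changes above raise the CPU limit, narrow the instance types, and drop both the `NodeGroupType` label and the `NoSchedule` taint, which is why the pod templates in this diff now rely only on the `provisioner: spark-compute-optimized` nodeSelector and comment out their tolerations. A condensed view of the provisioner after this change (header fields assumed from the repo's Karpenter v1alpha5 provisioners; requirements omitted for brevity):

```yaml
apiVersion: karpenter.sh/v1alpha5
kind: Provisioner
metadata:
  name: spark-compute-optimized
spec:
  labels:
    type: karpenter
    provisioner: spark-compute-optimized   # targeted by the pod nodeSelector shown earlier
  # taints:                                # if re-enabled, the Spark pods need their tolerations back
  #   - key: spark-compute-optimized
  #     value: 'true'
  #     effect: NoSchedule
  limits:
    resources:
      cpu: 2000
  providerRef:
    name: spark-compute-optimized
  ttlSecondsAfterEmpty: 120
```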
cluster_endpoint_public_access = true vpc_id = module.vpc.vpc_id diff --git a/analytics/terraform/emr-eks-karpenter/variables.tf b/analytics/terraform/emr-eks-karpenter/variables.tf index b14db61c0..58524f083 100644 --- a/analytics/terraform/emr-eks-karpenter/variables.tf +++ b/analytics/terraform/emr-eks-karpenter/variables.tf @@ -36,7 +36,7 @@ variable "secondary_cidr_blocks" { variable "enable_vpc_endpoints" { description = "Enable VPC Endpoints" - type = string + type = bool default = false } diff --git a/analytics/terraform/emr-eks-karpenter/vpc.tf b/analytics/terraform/emr-eks-karpenter/vpc.tf index e31903e36..c5c309595 100644 --- a/analytics/terraform/emr-eks-karpenter/vpc.tf +++ b/analytics/terraform/emr-eks-karpenter/vpc.tf @@ -13,6 +13,9 @@ locals { #--------------------------------------------------------------- # VPC #--------------------------------------------------------------- +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. +# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" diff --git a/analytics/terraform/spark-k8s-operator/README.md b/analytics/terraform/spark-k8s-operator/README.md index f11861b2b..b6d17499e 100644 --- a/analytics/terraform/spark-k8s-operator/README.md +++ b/analytics/terraform/spark-k8s-operator/README.md @@ -1,5 +1,5 @@ # Spark on K8s Operator with EKS -Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/data-analytics/spark-operator-yunikorn) to deploy this pattern and run sample tests. +Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/blueprints/data-analytics/spark-operator-yunikorn) to deploy this pattern and run sample tests. ## Requirements @@ -73,7 +73,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.26"` | no | | [eks\_data\_plane\_subnet\_secondary\_cidr](#input\_eks\_data\_plane\_subnet\_secondary\_cidr) | Secondary CIDR blocks. 32766 IPs per Subnet per Subnet/AZ for EKS Node and Pods | `list(string)` |
[
["100.64.0.0/17", "100.64.128.0/17"]
"100.64.0.0/17",
"100.64.128.0/17"
]
| no | | [enable\_amazon\_prometheus](#input\_enable\_amazon\_prometheus) | Enable AWS Managed Prometheus service | `bool` | `true` | no | -| [enable\_vpc\_endpoints](#input\_enable\_vpc\_endpoints) | Enable VPC Endpoints | `string` | `false` | no | +| [enable\_vpc\_endpoints](#input\_enable\_vpc\_endpoints) | Enable VPC Endpoints | `bool` | `false` | no | | [enable\_yunikorn](#input\_enable\_yunikorn) | Enable Apache YuniKorn Scheduler | `bool` | `true` | no | | [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"spark-operator-doeks"` | no | | [private\_subnets](#input\_private\_subnets) | Private Subnets CIDRs. 254 IPs per Subnet/AZ for Private NAT + NLB + Airflow + EC2 Jumphost etc. | `list(string)` |
[
["10.1.1.0/24", "10.1.2.0/24"]
"10.1.1.0/24",
"10.1.2.0/24"
]
| no | diff --git a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-1t.yaml b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-1t.yaml index 4d30b5706..4fbcb1d68 100644 --- a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-1t.yaml +++ b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-1t.yaml @@ -1,6 +1,6 @@ # NOTE: This example requires the following prerequisites before executing the jobs # 1. Ensure spark-team-a name space exists -# 2. replace with your bucket name +# 2. replace with your bucket name # 3. Ensure you run "analytics/spark-k8s-operator/spark-samples/tpcds-benchmark-data-generation-1t.yaml" which generates 3 TB input data --- @@ -12,22 +12,22 @@ metadata: spec: type: Scala mode: cluster - image: ghcr.io/aws-samples/emr-on-eks-benchmark:3.1.2 + image: public.ecr.aws/data-on-eks/emr-on-eks-benchmark:3.1.2 imagePullPolicy: IfNotPresent sparkVersion: 3.1.2 mainClass: com.amazonaws.eks.tpcds.BenchmarkSQL mainApplicationFile: local:///opt/spark/examples/jars/eks-spark-benchmark-assembly-1.0.jar arguments: # TPC-DS data location - - "s3a:///TPCDS-TEST-3T" + - "s3://blogpost-sparkoneks-us-east-1/blog/BLOG_TPCDS-TEST-3T-partitioned" # results location - - "s3a:///TPCDS-TEST-3T-RESULT" + - "s3:///TPCDS-TEST-3T-RESULT" # Path to kit in the docker image - "/opt/tpcds-kit/tools" # Data Format - "parquet" # Scale factor (in GB) - - "3000" + - "3000" # changed from 3000 to 100gb for demo # Number of iterations - "1" # Optimize queries with hive tables @@ -46,7 +46,7 @@ spec: "spark.sql.adaptive.skewJoin.enabled": "true" # "spark.sql.adaptive.logLevel": "WARN" # IRSA for S3 connection - "spark.kubernetes.executor.podNamePrefix": "oss-spark-tpcds-2g1000" + "spark.kubernetes.executor.podNamePrefix": "benchmark-exec" "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider" "spark.hadoop.fs.s3.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem" "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2" @@ -54,49 +54,68 @@ spec: # Keep pods in a single AZ # "spark.kubernetes.node.selector.topology.kubernetes.io/zone": "us-west-1b" # "spark.kubernetes.node.selector.eks.amazonaws.com/capacityType": "ON_DEMAND" + # ----------------------------------------------------- + # This block is very critical when you get errors like + # Exception in thread \"main\" io.fabric8.kubernetes.client.KubernetesClientException: An error has occurred + # Caused by: java.net.SocketTimeoutException: timeout + # spark.kubernetes.local.dirs.tmpfs: "true" # More details here https://spark.apache.org/docs/latest/running-on-kubernetes.html#using-ram-for-local-storage + spark.kubernetes.submission.connectionTimeout: "120000" # milliseconds + spark.kubernetes.submission.requestTimeout: "120000" + spark.kubernetes.driver.connectionTimeout: "120000" + spark.kubernetes.driver.requestTimeout: "120000" + # spark.kubernetes.allocation.batch.size: "20" # default 5 but adjust according to your cluster size + # ----------------------------------------------------- + volumes: + - name: spark-local-dir-1 + hostPath: + path: /local1 driver: - volumeMounts: # Points to InstanceStore NVMe SSD for shuffle spill over from memory + volumeMounts: - name: spark-local-dir-1 - mountPath: /data1 + mountPath: /ossdata1 readOnly: false initContainers: - - name: volume-permissions - image: public.ecr.aws/y4g4v0z7/busybox - command: [ 'sh', '-c', 'chown -R 185 /mnt/k8s-disks' ] - volumeMounts: - - 
mountPath: "/mnt/k8s-disks" - name: "spark-local-dir-1" + - name: volume-permission + image: public.ecr.aws/y4g4v0z7/busybox + command: ['sh', '-c', 'mkdir /ossdata1; chown -R 1000:1000 /ossdata1'] + volumeMounts: + - name: spark-local-dir-1 + mountPath: /ossdata1 cores: 4 coreLimit: "4.1" memory: "5g" memoryOverhead: "1000" serviceAccount: spark-team-a nodeSelector: - NodeGroupType: "SparkMemoryOptimized" - karpenter.sh/capacity-type: on-demand + provisioner: spark-compute-optimized tolerations: - - key: "spark-memory-optimized" + - key: "spark-compute-optimized" operator: "Exists" effect: "NoSchedule" executor: volumeMounts: - name: spark-local-dir-1 - mountPath: /data1 + mountPath: /ossdata1 readOnly: false + initContainers: + - name: volume-permission + image: public.ecr.aws/y4g4v0z7/busybox + command: ['sh', '-c', 'mkdir /ossdata1; chown -R 1000:1000 /ossdata1'] + volumeMounts: + - name: spark-local-dir-1 + mountPath: /ossdata1 cores: 4 coreLimit: "4.3" memory: "6g" memoryOverhead: "2g" # 8 executors per node - instances: 47 + instances: 47 # changed from 47 to 20 for demo serviceAccount: spark-team-a nodeSelector: - NodeGroupType: "SparkMemoryOptimized" - karpenter.sh/capacity-type: on-demand + provisioner: spark-compute-optimized tolerations: - - key: "spark-memory-optimized" + - key: "spark-compute-optimized" operator: "Exists" effect: "NoSchedule" - restartPolicy: type: Never diff --git a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-data-generation-1t.yaml b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-data-generation-1t.yaml index 4ff304f8b..576850a85 100644 --- a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-data-generation-1t.yaml +++ b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-data-generation-1t.yaml @@ -1,6 +1,6 @@ # NOTE: This example requires the following prerequisites before executing the jobs # 1. Ensure spark-team-a name space exists -# 2. replace with your bucket name +# 2. 
replace with your bucket name --- apiVersion: "sparkoperator.k8s.io/v1beta2" @@ -11,14 +11,14 @@ metadata: spec: type: Scala mode: cluster - image: ghcr.io/aws-samples/emr-on-eks-benchmark:3.1.2 + image: public.ecr.aws/data-on-eks/emr-on-eks-benchmark:3.1.2 imagePullPolicy: IfNotPresent sparkVersion: 3.1.2 mainClass: com.amazonaws.eks.tpcds.DataGeneration mainApplicationFile: local:///opt/spark/examples/jars/eks-spark-benchmark-assembly-1.0.jar arguments: # TPC-DS data location - - "s3a:///TPCDS-TEST-3T" + - "s3a:///TPCDS-TEST-3T" # Path to kit in the docker image - "/opt/tpcds-kit/tools" # Data Format @@ -39,6 +39,7 @@ spec: "spark.kubernetes.memoryOverheadFactor": "0.3" "spark.sql.files.maxRecordsPerFile": "30000000" "spark.serializer": "org.apache.spark.serializer.KryoSerializer" + # "spark.local.dir": "/data1" # S3 settings "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider" @@ -49,41 +50,58 @@ spec: "spark.kubernetes.executor.podNamePrefix": "oss-data-gen" "spark.executor.defaultJavaOptions": "-verbose:gc -XX:+UseG1GC" "spark.driver.defaultJavaOptions": "-XX:+UseG1GC" + # ----------------------------------------------------- + # This block is very critical when you get errors like + # Exception in thread \"main\" io.fabric8.kubernetes.client.KubernetesClientException: An error has occurred + # Caused by: java.net.SocketTimeoutException: timeout + # spark.kubernetes.local.dirs.tmpfs: "true" + spark.kubernetes.submission.connectionTimeout: "60000000" + spark.kubernetes.submission.requestTimeout: "60000000" + spark.kubernetes.driver.connectionTimeout: "60000000" + spark.kubernetes.driver.requestTimeout: "60000000" + # spark.kubernetes.allocation.batch.size: "20" # default 5 but adjust according to your cluster size + # ----------------------------------------------------- restartPolicy: - type: never - volumes: # using NVMe instance storage mounted on /mnt/k8s-disks + type: Never + volumes: - name: spark-local-dir-1 hostPath: - path: /mnt/k8s-disks - type: Directory + path: /local1 driver: - volumeMounts: # Points to InstanceStore NVMe SSD for shuffle spill over from memory + volumeMounts: - name: spark-local-dir-1 - mountPath: /data1 + mountPath: /ossdata1 readOnly: false initContainers: - - name: volume-permissions - image: public.ecr.aws/y4g4v0z7/busybox - command: [ 'sh', '-c', 'chown -R 185 /mnt/k8s-disks' ] - volumeMounts: - - mountPath: "/mnt/k8s-disks" - name: "spark-local-dir-1" + - name: volume-permission + image: public.ecr.aws/y4g4v0z7/busybox + command: ['sh', '-c', 'mkdir /ossdata1; chown -R 1000:1000 /ossdata1'] + volumeMounts: + - name: spark-local-dir-1 + mountPath: /ossdata1 cores: 10 coreLimit: "10.1" memory: "10g" serviceAccount: spark-team-a nodeSelector: - NodeGroupType: "SparkMemoryOptimized" + provisioner: spark-compute-optimized tolerations: - - key: "spark-memory-optimized" + - key: "spark-compute-optimized" operator: "Exists" effect: "NoSchedule" executor: volumeMounts: - name: spark-local-dir-1 - mountPath: /data1 + mountPath: /ossdata1 readOnly: false + initContainers: + - name: volume-permission + image: public.ecr.aws/y4g4v0z7/busybox + command: ['sh', '-c', 'mkdir /ossdata1; chown -R 1000:1000 /ossdata1'] + volumeMounts: + - name: spark-local-dir-1 + mountPath: /ossdata1 cores: 11 coreLimit: "11.1" memory: "15g" @@ -91,8 +109,8 @@ spec: instances: 26 serviceAccount: spark-team-a nodeSelector: - NodeGroupType: "SparkMemoryOptimized" + provisioner: spark-compute-optimized tolerations: - - key: 
"spark-memory-optimized" + - key: "spark-compute-optimized" operator: "Exists" effect: "NoSchedule" diff --git a/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/taxi-trip-execute.sh b/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/taxi-trip-execute.sh index 31432838b..4ebf25143 100644 --- a/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/taxi-trip-execute.sh +++ b/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/taxi-trip-execute.sh @@ -11,7 +11,7 @@ # Script usage ./taxi-trip-execute my-s3-bucket us-west-2 -if [ $# -ne 3 ]; then +if [ $# -ne 2 ]; then echo "Usage: $0 " exit 1 fi diff --git a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/taxi-trip-execute.sh b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/taxi-trip-execute.sh index 0047ae9da..b3a9dcfb5 100755 --- a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/taxi-trip-execute.sh +++ b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/taxi-trip-execute.sh @@ -11,7 +11,7 @@ # Script usage ./taxi-trip-execute my-s3-bucket us-west-2 -if [ $# -ne 3 ]; then +if [ $# -ne 2 ]; then echo "Usage: $0 " exit 1 fi diff --git a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh index ac6d8ceb6..25707e596 100755 --- a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh +++ b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh @@ -11,7 +11,7 @@ # Script usage ./taxi-trip-execute my-s3-bucket us-west-2 -if [ $# -ne 3 ]; then +if [ $# -ne 2 ]; then echo "Usage: $0 " exit 1 fi diff --git a/analytics/terraform/spark-k8s-operator/helm-values/nginx-values.yaml b/analytics/terraform/spark-k8s-operator/helm-values/nginx-values.yaml index 59a342709..c129611fa 100644 --- a/analytics/terraform/spark-k8s-operator/helm-values/nginx-values.yaml +++ b/analytics/terraform/spark-k8s-operator/helm-values/nginx-values.yaml @@ -4,7 +4,7 @@ controller: annotations: service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4 # service.beta.kubernetes.io/aws-load-balancer-scheme: internal # PRIVATE NLB - service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing # PUBLIC NLB + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Private Load Balancer can only be accessed within the VPC # PUBLIC NLB service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip # service.beta.kubernetes.io/aws-load-balancer-proxy-protocol: "*" # DONT USE THIS EVER WHEN YOU USE IP based routing diff --git a/analytics/terraform/spark-k8s-operator/install.sh b/analytics/terraform/spark-k8s-operator/install.sh index 36ad742b3..18f2a94d3 100755 --- a/analytics/terraform/spark-k8s-operator/install.sh +++ b/analytics/terraform/spark-k8s-operator/install.sh @@ -7,8 +7,6 @@ export AWS_DEFAULT_REGION=$region targets=( "module.vpc" "module.eks" - "module.eks_blueprints_addons" - "module.eks_data_addons" ) # Initialize Terraform diff --git a/analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-compute-optimized-provisioner.yaml 
b/analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-compute-optimized-provisioner.yaml index 1e1364e4a..b93858da8 100644 --- a/analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-compute-optimized-provisioner.yaml +++ b/analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-compute-optimized-provisioner.yaml @@ -17,13 +17,13 @@ spec: values: ["spot", "on-demand"] - key: "node.kubernetes.io/instance-type" #If not included, all instance types are considered operator: In - values: ["c5d.large","c5d.xlarge","c5d.2xlarge","c5d.4xlarge","c5d.9xlarge"] # 1 NVMe disk + values: ["c5d.xlarge","c5d.2xlarge","c5d.4xlarge","c5d.9xlarge"] # 1 NVMe disk - key: "kubernetes.io/arch" operator: In values: ["amd64"] limits: resources: - cpu: 1000 + cpu: 2000 providerRef: name: spark-compute-optimized labels: @@ -68,17 +68,40 @@ spec: --BOUNDARY Content-Type: text/x-shellscript; charset="us-ascii" - cat <<-EOF > /etc/profile.d/bootstrap.sh - #!/bin/sh + #!/bin/bash + echo "Running a custom user data script" + set -ex + yum install mdadm -y - # Configure NVMe volumes in RAID0 configuration - # https://github.com/awslabs/amazon-eks-ami/blob/056e31f8c7477e893424abce468cb32bbcd1f079/files/bootstrap.sh#L35C121-L35C126 - # Mount will be: /mnt/k8s-disks - export LOCAL_DISKS='raid0' - EOF + DEVICES=$(lsblk -o NAME,TYPE -dsn | awk '/disk/ {print $1}') - # Source extra environment variables in bootstrap script - sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + DISK_ARRAY=() + + for DEV in $DEVICES + do + DISK_ARRAY+=("/dev/$${DEV}") + done + + DISK_COUNT=$${#DISK_ARRAY[@]} + + if [ $${DISK_COUNT} -eq 0 ]; then + echo "No SSD disks available. No further action needed." + else + if [ $${DISK_COUNT} -eq 1 ]; then + TARGET_DEV=$${DISK_ARRAY[0]} + mkfs.xfs $${TARGET_DEV} + else + mdadm --create --verbose /dev/md0 --level=0 --raid-devices=$${DISK_COUNT} $${DISK_ARRAY[@]} + mkfs.xfs /dev/md0 + TARGET_DEV=/dev/md0 + fi + + mkdir -p /local1 + echo $${TARGET_DEV} /local1 xfs defaults,noatime 1 2 >> /etc/fstab + mount -a + # NOTE: Update permissions on folder according to your needs and specific user group. This is just an example. + chmod 777 -R /local* + fi --BOUNDARY-- diff --git a/analytics/terraform/spark-k8s-operator/main.tf b/analytics/terraform/spark-k8s-operator/main.tf index aff810f7e..7753dbb96 100755 --- a/analytics/terraform/spark-k8s-operator/main.tf +++ b/analytics/terraform/spark-k8s-operator/main.tf @@ -22,6 +22,7 @@ module "eks" { cluster_name = local.name cluster_version = var.eks_cluster_version + #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing. 
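The rewritten user data above assembles the instance-store NVMe disks into a RAID0 array (or formats a single disk), mounts it at `/local1`, and relaxes permissions so unprivileged Spark containers can write to it. A minimal sketch, trimmed to the storage-related fields, of how the SparkApplication examples in this diff then consume that mount for scratch and shuffle spill (mount path and `spark.local.dir` value mirror the repo examples; adjust to your image's user and job spec):

```yaml
apiVersion: "sparkoperator.k8s.io/v1beta2"
kind: SparkApplication
metadata:
  name: nvme-scratch-example        # illustrative name, not part of this change
  namespace: spark-team-a
spec:
  sparkConf:
    "spark.local.dir": "/data1"     # point Spark scratch space at the NVMe-backed mount
  volumes:
    - name: spark-local-dir-1
      hostPath:
        path: /local1               # created and mounted by the provisioner user data above
  driver:
    volumeMounts:
      - name: spark-local-dir-1
        mountPath: /data1
        readOnly: false
  executor:
    volumeMounts:
      - name: spark-local-dir-1
        mountPath: /data1
        readOnly: false
```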
cluster_endpoint_public_access = true vpc_id = module.vpc.vpc_id diff --git a/analytics/terraform/spark-k8s-operator/variables.tf b/analytics/terraform/spark-k8s-operator/variables.tf index 77e6be1bd..cf808d7d8 100644 --- a/analytics/terraform/spark-k8s-operator/variables.tf +++ b/analytics/terraform/spark-k8s-operator/variables.tf @@ -57,7 +57,7 @@ variable "eks_data_plane_subnet_secondary_cidr" { variable "enable_vpc_endpoints" { description = "Enable VPC Endpoints" default = false - type = string + type = bool } variable "enable_amazon_prometheus" { diff --git a/analytics/terraform/spark-k8s-operator/vpc.tf b/analytics/terraform/spark-k8s-operator/vpc.tf index 4664cde51..21762f37c 100644 --- a/analytics/terraform/spark-k8s-operator/vpc.tf +++ b/analytics/terraform/spark-k8s-operator/vpc.tf @@ -1,6 +1,9 @@ #--------------------------------------------------------------- # Supporting Network Resources #--------------------------------------------------------------- +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. +# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" diff --git a/distributed-databases/cloudnative-postgres/README.md b/distributed-databases/cloudnative-postgres/README.md index a66106977..1ba07da8e 100644 --- a/distributed-databases/cloudnative-postgres/README.md +++ b/distributed-databases/cloudnative-postgres/README.md @@ -18,8 +18,8 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | Name | Version | |------|---------| | [aws](#provider\_aws) | >= 3.72 | -| [helm](#provider\_helm) | >= 2.4.1 | | [kubectl](#provider\_kubectl) | >= 1.14 | +| [kubernetes](#provider\_kubernetes) | >= 2.10 | | [random](#provider\_random) | 3.4.3 | ## Modules @@ -27,32 +27,39 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | Name | Source | Version | |------|--------|---------| | [barman\_backup\_irsa](#module\_barman\_backup\_irsa) | github.com/aws-ia/terraform-aws-eks-blueprints-addons | ed27abc//modules/irsa | -| [barman\_s3\_bucket](#module\_barman\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 | +| [barman\_s3\_bucket](#module\_barman\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.8 | | [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.14 | | [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 | -| [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | github.com/aws-ia/terraform-aws-eks-blueprints-addons | 08650f | +| [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.0 | +| [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.2.0 | | [vpc](#module\_vpc) | terraform-aws-modules/vpc/aws | ~> 5.0 | ## Resources | Name | Type | |------|------| -| [aws_iam_policy.cnpg_buckup_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | -| [helm_release.cloudnative_pg](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource | +| 
[aws_iam_policy.irsa_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_secretsmanager_secret.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret) | resource | +| [aws_secretsmanager_secret_version.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret_version) | resource | | [kubectl_manifest.cnpg_grafana_cm](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [kubectl_manifest.cnpg_prometheus_rule](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | +| [kubernetes_annotations.gp2_default](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/annotations) | resource | +| [kubernetes_storage_class.ebs_csi_encrypted_gp3_storage_class](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/storage_class) | resource | +| [random_password.grafana](https://registry.terraform.io/providers/hashicorp/random/3.4.3/docs/resources/password) | resource | | [random_string.random](https://registry.terraform.io/providers/hashicorp/random/3.4.3/docs/resources/string) | resource | | [aws_ami.eks](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | | [aws_availability_zones.available](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/availability_zones) | data source | +| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | | [aws_eks_cluster_auth.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster_auth) | data source | -| [aws_iam_policy_document.cnpg_backup](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.irsa_backup_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_secretsmanager_secret_version.admin_password_version](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret_version) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.25"` | no | -| [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"cnpg-on-eks"` | no | +| [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"cnpg"` | no | | [region](#input\_region) | Region | `string` | `"us-west-2"` | no | | [vpc\_cidr](#input\_vpc\_cidr) | VPC CIDR | `string` | `"10.1.0.0/16"` | no | diff --git a/distributed-databases/cloudnative-postgres/addons.tf b/distributed-databases/cloudnative-postgres/addons.tf index 9fba3fd4f..053dd8949 100644 --- a/distributed-databases/cloudnative-postgres/addons.tf +++ b/distributed-databases/cloudnative-postgres/addons.tf @@ -1,10 +1,48 @@ +#--------------------------------------------------------------- +# GP3 Encrypted Storage Class +#--------------------------------------------------------------- +resource "kubernetes_annotations" "gp2_default" { + annotations = { + "storageclass.kubernetes.io/is-default-class" : "false" + } + api_version = "storage.k8s.io/v1" 
+ kind = "StorageClass" + metadata { + name = "gp2" + } + force = true + + depends_on = [module.eks] +} + +resource "kubernetes_storage_class" "ebs_csi_encrypted_gp3_storage_class" { + metadata { + name = "gp3" + annotations = { + "storageclass.kubernetes.io/is-default-class" : "true" + } + } + + storage_provisioner = "ebs.csi.aws.com" + reclaim_policy = "Delete" + allow_volume_expansion = true + volume_binding_mode = "WaitForFirstConsumer" + parameters = { + fsType = "xfs" + encrypted = true + type = "gp3" + } + + depends_on = [kubernetes_annotations.gp2_default] +} + module "eks_blueprints_addons" { - source = "github.com/aws-ia/terraform-aws-eks-blueprints-addons?ref=08650f" + source = "aws-ia/eks-blueprints-addons/aws" + version = "~> 1.0" #ensure to update this to the latest/desired version cluster_name = module.eks.cluster_name cluster_endpoint = module.eks.cluster_endpoint cluster_version = module.eks.cluster_version - oidc_provider = module.eks.oidc_provider oidc_provider_arn = module.eks.oidc_provider_arn #--------------------------------------- @@ -24,16 +62,64 @@ module "eks_blueprints_addons" { preserve = true } } + enable_kube_prometheus_stack = true - kube_prometheus_stack_helm_config = { - namespace = "monitoring" + kube_prometheus_stack = { + namespace = "monitoring" + name = "prometheus" + chart_version = "48.1.1" + set_sensitive = [ + { + name = "grafana.adminPassword" + value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string + }] + values = [ - file("${path.module}/monitoring/kube-stack-config.yaml") + templatefile("${path.module}/monitoring/kube-stack-config.yaml", { + storage_class_type = kubernetes_storage_class.ebs_csi_encrypted_gp3_storage_class.id, + }) ] } tags = local.tags } + +#--------------------------------------------------------------- +# Data on EKS Kubernetes Addons +#--------------------------------------------------------------- +module "eks_data_addons" { + source = "aws-ia/eks-data-addons/aws" + version = "~> 1.2.0" + + oidc_provider_arn = module.eks.oidc_provider_arn + + #--------------------------------------------------------------- + # CloudNative PG Add-on + #--------------------------------------------------------------- + enable_cnpg_operator = true + cnpg_operator_helm_config = { + namespace = "cnpg-system" + description = "CloudNativePG Operator Helm chart deployment configuration" + set = [ + { + name = "resources.limits.memory" + value = "200Mi" + }, + { + name = "resources.limits.cpu" + value = "100m" + }, + { + name = "resources.requests.cpu" + value = "100m" + }, + { + name = "resources.memory.memory" + value = "100Mi" + } + ] + } +} resource "kubectl_manifest" "cnpg_prometheus_rule" { yaml_body = file("${path.module}/monitoring/cnpg-prometheusrule.yaml") @@ -49,34 +135,3 @@ resource "kubectl_manifest" "cnpg_grafana_cm" { module.eks_blueprints_addons.kube_prometheus_stack ] } - -resource "helm_release" "cloudnative_pg" { - name = local.name - chart = "cloudnative-pg" - repository = "https://cloudnative-pg.github.io/charts" - version = "0.17.0" - namespace = "cnpg-system" - create_namespace = true - description = "CloudNativePG Operator Helm chart deployment configuration" - - set { - name = "resources.limits.cpu" - value = "100m" - } - - set { - name = "resources.limits.memory" - value = "200Mi" - } - - set { - name = "resources.requests.cpu" - value = "100m" - } - - set { - name = "resources.memory.memory" - value = "100Mi" - } - -} diff --git a/distributed-databases/cloudnative-postgres/data.tf 
b/distributed-databases/cloudnative-postgres/data.tf index 417116a63..dfc6223f3 100644 --- a/distributed-databases/cloudnative-postgres/data.tf +++ b/distributed-databases/cloudnative-postgres/data.tf @@ -10,6 +10,9 @@ data "aws_availability_zones" "available" { } } +data "aws_caller_identity" "current" {} + + data "aws_ami" "eks" { owners = ["amazon"] most_recent = true @@ -20,7 +23,7 @@ data "aws_ami" "eks" { } } -data "aws_iam_policy_document" "cnpg_backup" { +data "aws_iam_policy_document" "irsa_backup_policy" { statement { sid = "" effect = "Allow" diff --git a/distributed-databases/cloudnative-postgres/helm-files/values.yaml b/distributed-databases/cloudnative-postgres/helm-files/values.yaml new file mode 100644 index 000000000..0bae37355 --- /dev/null +++ b/distributed-databases/cloudnative-postgres/helm-files/values.yaml @@ -0,0 +1,515 @@ +# +# Copyright The CloudNativePG Contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Default values for CloudNativePG. +# This is a YAML-formatted file. +# Please declare variables to be passed to your templates. + +replicaCount: 1 + +image: + repository: ghcr.io/cloudnative-pg/cloudnative-pg + pullPolicy: IfNotPresent + # -- Overrides the image tag whose default is the chart appVersion. + tag: "" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +crds: + # -- Specifies whether the CRDs should be created when installing the chart. + create: true + +# -- The webhook configuration. +webhook: + port: 9443 + mutating: + create: true + failurePolicy: Fail + validating: + create: true + failurePolicy: Fail + livenessProbe: + initialDelaySeconds: 3 + readinessProbe: + initialDelaySeconds: 3 + +# -- Operator configuration. +config: + # -- Specifies whether the secret should be created. + create: true + # -- The name of the configmap/secret to use. + name: cnpg-controller-manager-config + # -- Specifies whether it should be stored in a secret, instead of a configmap. + secret: false + # -- The content of the configmap/secret, see + # https://cloudnative-pg.io/documentation/current/operator_conf/#available-options + # for all the available options. + data: {} + # INHERITED_ANNOTATIONS: categories + # INHERITED_LABELS: environment, workload, app + # WATCH_NAMESPACE: namespace-a,namespace-b + +# -- Additinal arguments to be added to the operator's args list. +additionalArgs: [] + +serviceAccount: + # -- Specifies whether the service account should be created. + create: true + # -- The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template. + name: "" + +rbac: + # -- Specifies whether ClusterRole and ClusterRoleBinding should be created. + create: true + +# -- Annotations to be added to all other resources. +commonAnnotations: {} +# -- Annotations to be added to the pod. +podAnnotations: {} +# -- Labels to be added to the pod. +podLabels: {} + +# -- Container Security Context. 
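+# (Informational: the defaults below run the operator as a non-root user (UID/GID 10001)
+# with a read-only root filesystem and all Linux capabilities dropped.)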
+containerSecurityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsUser: 10001 + runAsGroup: 10001 + capabilities: + drop: + - "ALL" + +# -- Security Context for the whole pod. +podSecurityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + # fsGroup: 2000 + +# -- Priority indicates the importance of a Pod relative to other Pods. +priorityClassName: "" + +service: + type: ClusterIP + # -- DO NOT CHANGE THE SERVICE NAME as it is currently used to generate the certificate + # and can not be configured + name: cnpg-webhook-service + port: 443 + +resources: + {} + # If you want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # + # limits: + # cpu: 100m + # memory: 200Mi + # requests: + # cpu: 100m + # memory: 100Mi + +# -- Nodeselector for the operator to be installed. +nodeSelector: {} + +# -- Tolerations for the operator to be installed. +tolerations: [] + +# -- Affinity for the operator to be installed. +affinity: {} + +monitoring: + # -- Specifies whether the monitoring should be enabled. Requires Prometheus Operator CRDs. + podMonitorEnabled: false + +# Default monitoring queries +monitoringQueriesConfigMap: + # -- The name of the default monitoring configmap. + name: cnpg-default-monitoring + # -- A string representation of a YAML defining monitoring queries. + queries: | + backends: + query: | + SELECT sa.datname + , sa.usename + , sa.application_name + , states.state + , COALESCE(sa.count, 0) AS total + , COALESCE(sa.max_tx_secs, 0) AS max_tx_duration_seconds + FROM ( VALUES ('active') + , ('idle') + , ('idle in transaction') + , ('idle in transaction (aborted)') + , ('fastpath function call') + , ('disabled') + ) AS states(state) + LEFT JOIN ( + SELECT datname + , state + , usename + , COALESCE(application_name, '') AS application_name + , COUNT(*) + , COALESCE(EXTRACT (EPOCH FROM (max(now() - xact_start))), 0) AS max_tx_secs + FROM pg_catalog.pg_stat_activity + GROUP BY datname, state, usename, application_name + ) sa ON states.state = sa.state + WHERE sa.usename IS NOT NULL + metrics: + - datname: + usage: "LABEL" + description: "Name of the database" + - usename: + usage: "LABEL" + description: "Name of the user" + - application_name: + usage: "LABEL" + description: "Name of the application" + - state: + usage: "LABEL" + description: "State of the backend" + - total: + usage: "GAUGE" + description: "Number of backends" + - max_tx_duration_seconds: + usage: "GAUGE" + description: "Maximum duration of a transaction in seconds" + + backends_waiting: + query: | + SELECT count(*) AS total + FROM pg_catalog.pg_locks blocked_locks + JOIN pg_catalog.pg_locks blocking_locks + ON blocking_locks.locktype = blocked_locks.locktype + AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database + AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation + AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page + AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple + AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid + AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid + AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid + AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid + AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid + AND blocking_locks.pid != blocked_locks.pid + JOIN pg_catalog.pg_stat_activity 
blocking_activity ON blocking_activity.pid = blocking_locks.pid + WHERE NOT blocked_locks.granted + metrics: + - total: + usage: "GAUGE" + description: "Total number of backends that are currently waiting on other queries" + + pg_database: + query: | + SELECT datname + , pg_catalog.pg_database_size(datname) AS size_bytes + , pg_catalog.age(datfrozenxid) AS xid_age + , pg_catalog.mxid_age(datminmxid) AS mxid_age + FROM pg_catalog.pg_database + metrics: + - datname: + usage: "LABEL" + description: "Name of the database" + - size_bytes: + usage: "GAUGE" + description: "Disk space used by the database" + - xid_age: + usage: "GAUGE" + description: "Number of transactions from the frozen XID to the current one" + - mxid_age: + usage: "GAUGE" + description: "Number of multiple transactions (Multixact) from the frozen XID to the current one" + + pg_postmaster: + query: | + SELECT EXTRACT(EPOCH FROM pg_postmaster_start_time) AS start_time + FROM pg_catalog.pg_postmaster_start_time() + metrics: + - start_time: + usage: "GAUGE" + description: "Time at which postgres started (based on epoch)" + + pg_replication: + query: "SELECT CASE WHEN NOT pg_catalog.pg_is_in_recovery() + THEN 0 + ELSE GREATEST (0, + EXTRACT(EPOCH FROM (now() - pg_catalog.pg_last_xact_replay_timestamp()))) + END AS lag, + pg_catalog.pg_is_in_recovery() AS in_recovery, + EXISTS (TABLE pg_stat_wal_receiver) AS is_wal_receiver_up, + (SELECT count(*) FROM pg_stat_replication) AS streaming_replicas" + metrics: + - lag: + usage: "GAUGE" + description: "Replication lag behind primary in seconds" + - in_recovery: + usage: "GAUGE" + description: "Whether the instance is in recovery" + - is_wal_receiver_up: + usage: "GAUGE" + description: "Whether the instance wal_receiver is up" + - streaming_replicas: + usage: "GAUGE" + description: "Number of streaming replicas connected to the instance" + + pg_replication_slots: + query: | + SELECT slot_name, + slot_type, + database, + active, + pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), restart_lsn) + FROM pg_catalog.pg_replication_slots + WHERE NOT temporary + metrics: + - slot_name: + usage: "LABEL" + description: "Name of the replication slot" + - slot_type: + usage: "LABEL" + description: "Type of the replication slot" + - database: + usage: "LABEL" + description: "Name of the database" + - active: + usage: "GAUGE" + description: "Flag indicating whether the slot is active" + - pg_wal_lsn_diff: + usage: "GAUGE" + description: "Replication lag in bytes" + + pg_stat_archiver: + query: | + SELECT archived_count + , failed_count + , COALESCE(EXTRACT(EPOCH FROM (now() - last_archived_time)), -1) AS seconds_since_last_archival + , COALESCE(EXTRACT(EPOCH FROM (now() - last_failed_time)), -1) AS seconds_since_last_failure + , COALESCE(EXTRACT(EPOCH FROM last_archived_time), -1) AS last_archived_time + , COALESCE(EXTRACT(EPOCH FROM last_failed_time), -1) AS last_failed_time + , COALESCE(CAST(CAST('x'||pg_catalog.right(pg_catalog.split_part(last_archived_wal, '.', 1), 16) AS pg_catalog.bit(64)) AS pg_catalog.int8), -1) AS last_archived_wal_start_lsn + , COALESCE(CAST(CAST('x'||pg_catalog.right(pg_catalog.split_part(last_failed_wal, '.', 1), 16) AS pg_catalog.bit(64)) AS pg_catalog.int8), -1) AS last_failed_wal_start_lsn + , EXTRACT(EPOCH FROM stats_reset) AS stats_reset_time + FROM pg_catalog.pg_stat_archiver + metrics: + - archived_count: + usage: "COUNTER" + description: "Number of WAL files that have been successfully archived" + - failed_count: + usage: "COUNTER" + description: "Number of 
failed attempts for archiving WAL files" + - seconds_since_last_archival: + usage: "GAUGE" + description: "Seconds since the last successful archival operation" + - seconds_since_last_failure: + usage: "GAUGE" + description: "Seconds since the last failed archival operation" + - last_archived_time: + usage: "GAUGE" + description: "Epoch of the last time WAL archiving succeeded" + - last_failed_time: + usage: "GAUGE" + description: "Epoch of the last time WAL archiving failed" + - last_archived_wal_start_lsn: + usage: "GAUGE" + description: "Archived WAL start LSN" + - last_failed_wal_start_lsn: + usage: "GAUGE" + description: "Last failed WAL LSN" + - stats_reset_time: + usage: "GAUGE" + description: "Time at which these statistics were last reset" + + pg_stat_bgwriter: + query: | + SELECT checkpoints_timed + , checkpoints_req + , checkpoint_write_time + , checkpoint_sync_time + , buffers_checkpoint + , buffers_clean + , maxwritten_clean + , buffers_backend + , buffers_backend_fsync + , buffers_alloc + FROM pg_catalog.pg_stat_bgwriter + metrics: + - checkpoints_timed: + usage: "COUNTER" + description: "Number of scheduled checkpoints that have been performed" + - checkpoints_req: + usage: "COUNTER" + description: "Number of requested checkpoints that have been performed" + - checkpoint_write_time: + usage: "COUNTER" + description: "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds" + - checkpoint_sync_time: + usage: "COUNTER" + description: "Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds" + - buffers_checkpoint: + usage: "COUNTER" + description: "Number of buffers written during checkpoints" + - buffers_clean: + usage: "COUNTER" + description: "Number of buffers written by the background writer" + - maxwritten_clean: + usage: "COUNTER" + description: "Number of times the background writer stopped a cleaning scan because it had written too many buffers" + - buffers_backend: + usage: "COUNTER" + description: "Number of buffers written directly by a backend" + - buffers_backend_fsync: + usage: "COUNTER" + description: "Number of times a backend had to execute its own fsync call (normally the background writer handles those even when the backend does its own write)" + - buffers_alloc: + usage: "COUNTER" + description: "Number of buffers allocated" + + pg_stat_database: + query: | + SELECT datname + , xact_commit + , xact_rollback + , blks_read + , blks_hit + , tup_returned + , tup_fetched + , tup_inserted + , tup_updated + , tup_deleted + , conflicts + , temp_files + , temp_bytes + , deadlocks + , blk_read_time + , blk_write_time + FROM pg_catalog.pg_stat_database + metrics: + - datname: + usage: "LABEL" + description: "Name of this database" + - xact_commit: + usage: "COUNTER" + description: "Number of transactions in this database that have been committed" + - xact_rollback: + usage: "COUNTER" + description: "Number of transactions in this database that have been rolled back" + - blks_read: + usage: "COUNTER" + description: "Number of disk blocks read in this database" + - blks_hit: + usage: "COUNTER" + description: "Number of times disk blocks were found already in the buffer cache, so that a read was not necessary (this only includes hits in the PostgreSQL buffer cache, not the operating system's file system cache)" + - tup_returned: + usage: "COUNTER" + description: "Number of rows returned by queries in this database" + - 
tup_fetched: + usage: "COUNTER" + description: "Number of rows fetched by queries in this database" + - tup_inserted: + usage: "COUNTER" + description: "Number of rows inserted by queries in this database" + - tup_updated: + usage: "COUNTER" + description: "Number of rows updated by queries in this database" + - tup_deleted: + usage: "COUNTER" + description: "Number of rows deleted by queries in this database" + - conflicts: + usage: "COUNTER" + description: "Number of queries canceled due to conflicts with recovery in this database" + - temp_files: + usage: "COUNTER" + description: "Number of temporary files created by queries in this database" + - temp_bytes: + usage: "COUNTER" + description: "Total amount of data written to temporary files by queries in this database" + - deadlocks: + usage: "COUNTER" + description: "Number of deadlocks detected in this database" + - blk_read_time: + usage: "COUNTER" + description: "Time spent reading data file blocks by backends in this database, in milliseconds" + - blk_write_time: + usage: "COUNTER" + description: "Time spent writing data file blocks by backends in this database, in milliseconds" + + pg_stat_replication: + primary: true + query: | + SELECT usename + , COALESCE(application_name, '') AS application_name + , COALESCE(client_addr::text, '') AS client_addr + , EXTRACT(EPOCH FROM backend_start) AS backend_start + , COALESCE(pg_catalog.age(backend_xmin), 0) AS backend_xmin_age + , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), sent_lsn) AS sent_diff_bytes + , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), write_lsn) AS write_diff_bytes + , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), flush_lsn) AS flush_diff_bytes + , COALESCE(pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), replay_lsn),0) AS replay_diff_bytes + , COALESCE((EXTRACT(EPOCH FROM write_lag)),0)::float AS write_lag_seconds + , COALESCE((EXTRACT(EPOCH FROM flush_lag)),0)::float AS flush_lag_seconds + , COALESCE((EXTRACT(EPOCH FROM replay_lag)),0)::float AS replay_lag_seconds + FROM pg_catalog.pg_stat_replication + metrics: + - usename: + usage: "LABEL" + description: "Name of the replication user" + - application_name: + usage: "LABEL" + description: "Name of the application" + - client_addr: + usage: "LABEL" + description: "Client IP address" + - backend_start: + usage: "COUNTER" + description: "Time when this process was started" + - backend_xmin_age: + usage: "COUNTER" + description: "The age of this standby's xmin horizon" + - sent_diff_bytes: + usage: "GAUGE" + description: "Difference in bytes from the last write-ahead log location sent on this connection" + - write_diff_bytes: + usage: "GAUGE" + description: "Difference in bytes from the last write-ahead log location written to disk by this standby server" + - flush_diff_bytes: + usage: "GAUGE" + description: "Difference in bytes from the last write-ahead log location flushed to disk by this standby server" + - replay_diff_bytes: + usage: "GAUGE" + description: "Difference in bytes from the last write-ahead log location replayed into the database on this standby server" + - write_lag_seconds: + usage: "GAUGE" + description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it" + - flush_lag_seconds: + usage: "GAUGE" + description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it" + - replay_lag_seconds: + usage: "GAUGE" + 
description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it" + + pg_settings: + query: | + SELECT name, + CASE setting WHEN 'on' THEN '1' WHEN 'off' THEN '0' ELSE setting END AS setting + FROM pg_catalog.pg_settings + WHERE vartype IN ('integer', 'real', 'bool') + ORDER BY 1 + metrics: + - name: + usage: "LABEL" + description: "Name of the setting" + - setting: + usage: "GAUGE" + description: "Setting value" diff --git a/distributed-databases/cloudnative-postgres/monitoring/kube-stack-config.yaml b/distributed-databases/cloudnative-postgres/monitoring/kube-stack-config.yaml index 7f0abdfcd..1a4fc51cc 100644 --- a/distributed-databases/cloudnative-postgres/monitoring/kube-stack-config.yaml +++ b/distributed-databases/cloudnative-postgres/monitoring/kube-stack-config.yaml @@ -58,13 +58,22 @@ prometheus: ruleSelectorNilUsesHelmValues: false serviceMonitorSelectorNilUsesHelmValues: false probeSelectorNilUsesHelmValues: false + storageSpec: + volumeClaimTemplate: + metadata: + name: data + spec: + storageClassName: ${storage_class_type} + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi grafana: enabled: true - # -- the grafana admin password - adminPassword: prom-operator - defaultDashboardsEnabled: false + defaultDashboardsEnabled: true sidecar: - dashboards: - enabled: true + dashboards: + enabled: true alertmanager: enabled: true diff --git a/distributed-databases/cloudnative-postgres/resources.tf b/distributed-databases/cloudnative-postgres/resources.tf index 18eb3f30a..e0b2f2f4c 100644 --- a/distributed-databases/cloudnative-postgres/resources.tf +++ b/distributed-databases/cloudnative-postgres/resources.tf @@ -23,7 +23,7 @@ module "ebs_csi_driver_irsa" { module "barman_s3_bucket" { source = "terraform-aws-modules/s3-bucket/aws" - version = "~> 3.0" + version = "~> 3.8" bucket = "${random_string.random.result}-cnpg-barman-bucket" acl = "private" @@ -31,14 +31,20 @@ module "barman_s3_bucket" { # For example only - please evaluate for your environment force_destroy = true + # Bucket policies + attach_policy = true attach_deny_insecure_transport_policy = true - attach_require_latest_tls_policy = true - block_public_acls = true - block_public_policy = true - ignore_public_acls = true - restrict_public_buckets = true + # S3 Bucket Ownership Controls + # https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_bucket_ownership_controls + control_object_ownership = true + object_ownership = "BucketOwnerPreferred" + expected_bucket_owner = data.aws_caller_identity.current.account_id + versioning = { + status = true + mfa_delete = false + } server_side_encryption_configuration = { rule = { apply_server_side_encryption_by_default = { @@ -57,7 +63,7 @@ module "barman_backup_irsa" { source = "github.com/aws-ia/terraform-aws-eks-blueprints-addons?ref=ed27abc//modules/irsa" eks_cluster_id = module.eks.cluster_name eks_oidc_provider_arn = module.eks.oidc_provider_arn - irsa_iam_policies = [aws_iam_policy.cnpg_buckup_policy.arn] + irsa_iam_policies = [aws_iam_policy.irsa_policy.arn] kubernetes_namespace = "demo" kubernetes_service_account = "prod" create_kubernetes_service_account = false @@ -67,8 +73,33 @@ module "barman_backup_irsa" { #--------------------------------------------------------------- # Creates IAM policy for accessing s3 bucket #--------------------------------------------------------------- -resource "aws_iam_policy" "cnpg_buckup_policy" { +resource 
"aws_iam_policy" "irsa_policy" { description = "IAM role policy for CloudNativePG Barman Tool" name = "${local.name}-barman-irsa" - policy = data.aws_iam_policy_document.cnpg_backup.json + policy = data.aws_iam_policy_document.irsa_backup_policy.json +} + +#--------------------------------------------------------------- +# Grafana Admin credentials resources +#--------------------------------------------------------------- +data "aws_secretsmanager_secret_version" "admin_password_version" { + secret_id = aws_secretsmanager_secret.grafana.id + depends_on = [aws_secretsmanager_secret_version.grafana] +} + +resource "random_password" "grafana" { + length = 16 + special = true + override_special = "@_" +} + +#tfsec:ignore:aws-ssm-secret-use-customer-key +resource "aws_secretsmanager_secret" "grafana" { + name = "${local.name}-grafana" + recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy +} + +resource "aws_secretsmanager_secret_version" "grafana" { + secret_id = aws_secretsmanager_secret.grafana.id + secret_string = random_password.grafana.result } diff --git a/distributed-databases/cloudnative-postgres/variables.tf b/distributed-databases/cloudnative-postgres/variables.tf index b31b16ab9..ded9e2948 100644 --- a/distributed-databases/cloudnative-postgres/variables.tf +++ b/distributed-databases/cloudnative-postgres/variables.tf @@ -1,6 +1,6 @@ variable "name" { description = "Name of the VPC and EKS Cluster" - default = "cnpg-on-eks" + default = "cnpg" type = string } diff --git a/distributed-databases/cloudnative-postgres/vpc.tf b/distributed-databases/cloudnative-postgres/vpc.tf index 576fde82f..ebb49fe21 100644 --- a/distributed-databases/cloudnative-postgres/vpc.tf +++ b/distributed-databases/cloudnative-postgres/vpc.tf @@ -3,6 +3,9 @@ # Supporting Resources #--------------------------------------------------------------- +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. 
+# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" diff --git a/schedulers/terraform/argo-workflow/README.md b/schedulers/terraform/argo-workflow/README.md index 6a93c8384..f44b73416 100644 --- a/schedulers/terraform/argo-workflow/README.md +++ b/schedulers/terraform/argo-workflow/README.md @@ -28,14 +28,16 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | Name | Source | Version | |------|--------|---------| | [amp\_ingest\_irsa](#module\_amp\_ingest\_irsa) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 | -| [data\_team\_a\_irsa](#module\_data\_team\_a\_irsa) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 | | [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.20 | | [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 | -| [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.3 | +| [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | 1.9.2 | | [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.0 | -| [fluentbit\_s3\_bucket](#module\_fluentbit\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 | | [irsa\_argo\_events](#module\_irsa\_argo\_events) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 | +| [s3\_bucket](#module\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 | +| [spark\_team\_a\_irsa](#module\_spark\_team\_a\_irsa) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 | | [vpc](#module\_vpc) | terraform-aws-modules/vpc/aws | ~> 5.0 | +| [vpc\_endpoints](#module\_vpc\_endpoints) | terraform-aws-modules/vpc/aws//modules/vpc-endpoints | ~> 5.0 | +| [vpc\_endpoints\_sg](#module\_vpc\_endpoints\_sg) | terraform-aws-modules/security-group/aws | ~> 5.0 | ## Resources @@ -45,19 +47,22 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | [aws_iam_policy.spark](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_policy.sqs_argo_events](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_prometheus_workspace.amp](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/prometheus_workspace) | resource | +| [aws_s3_object.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_object) | resource | | [aws_secretsmanager_secret.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret) | resource | | [aws_secretsmanager_secret_version.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret_version) | resource | | [kubectl_manifest.karpenter_provisioner](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [kubernetes_annotations.gp2_default](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/annotations) | resource | -| [kubernetes_cluster_role.spark_op_role](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role) | resource | -| 
[kubernetes_namespace_v1.data_team_a](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource | +| [kubernetes_cluster_role.spark_argowf_role](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role) | resource | +| [kubernetes_cluster_role.spark_role](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role) | resource | +| [kubernetes_cluster_role_binding.spark_role_binding](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role_binding) | resource | +| [kubernetes_namespace_v1.spark_team_a](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource | | [kubernetes_role_binding.admin_rolebinding_argoworkflows](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/role_binding) | resource | -| [kubernetes_role_binding.admin_rolebinding_data_team_a](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/role_binding) | resource | +| [kubernetes_role_binding.admin_rolebinding_spark_team_a](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/role_binding) | resource | | [kubernetes_role_binding.spark_role_binding](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/role_binding) | resource | -| [kubernetes_secret_v1.data_team_a](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource | | [kubernetes_secret_v1.event_sa](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource | -| [kubernetes_service_account_v1.data_team_a](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account_v1) | resource | +| [kubernetes_secret_v1.spark_team_a](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource | | [kubernetes_service_account_v1.event_sa](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account_v1) | resource | +| [kubernetes_service_account_v1.spark_team_a](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account_v1) | resource | | [kubernetes_storage_class.ebs_csi_encrypted_gp3_storage_class](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/storage_class) | resource | | [random_password.grafana](https://registry.terraform.io/providers/hashicorp/random/3.3.2/docs/resources/password) | resource | | [aws_availability_zones.available](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/availability_zones) | data source | @@ -77,18 +82,28 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.27"` | no | +| [eks\_data\_plane\_subnet\_secondary\_cidr](#input\_eks\_data\_plane\_subnet\_secondary\_cidr) | Secondary CIDR blocks. 32766 IPs per Subnet per Subnet/AZ for EKS Node and Pods | `list(string)` |
["100.64.0.0/17", "100.64.128.0/17"]
| no | | [enable\_amazon\_prometheus](#input\_enable\_amazon\_prometheus) | Enable AWS Managed Prometheus service | `bool` | `true` | no | +| [enable\_vpc\_endpoints](#input\_enable\_vpc\_endpoints) | Enable VPC Endpoints | `bool` | `false` | no | | [enable\_yunikorn](#input\_enable\_yunikorn) | Enable Apache YuniKorn Scheduler | `bool` | `true` | no | -| [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"argoworkflows-eks"` | no | +| [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"doeks-spark-argo"` | no | +| [private\_subnets](#input\_private\_subnets) | Private Subnets CIDRs. 254 IPs per Subnet/AZ for Private NAT + NLB + Airflow + EC2 Jumphost etc. | `list(string)` |
["10.1.1.0/24", "10.1.2.0/24"]
| no | +| [public\_subnets](#input\_public\_subnets) | Public Subnets CIDRs. 62 IPs per Subnet/AZ | `list(string)` |
["10.1.0.0/26", "10.1.0.64/26"]
| no | | [region](#input\_region) | Region | `string` | `"us-west-2"` | no | -| [vpc\_cidr](#input\_vpc\_cidr) | VPC CIDR | `string` | `"10.1.0.0/16"` | no | +| [secondary\_cidr\_blocks](#input\_secondary\_cidr\_blocks) | Secondary CIDR blocks to be attached to VPC | `list(string)` |
["100.64.0.0/16"]
| no | +| [vpc\_cidr](#input\_vpc\_cidr) | VPC CIDR. This should be a valid private (RFC 1918) CIDR range | `string` | `"10.1.0.0/16"` | no | ## Outputs | Name | Description | |------|-------------| +| [cluster\_arn](#output\_cluster\_arn) | The Amazon Resource Name (ARN) of the cluster | +| [cluster\_endpoint](#output\_cluster\_endpoint) | Endpoint for your Kubernetes API server | +| [cluster\_name](#output\_cluster\_name) | The Amazon Resource Name (ARN) of the cluster | | [configure\_kubectl](#output\_configure\_kubectl) | Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig | -| [eks\_api\_server\_url](#output\_eks\_api\_server\_url) | Your eks API server endpoint | | [grafana\_secret\_name](#output\_grafana\_secret\_name) | Grafana password secret name | +| [s3\_bucket\_id\_spark\_history\_server](#output\_s3\_bucket\_id\_spark\_history\_server) | Spark History server logs S3 bucket ID | +| [s3\_bucket\_region\_spark\_history\_server](#output\_s3\_bucket\_region\_spark\_history\_server) | Spark History server logs S3 bucket ID | +| [subnet\_ids\_starting\_with\_100](#output\_subnet\_ids\_starting\_with\_100) | Secondary CIDR Private Subnet IDs for EKS Data Plane | | [your\_event\_irsa\_arn](#output\_your\_event\_irsa\_arn) | the ARN of IRSA for argo events | diff --git a/schedulers/terraform/argo-workflow/addons.tf b/schedulers/terraform/argo-workflow/addons.tf index 9435561f4..7e2faebc3 100644 --- a/schedulers/terraform/argo-workflow/addons.tf +++ b/schedulers/terraform/argo-workflow/addons.tf @@ -1,3 +1,42 @@ +#--------------------------------------------------------------- +# GP3 Encrypted Storage Class +#--------------------------------------------------------------- + +resource "kubernetes_annotations" "gp2_default" { + annotations = { + "storageclass.kubernetes.io/is-default-class" : "false" + } + api_version = "storage.k8s.io/v1" + kind = "StorageClass" + metadata { + name = "gp2" + } + force = true + + depends_on = [module.eks] +} + +resource "kubernetes_storage_class" "ebs_csi_encrypted_gp3_storage_class" { + metadata { + name = "gp3" + annotations = { + "storageclass.kubernetes.io/is-default-class" : "true" + } + } + + storage_provisioner = "ebs.csi.aws.com" + reclaim_policy = "Delete" + allow_volume_expansion = true + volume_binding_mode = "WaitForFirstConsumer" + parameters = { + fsType = "xfs" + encrypted = true + type = "gp3" + } + + depends_on = [kubernetes_annotations.gp2_default] +} + #--------------------------------------------------------------- # IRSA for EBS CSI Driver #--------------------------------------------------------------- @@ -14,13 +53,14 @@ module "ebs_csi_driver_irsa" { } tags = local.tags } + #--------------------------------------------------------------- -# EKS Blueprints Kubernetes Addons +# EKS Blueprints Addons #--------------------------------------------------------------- module "eks_blueprints_addons" { - # Short commit hash from 8th May using git rev-parse --short HEAD source = "aws-ia/eks-blueprints-addons/aws" - version = "~> 1.3" + version = "1.9.2" + cluster_name = module.eks.cluster_name cluster_endpoint = module.eks.cluster_endpoint @@ -45,6 +85,9 @@ module "eks_blueprints_addons" { } } + #--------------------------------------- + # Kubernetes Add-ons + #--------------------------------------- #--------------------------------------------------------------- # CoreDNS Autoscaler helps to scale for large EKS Clusters # Further tuning for CoreDNS is to 
leverage NodeLocal DNSCache -> https://kubernetes.io/docs/tasks/administer-cluster/nodelocaldns/ @@ -66,15 +109,13 @@ module "eks_blueprints_addons" { } #--------------------------------------- - # Cluster Autoscaler + # Karpenter Autoscaler for EKS Cluster #--------------------------------------- - enable_cluster_autoscaler = true - cluster_autoscaler = { - timeout = "300" - values = [templatefile("${path.module}/helm-values/cluster-autoscaler-values.yaml", { - aws_region = var.region, - eks_cluster_id = module.eks.cluster_name - })] + enable_karpenter = true + karpenter_enable_spot_termination = true + karpenter = { + repository_username = data.aws_ecrpublic_authorization_token.token.user_name + repository_password = data.aws_ecrpublic_authorization_token.token.password } #--------------------------------------- @@ -83,60 +124,48 @@ module "eks_blueprints_addons" { enable_aws_for_fluentbit = true aws_for_fluentbit_cw_log_group = { use_name_prefix = false - name = "/${local.name}/aws-fluentbit-logs" # Add-on creates this log group + name = "/${local.name}/aws-fluentbit-logs" retention_in_days = 30 } aws_for_fluentbit = { s3_bucket_arns = [ - module.fluentbit_s3_bucket.s3_bucket_arn, - "${module.fluentbit_s3_bucket.s3_bucket_arn}/*}" + module.s3_bucket.s3_bucket_arn, + "${module.s3_bucket.s3_bucket_arn}/*}" ] values = [templatefile("${path.module}/helm-values/aws-for-fluentbit-values.yaml", { region = local.region, cloudwatch_log_group = "/${local.name}/aws-fluentbit-logs" - s3_bucket_name = module.fluentbit_s3_bucket.s3_bucket_id + s3_bucket_name = module.s3_bucket.s3_bucket_id cluster_name = module.eks.cluster_name })] } - #--------------------------------------- - # Karpenter Autoscaler for EKS Cluster - #--------------------------------------- - enable_karpenter = true - karpenter_enable_spot_termination = true - karpenter = { - repository_username = data.aws_ecrpublic_authorization_token.token.user_name - repository_password = data.aws_ecrpublic_authorization_token.token.password - } - - #--------------------------------------- - # AWS Load Balancer Controller - #--------------------------------------- enable_aws_load_balancer_controller = true #--------------------------------------- - # Argo Workflows + # Argo Workflows & Argo Events #--------------------------------------- enable_argo_workflows = true + argo_workflows = { + name = "argo-workflows" + namespace = "argo-workflows" + repository = "https://argoproj.github.io/argo-helm" + values = [templatefile("${path.module}/helm-values/argo-workflows-values.yaml", {})] + } - #--------------------------------------- - # Argo Events - #--------------------------------------- enable_argo_events = true argo_events = { - name = "argo-events" - namespace = "argo-events" - chart_version = "2.4.0" - repository = "https://argoproj.github.io/argo-helm" - values = [templatefile("${path.module}/helm-values/argo-events-values.yaml", {})] + name = "argo-events" + namespace = "argo-events" + repository = "https://argoproj.github.io/argo-helm" + values = [templatefile("${path.module}/helm-values/argo-events-values.yaml", {})] } - #--------------------------------------- # Prommetheus and Grafana stack #--------------------------------------- #--------------------------------------------------------------- - # Install Kafka Montoring Stack with Prometheus and Grafana + # Install Prometheus and Grafana # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack` # 2- Grafana Admin user: admin # 3- 
Get admin user password: `aws secretsmanager get-secret-value --secret-id --region $AWS_REGION --query "SecretString" --output text` @@ -163,10 +192,8 @@ module "eks_blueprints_addons" { } tags = local.tags - } - #--------------------------------------------------------------- # Data on EKS Kubernetes Addons #--------------------------------------------------------------- @@ -194,44 +221,49 @@ module "eks_data_addons" { })] } -} + #--------------------------------------------------------------- + # Spark History Server Add-on + #--------------------------------------------------------------- + # Spark hsitory server is required only when EMR Spark Operator is enabled + enable_spark_history_server = true + spark_history_server_helm_config = { + values = [ + <<-EOT + sparkHistoryOpts: "-Dspark.history.fs.logDirectory=s3a://${module.s3_bucket.s3_bucket_id}/${aws_s3_object.this.key}" + EOT + ] + } -#--------------------------------------------------------------- -# Grafana Admin credentials resources -#--------------------------------------------------------------- -data "aws_secretsmanager_secret_version" "admin_password_version" { - secret_id = aws_secretsmanager_secret.grafana.id - depends_on = [aws_secretsmanager_secret_version.grafana] } -resource "random_password" "grafana" { - length = 16 - special = true - override_special = "@_" +#--------------------------------------- +# Karpenter Provisioners +#--------------------------------------- +data "kubectl_path_documents" "karpenter_provisioners" { + pattern = "${path.module}/karpenter-provisioners/spark-*.yaml" + vars = { + azs = local.region + eks_cluster_id = module.eks.cluster_name + } } -#tfsec:ignore:aws-ssm-secret-use-customer-key -resource "aws_secretsmanager_secret" "grafana" { - name = "${local.name}-grafana" - recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy -} +resource "kubectl_manifest" "karpenter_provisioner" { + for_each = toset(data.kubectl_path_documents.karpenter_provisioners.documents) + yaml_body = each.value -resource "aws_secretsmanager_secret_version" "grafana" { - secret_id = aws_secretsmanager_secret.grafana.id - secret_string = random_password.grafana.result + depends_on = [module.eks_blueprints_addons] } -#--------------------------------------------------------------- -# S3 log bucket for FluentBit -#--------------------------------------------------------------- #tfsec:ignore:* -module "fluentbit_s3_bucket" { +module "s3_bucket" { source = "terraform-aws-modules/s3-bucket/aws" version = "~> 3.0" - bucket_prefix = "${local.name}-argo-workflow-logs-" + bucket_prefix = "${local.name}-spark-logs-" + # For example only - please evaluate for your environment force_destroy = true + server_side_encryption_configuration = { rule = { apply_server_side_encryption_by_default = { @@ -243,59 +275,34 @@ module "fluentbit_s3_bucket" { tags = local.tags } -#--------------------------------------- -# Karpenter Provisioners for workloads -#--------------------------------------- -data "kubectl_path_documents" "karpenter_provisioners" { - pattern = "${path.module}/karpenter-provisioners/*.yaml" - vars = { - azs = local.region - eks_cluster_id = module.eks.cluster_name - } -} - -resource "kubectl_manifest" "karpenter_provisioner" { - for_each = toset(data.kubectl_path_documents.karpenter_provisioners.documents) - yaml_body = each.value - - depends_on = [module.eks_blueprints_addons] +# Creating an s3 bucket prefix. 
Ensure you copy Spark History event logs under this path to visualize the dags +resource "aws_s3_object" "this" { + bucket = module.s3_bucket.s3_bucket_id + key = "spark-event-logs/" + content_type = "application/x-directory" } #--------------------------------------------------------------- -# GP3 Encrypted Storage Class +# Grafana Admin credentials resources #--------------------------------------------------------------- - -resource "kubernetes_annotations" "gp2_default" { - annotations = { - "storageclass.kubernetes.io/is-default-class" : "false" - } - api_version = "storage.k8s.io/v1" - kind = "StorageClass" - metadata { - name = "gp2" - } - force = true - - depends_on = [module.eks] +data "aws_secretsmanager_secret_version" "admin_password_version" { + secret_id = aws_secretsmanager_secret.grafana.id + depends_on = [aws_secretsmanager_secret_version.grafana] } -resource "kubernetes_storage_class" "ebs_csi_encrypted_gp3_storage_class" { - metadata { - name = "gp3" - annotations = { - "storageclass.kubernetes.io/is-default-class" : "true" - } - } +resource "random_password" "grafana" { + length = 16 + special = true + override_special = "@_" +} - storage_provisioner = "ebs.csi.aws.com" - reclaim_policy = "Delete" - allow_volume_expansion = true - volume_binding_mode = "WaitForFirstConsumer" - parameters = { - fsType = "xfs" - encrypted = true - type = "gp3" - } +#tfsec:ignore:aws-ssm-secret-use-customer-key +resource "aws_secretsmanager_secret" "grafana" { + name = "${local.name}-grafana" + recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy +} - depends_on = [kubernetes_annotations.gp2_default] +resource "aws_secretsmanager_secret_version" "grafana" { + secret_id = aws_secretsmanager_secret.grafana.id + secret_string = random_password.grafana.result } diff --git a/schedulers/terraform/argo-workflow/amp.tf b/schedulers/terraform/argo-workflow/amp.tf index a1949ffe1..96df2a495 100644 --- a/schedulers/terraform/argo-workflow/amp.tf +++ b/schedulers/terraform/argo-workflow/amp.tf @@ -1,19 +1,3 @@ -#------------------------------------------ -# Amazon Prometheus -#------------------------------------------ -locals { - amp_ingest_service_account = "amp-iamproxy-ingest-service-account" - amp_namespace = "kube-prometheus-stack" - account_id = data.aws_caller_identity.current.account_id - partition = data.aws_partition.current.partition -} - -resource "aws_prometheus_workspace" "amp" { - count = var.enable_amazon_prometheus ? 1 : 0 - - alias = format("%s-%s", "amp-ws", local.name) - tags = local.tags -} #IAM Policy for Amazon Prometheus & Grafana resource "aws_iam_policy" "grafana" { count = var.enable_amazon_prometheus ? 1 : 0 @@ -115,6 +99,21 @@ data "aws_iam_policy_document" "grafana" { } } +#------------------------------------------ +# Amazon Prometheus +#------------------------------------------ +locals { + amp_ingest_service_account = "amp-iamproxy-ingest-service-account" + amp_namespace = "kube-prometheus-stack" +} + +resource "aws_prometheus_workspace" "amp" { + count = var.enable_amazon_prometheus ? 1 : 0 + + alias = format("%s-%s", "amp-ws", local.name) + tags = local.tags +} + module "amp_ingest_irsa" { count = var.enable_amazon_prometheus ? 
1 : 0 diff --git a/schedulers/terraform/argo-workflow/argo-events-manifests/eventsource-sqs.yaml b/schedulers/terraform/argo-workflow/argo-events-manifests/eventsource-sqs.yaml index 6146234af..e13e331f4 100644 --- a/schedulers/terraform/argo-workflow/argo-events-manifests/eventsource-sqs.yaml +++ b/schedulers/terraform/argo-workflow/argo-events-manifests/eventsource-sqs.yaml @@ -7,14 +7,14 @@ spec: template: serviceAccountName: event-sa sqs: - example: + sqs-spark-workflow: # jsonBody specifies that all event body payload coming from this # source will be JSON jsonBody: true # aws region - region: $region_sqs + region: # name of the queue. The eventsource resolves the url of the queue from the queue name. - queue: $queue_name + queue: # The duration (in seconds) for which the call waits for a message to arrive in the queue before returning. # MUST BE > 0 AND <= 20 waitTimeSeconds: 20 diff --git a/schedulers/terraform/argo-workflow/argo-events-manifests/sensor-sqs-sparkjobs.yaml b/schedulers/terraform/argo-workflow/argo-events-manifests/sensor-sqs-sparkjobs.yaml deleted file mode 100644 index b32c2ac15..000000000 --- a/schedulers/terraform/argo-workflow/argo-events-manifests/sensor-sqs-sparkjobs.yaml +++ /dev/null @@ -1,84 +0,0 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Sensor -metadata: - name: aws-sqs-crossns-spark - namespace: argo-events -spec: - template: - serviceAccountName: operate-workflow-sa - dependencies: - - name: test-dep - eventSourceName: aws-sqs - eventName: example - triggers: - - template: - name: sqs-spark-workflow - k8s: - operation: create - source: - resource: - apiVersion: argoproj.io/v1alpha1 - kind: Workflow - metadata: - generateName: aws-sqs-spark-workflow- - namespace: argo-workflows - spec: - arguments: {} - entrypoint: parallel-jobs - templates: - - name: parallel-jobs - steps: - - - name: helloworld-job1 - template: whalesay - arguments: - parameters: [{name: message, value: "spark-start!"}] - - - name: spark-operator-job - template: sparkapp-operator - - name: helloworld-job2 - template: whalesay - arguments: - parameters: [{name: message, value: "spark-done!"}] - - name: sparkapp-operator - resource: - action: create - manifest: | - apiVersion: "sparkoperator.k8s.io/v1beta2" - kind: SparkApplication - metadata: - generateName: event-wf-sparkapp- - namespace: data-team-a - spec: - type: Python - pythonVersion: "3" - mode: cluster - image: "gcr.io/spark-operator/spark-py:v3.1.1" - imagePullPolicy: Always - mainApplicationFile: local:///opt/spark/examples/src/main/python/pi.py - sparkVersion: "3.1.1" - restartPolicy: - type: OnFailure - onFailureRetries: 3 - onFailureRetryInterval: 10 - onSubmissionFailureRetries: 5 - onSubmissionFailureRetryInterval: 20 - driver: - cores: 1 - coreLimit: "1200m" - memory: "512m" - labels: - version: 3.1.1 - serviceAccount: default - executor: - cores: 1 - instances: 2 - memory: "512m" - labels: - version: 3.1.1 - - name: whalesay - inputs: - parameters: - - name: message - container: - image: docker/whalesay - command: [cowsay] - args: ["{{inputs.parameters.message}}"] diff --git a/schedulers/terraform/argo-workflow/argo-events-manifests/sqs-accesspolicy.json b/schedulers/terraform/argo-workflow/argo-events-manifests/sqs-accesspolicy.json index 2a231d0e0..b5e1ae873 100644 --- a/schedulers/terraform/argo-workflow/argo-events-manifests/sqs-accesspolicy.json +++ b/schedulers/terraform/argo-workflow/argo-events-manifests/sqs-accesspolicy.json @@ -1,3 +1,3 @@ { - "Policy": "{\"Version\": \"2012-10-17\",\"Statement\": 
[{\"Action\": \"sqs:*\",\"Effect\": \"Allow\",\"Resource\": \"\",\"Principal\": {\"AWS\": \"\"}}]}" + "Policy": "{\"Version\": \"2012-10-17\",\"Statement\": [{\"Action\": \"sqs:*\",\"Effect\": \"Allow\",\"Resource\": \"\",\"Principal\": {\"AWS\": \"\"}}]}" } diff --git a/schedulers/terraform/argo-workflow/cleanup.sh b/schedulers/terraform/argo-workflow/cleanup.sh new file mode 100755 index 000000000..1f357fac9 --- /dev/null +++ b/schedulers/terraform/argo-workflow/cleanup.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -o errexit +set -o pipefail + +read -p "Enter the region: " region +export AWS_DEFAULT_REGION=$region + +targets=( + "module.eks_data_addons" + "module.eks_blueprints_addons" + "module.eks" +) + +for target in "${targets[@]}" +do + terraform destroy -target="$target" -auto-approve + destroy_output=$(terraform destroy -target="$target" -auto-approve 2>&1) + if [[ $? -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then + echo "SUCCESS: Terraform destroy of $target completed successfully" + else + echo "FAILED: Terraform destroy of $target failed" + exit 1 + fi +done + +terraform destroy -auto-approve +destroy_output=$(terraform destroy -auto-approve 2>&1) +if [[ $? -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then + echo "SUCCESS: Terraform destroy of all targets completed successfully" +else + echo "FAILED: Terraform destroy of all targets failed" + exit 1 +fi diff --git a/schedulers/terraform/argo-workflow/eks.tf b/schedulers/terraform/argo-workflow/eks.tf index 82297fba7..94c66bfa4 100644 --- a/schedulers/terraform/argo-workflow/eks.tf +++ b/schedulers/terraform/argo-workflow/eks.tf @@ -8,11 +8,14 @@ module "eks" { cluster_name = local.name cluster_version = var.eks_cluster_version - cluster_endpoint_public_access = true # if true, Your cluster API server is accessible from the internet. You can, optionally, limit the CIDR blocks that can access the public endpoint. + #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing. + cluster_endpoint_public_access = true vpc_id = module.vpc.vpc_id - - subnet_ids = module.vpc.private_subnets + # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the EKS Control Plane ENIs will be created + subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : + substr(cidr_block, 0, 4) == "100." ? subnet_id : null] + ) manage_aws_auth_configmap = true aws_auth_roles = [ @@ -70,45 +73,47 @@ module "eks" { # Not required, but used in the example to access the nodes to inspect mounted volumes AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" } + + ebs_optimized = true + # This bloc device is used only for root volume. Adjust volume according to your size. 
+ # NOTE: Don't use this volume for Spark workloads + block_device_mappings = { + xvda = { + device_name = "/dev/xvda" + ebs = { + volume_size = 100 + volume_type = "gp3" + } + } + } } eks_managed_node_groups = { # We recommend to have a MNG to place your critical workloads and add-ons - # Then rely on Karpenter to scale your workloads - # You can also make uses on nodeSelector and Taints/tolerations to spread workloads on MNG or Karpenter provisioners core_node_group = { name = "core-node-group" - description = "EKS Core node group for hosting critical add-ons" - subnet_ids = module.vpc.private_subnets + description = "EKS managed node group example launch template" + # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the nodes/node groups will be provisioned + subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : + substr(cidr_block, 0, 4) == "100." ? subnet_id : null] + ) min_size = 3 - max_size = 8 + max_size = 9 desired_size = 3 instance_types = ["m5.xlarge"] - ebs_optimized = true - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 100 - volume_type = "gp3" - } - } - } - labels = { - Environment = "preprod" - Zone = "test" WorkerType = "ON_DEMAND" - NodeGroupType = "core" + NodeGroupType = "core-nodes" } - tags = merge(local.tags, { - Name = "core-node-grp", - "karpenter.sh/discovery" = local.name - }) + tags = { + Name = "core-node-group" + WorkerType = "ON_DEMAND" + NodeGroupType = "core-nodes" + } } } } diff --git a/schedulers/terraform/argo-workflow/helm-values/argo-workflows-values.yaml b/schedulers/terraform/argo-workflow/helm-values/argo-workflows-values.yaml new file mode 100644 index 000000000..2f6c9e729 --- /dev/null +++ b/schedulers/terraform/argo-workflow/helm-values/argo-workflows-values.yaml @@ -0,0 +1,5 @@ +server: + autoscaling: + enabled: true + minReplicas: 1 + serviceType: LoadBalancer diff --git a/schedulers/terraform/argo-workflow/helm-values/cluster-autoscaler-values.yaml b/schedulers/terraform/argo-workflow/helm-values/cluster-autoscaler-values.yaml deleted file mode 100644 index 5a42794f2..000000000 --- a/schedulers/terraform/argo-workflow/helm-values/cluster-autoscaler-values.yaml +++ /dev/null @@ -1,25 +0,0 @@ -autoDiscovery: - clusterName: ${eks_cluster_id} - -awsRegion: ${aws_region} - -cloudProvider: aws - -extraArgs: - aws-use-static-instance-list: true - -# Best practice to update the resource requests and limits for each add-on -resources: - limits: - cpu: 1000m - memory: 1G - requests: - cpu: 200m - memory: 512Mi - -# Best practice to updateStrategy for each add-on -updateStrategy: - type: RollingUpdate - rollingUpdate: - maxSurge: 0 - maxUnavailable: 1 diff --git a/schedulers/terraform/argo-workflow/install.sh b/schedulers/terraform/argo-workflow/install.sh new file mode 100755 index 000000000..5d388dfa3 --- /dev/null +++ b/schedulers/terraform/argo-workflow/install.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +read -p "Enter the region: " region +export AWS_DEFAULT_REGION=$region + +# List of Terraform modules to apply in sequence +targets=( + "module.vpc" + "module.eks" + "module.ebs_csi_driver_irsa" + "module.eks_blueprints_addons" + "module.eks_data_addons" +) + +# Initialize Terraform +terraform init -upgrade + +# Apply modules in sequence +for target in "${targets[@]}" +do + echo "Applying module $target..." 
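+  # Capture the apply output (tee keeps it streaming to the terminal) and verify both the command's exit status (PIPESTATUS[0]) and the "Apply complete" marker before moving on to the next module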
+ apply_output=$(terraform apply -target="$target" -var="region=$region" -auto-approve 2>&1 | tee /dev/tty) + if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then + echo "SUCCESS: Terraform apply of $target completed successfully" + else + echo "FAILED: Terraform apply of $target failed" + exit 1 + fi +done + +# Final apply to catch any remaining resources +echo "Applying remaining resources..." +apply_output=$(terraform apply -var="region=$region" -auto-approve 2>&1 | tee /dev/tty) +if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then + echo "SUCCESS: Terraform apply of all modules completed successfully" +else + echo "FAILED: Terraform apply of all modules failed" + exit 1 +fi diff --git a/schedulers/terraform/argo-workflow/karpenter-provisioners/spark-compute-optimized-provisioner.yaml b/schedulers/terraform/argo-workflow/karpenter-provisioners/spark-compute-optimized-provisioner.yaml new file mode 100644 index 000000000..b93858da8 --- /dev/null +++ b/schedulers/terraform/argo-workflow/karpenter-provisioners/spark-compute-optimized-provisioner.yaml @@ -0,0 +1,109 @@ +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: spark-compute-optimized + namespace: karpenter # Same namespace as Karpenter add-on installed +spec: + kubeletConfiguration: + containerRuntime: containerd + # podsPerCore: 2 + # maxPods: 20 + requirements: + - key: "topology.kubernetes.io/zone" + operator: In + values: [${azs}a] #Update the correct region and zones + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "node.kubernetes.io/instance-type" #If not included, all instance types are considered + operator: In + values: ["c5d.xlarge","c5d.2xlarge","c5d.4xlarge","c5d.9xlarge"] # 1 NVMe disk + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + limits: + resources: + cpu: 2000 + providerRef: + name: spark-compute-optimized + labels: + type: karpenter + provisioner: spark-compute-optimized + NodeGroupType: SparkComputeOptimized + taints: + - key: spark-compute-optimized + value: 'true' + effect: NoSchedule + ttlSecondsAfterEmpty: 120 # optional, but never scales down if not set + +--- +apiVersion: karpenter.k8s.aws/v1alpha1 +kind: AWSNodeTemplate +metadata: + name: spark-compute-optimized + namespace: karpenter +spec: + blockDeviceMappings: + - deviceName: /dev/xvda + ebs: + volumeSize: 100Gi + volumeType: gp3 + encrypted: true + deleteOnTermination: true + metadataOptions: + httpEndpoint: enabled + httpProtocolIPv6: disabled + httpPutResponseHopLimit: 2 + httpTokens: required + subnetSelector: + Name: "${eks_cluster_id}-private*" # Name of the Subnets to spin up the nodes + securityGroupSelector: # required, when not using launchTemplate + Name: "${eks_cluster_id}-node*" # name of the SecurityGroup to be used with Nodes + # instanceProfile: "" # optional, if already set in controller args + #RAID0 config example + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + #!/bin/bash + echo "Running a custom user data script" + set -ex + yum install mdadm -y + + DEVICES=$(lsblk -o NAME,TYPE -dsn | awk '/disk/ {print $1}') + + DISK_ARRAY=() + + for DEV in $DEVICES + do + DISK_ARRAY+=("/dev/$${DEV}") + done + + DISK_COUNT=$${#DISK_ARRAY[@]} + + if [ $${DISK_COUNT} -eq 0 ]; then + echo "No SSD disks available. No further action needed." 
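+      # Otherwise: a single NVMe disk is formatted with XFS directly, while multiple disks are striped into a RAID0 array (/dev/md0); the resulting device is then mounted at /local1 via an /etc/fstab entry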
+ else + if [ $${DISK_COUNT} -eq 1 ]; then + TARGET_DEV=$${DISK_ARRAY[0]} + mkfs.xfs $${TARGET_DEV} + else + mdadm --create --verbose /dev/md0 --level=0 --raid-devices=$${DISK_COUNT} $${DISK_ARRAY[@]} + mkfs.xfs /dev/md0 + TARGET_DEV=/dev/md0 + fi + + mkdir -p /local1 + echo $${TARGET_DEV} /local1 xfs defaults,noatime 1 2 >> /etc/fstab + mount -a + # NOTE: Update permissions on folder according to your needs and specific user group. This is just an example. + chmod 777 -R /local* + fi + + --BOUNDARY-- + + tags: + InstanceType: "spark-compute-optimized" # optional, add tags for your own use diff --git a/schedulers/terraform/argo-workflow/main.tf b/schedulers/terraform/argo-workflow/main.tf index 5ab69b71e..254231538 100644 --- a/schedulers/terraform/argo-workflow/main.tf +++ b/schedulers/terraform/argo-workflow/main.tf @@ -2,12 +2,6 @@ provider "aws" { region = local.region } -provider "kubernetes" { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token -} - # ECR always authenticates with `us-east-1` region # Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html provider "aws" { @@ -15,6 +9,12 @@ provider "aws" { region = "us-east-1" } +provider "kubernetes" { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + token = data.aws_eks_cluster_auth.this.token +} + provider "helm" { kubernetes { host = module.eks.cluster_endpoint @@ -24,18 +24,13 @@ provider "helm" { } provider "kubectl" { - apply_retry_count = 10 + apply_retry_count = 30 host = module.eks.cluster_endpoint cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) load_config_file = false token = data.aws_eks_cluster_auth.this.token } -data "aws_availability_zones" "available" {} -data "aws_region" "current" {} -data "aws_caller_identity" "current" {} -data "aws_partition" "current" {} - data "aws_eks_cluster_auth" "this" { name = module.eks.cluster_name } @@ -44,14 +39,51 @@ data "aws_ecrpublic_authorization_token" "token" { provider = aws.ecr } +data "aws_availability_zones" "available" {} +data "aws_region" "current" {} +data "aws_caller_identity" "current" {} +data "aws_partition" "current" {} + #--------------------------------------------------------------- -# Local variables +# Example IAM policy for Spark job execution #--------------------------------------------------------------- +data "aws_iam_policy_document" "spark_operator" { + statement { + sid = "" + effect = "Allow" + resources = ["arn:${data.aws_partition.current.partition}:s3:::*"] + + actions = [ + "s3:DeleteObject", + "s3:DeleteObjectVersion", + "s3:GetObject", + "s3:ListBucket", + "s3:PutObject", + ] + } + + statement { + sid = "" + effect = "Allow" + resources = ["arn:${data.aws_partition.current.partition}:logs:${data.aws_region.current.id}:${data.aws_caller_identity.current.account_id}:log-group:*"] + + actions = [ + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:DescribeLogGroups", + "logs:DescribeLogStreams", + "logs:PutLogEvents", + ] + } +} + locals { - name = var.name - region = var.region - vpc_cidr = var.vpc_cidr - azs = slice(data.aws_availability_zones.available.names, 0, 2) + name = var.name + region = var.region + azs = slice(data.aws_availability_zones.available.names, 0, 2) + + account_id = data.aws_caller_identity.current.account_id + partition = data.aws_partition.current.partition tags 
= { Blueprint = local.name diff --git a/schedulers/terraform/argo-workflow/outputs.tf b/schedulers/terraform/argo-workflow/outputs.tf index 491bd13ed..277ed18b5 100644 --- a/schedulers/terraform/argo-workflow/outputs.tf +++ b/schedulers/terraform/argo-workflow/outputs.tf @@ -1,19 +1,52 @@ +################################################################################ +# Cluster +################################################################################ + +output "cluster_arn" { + description = "The Amazon Resource Name (ARN) of the cluster" + value = module.eks.cluster_arn +} + +output "cluster_name" { + description = "The Amazon Resource Name (ARN) of the cluster" + value = module.eks.cluster_id +} + output "configure_kubectl" { description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" } -output "eks_api_server_url" { - description = "Your eks API server endpoint" +output "cluster_endpoint" { + description = "Endpoint for your Kubernetes API server" value = module.eks.cluster_endpoint } -output "your_event_irsa_arn" { - description = "the ARN of IRSA for argo events" - value = module.irsa_argo_events.iam_role_arn +################################################################################ +# Private Subnets +################################################################################ + +output "subnet_ids_starting_with_100" { + description = "Secondary CIDR Private Subnet IDs for EKS Data Plane" + value = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null]) +} + +output "s3_bucket_id_spark_history_server" { + description = "Spark History server logs S3 bucket ID" + value = module.s3_bucket.s3_bucket_id +} + +output "s3_bucket_region_spark_history_server" { + description = "Spark History server logs S3 bucket ID" + value = module.s3_bucket.s3_bucket_region } output "grafana_secret_name" { description = "Grafana password secret name" value = aws_secretsmanager_secret.grafana.name } + +output "your_event_irsa_arn" { + description = "the ARN of IRSA for argo events" + value = module.irsa_argo_events.iam_role_arn +} diff --git a/schedulers/terraform/argo-workflow/data-team.tf b/schedulers/terraform/argo-workflow/spark-team.tf similarity index 53% rename from schedulers/terraform/argo-workflow/data-team.tf rename to schedulers/terraform/argo-workflow/spark-team.tf index fe6fc4ba9..e8e892122 100644 --- a/schedulers/terraform/argo-workflow/data-team.tf +++ b/schedulers/terraform/argo-workflow/spark-team.tf @@ -1,38 +1,33 @@ -# Creates a Data team with all the required resources for Spark - locals { - data_team = "data-team-a" + spark_team = "spark-team-a" } -# Create a namespace for data-team-a -resource "kubernetes_namespace_v1" "data_team_a" { +resource "kubernetes_namespace_v1" "spark_team_a" { metadata { - name = local.data_team + name = local.spark_team } timeouts { delete = "15m" } } -# Create a service account for data-team-a -resource "kubernetes_service_account_v1" "data_team_a" { +resource "kubernetes_service_account_v1" "spark_team_a" { metadata { - name = local.data_team - namespace = kubernetes_namespace_v1.data_team_a.metadata[0].name - annotations = { "eks.amazonaws.com/role-arn" : module.data_team_a_irsa.iam_role_arn } + name = local.spark_team + namespace = kubernetes_namespace_v1.spark_team_a.metadata[0].name + annotations = { "eks.amazonaws.com/role-arn" : module.spark_team_a_irsa.iam_role_arn } } automount_service_account_token = true } -# Create a secret for data-team-a -resource "kubernetes_secret_v1" "data_team_a" { +resource "kubernetes_secret_v1" "spark_team_a" { metadata { - name = "${local.data_team}-secret" - namespace = kubernetes_namespace_v1.data_team_a.metadata[0].name + name = "${local.spark_team}-secret" + namespace = kubernetes_namespace_v1.spark_team_a.metadata[0].name annotations = { - "kubernetes.io/service-account.name" = kubernetes_service_account_v1.data_team_a.metadata[0].name - "kubernetes.io/service-account.namespace" = kubernetes_namespace_v1.data_team_a.metadata[0].name + "kubernetes.io/service-account.name" = kubernetes_service_account_v1.spark_team_a.metadata[0].name + "kubernetes.io/service-account.namespace" = kubernetes_namespace_v1.spark_team_a.metadata[0].name } } @@ -40,76 +35,125 @@ resource "kubernetes_secret_v1" "data_team_a" { } #--------------------------------------------------------------- -# Example IAM policy for Spark job execution +# IRSA for Spark driver/executor pods for "spark-team-a" #--------------------------------------------------------------- -data "aws_iam_policy_document" "spark_operator" { - statement { - sid = "" - effect = "Allow" - resources = ["arn:${data.aws_partition.current.partition}:s3:::*"] +module "spark_team_a_irsa" { + source = "aws-ia/eks-blueprints-addon/aws" + version = "~> 1.0" - actions = [ - "s3:DeleteObject", - "s3:DeleteObjectVersion", - "s3:GetObject", - "s3:ListBucket", - "s3:PutObject", - ] - } + # Disable helm release + create_release = false - statement { - sid = "" - effect = "Allow" - resources = 
["arn:${data.aws_partition.current.partition}:logs:${data.aws_region.current.id}:${data.aws_caller_identity.current.account_id}:log-group:*"] + # IAM role for service account (IRSA) + create_role = true + role_name = "${local.name}-${local.spark_team}" + create_policy = false + role_policies = { + spark_team_a_policy = aws_iam_policy.spark.arn + } - actions = [ - "logs:CreateLogGroup", - "logs:CreateLogStream", - "logs:DescribeLogGroups", - "logs:DescribeLogStreams", - "logs:PutLogEvents", - ] + oidc_providers = { + this = { + provider_arn = module.eks.oidc_provider_arn + namespace = local.spark_team + service_account = local.spark_team + } } } -# --------------------------------------------------------------- +#--------------------------------------------------------------- # Creates IAM policy for IRSA. Provides IAM permissions for Spark driver/executor pods -# --------------------------------------------------------------- +#--------------------------------------------------------------- resource "aws_iam_policy" "spark" { description = "IAM role policy for Spark Job execution" name = "${local.name}-spark-irsa" policy = data.aws_iam_policy_document.spark_operator.json } -# --------------------------------------------------------------- -# IRSA for Spark driver/executor pods for "data-team-a" -# --------------------------------------------------------------- -module "data_team_a_irsa" { - source = "aws-ia/eks-blueprints-addon/aws" - version = "~> 1.0" +#--------------------------------------------------------------- +# Kubernetes Cluster role for service Account spark-team-a +#--------------------------------------------------------------- +resource "kubernetes_cluster_role" "spark_role" { + metadata { + name = "spark-cluster-role" + } - #Disable helm release - create_release = false + rule { + verbs = ["get", "list", "watch"] + api_groups = [""] + resources = ["namespaces", "nodes", "persistentvolumes"] + } - #IAM role for service account (IRSA) - create_role = true - role_name = "${local.name}-${local.data_team}" - create_policy = false - role_policies = { data_team_a_policy = aws_iam_policy.spark.arn } + rule { + verbs = ["list", "watch"] + api_groups = ["storage.k8s.io"] + resources = ["storageclasses"] + } + rule { + verbs = ["get", "list", "watch", "describe", "create", "edit", "delete", "deletecollection", "annotate", "patch", "label"] + api_groups = [""] + resources = ["serviceaccounts", "services", "configmaps", "events", "pods", "pods/log", "persistentvolumeclaims"] + } - oidc_providers = { - this = { - provider_arn = module.eks.oidc_provider_arn - namespace = local.data_team - service_account = local.data_team - } + rule { + verbs = ["create", "patch", "delete", "watch"] + api_groups = [""] + resources = ["secrets"] + } + + rule { + verbs = ["get", "list", "watch", "describe", "create", "edit", "delete", "annotate", "patch", "label"] + api_groups = ["apps"] + resources = ["statefulsets", "deployments"] } + + rule { + verbs = ["get", "list", "watch", "describe", "create", "edit", "delete", "annotate", "patch", "label"] + api_groups = ["batch", "extensions"] + resources = ["jobs"] + } + + rule { + verbs = ["get", "list", "watch", "describe", "create", "edit", "delete", "annotate", "patch", "label"] + api_groups = ["extensions"] + resources = ["ingresses"] + } + + rule { + verbs = ["get", "list", "watch", "describe", "create", "edit", "delete", "deletecollection", "annotate", "patch", "label"] + api_groups = ["rbac.authorization.k8s.io"] + resources = ["roles", "rolebindings"] + 
} + + depends_on = [module.spark_team_a_irsa] +} +#--------------------------------------------------------------- +# Kubernetes Cluster Role binding role for service Account spark-team-a +#--------------------------------------------------------------- +resource "kubernetes_cluster_role_binding" "spark_role_binding" { + metadata { + name = "spark-cluster-role-bind" + } + + subject { + kind = "ServiceAccount" + name = local.spark_team + namespace = local.spark_team + } + + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = kubernetes_cluster_role.spark_role.id + } + + depends_on = [module.spark_team_a_irsa] } #--------------------------------------------------------------- # Kubernetes Cluster role for argo workflows to run spark jobs #--------------------------------------------------------------- -resource "kubernetes_cluster_role" "spark_op_role" { +resource "kubernetes_cluster_role" "spark_argowf_role" { metadata { name = "spark-op-role" } @@ -122,14 +166,14 @@ resource "kubernetes_cluster_role" "spark_op_role" { } #--------------------------------------------------------------- -# Kubernetes Role binding for argo workflows/data-team-a +# Kubernetes Role binding for argo workflows/spark-team-a #--------------------------------------------------------------- -# Allow argo-workflows to run spark application in data-team-a +# Allow argo-workflows to run spark application in spark-team-a resource "kubernetes_role_binding" "spark_role_binding" { metadata { - name = "data-team-a-spark-rolebinding" - namespace = local.data_team + name = "spark-team-a-spark-rolebinding" + namespace = local.spark_team } subject { @@ -141,7 +185,7 @@ resource "kubernetes_role_binding" "spark_role_binding" { role_ref { api_group = "rbac.authorization.k8s.io" kind = "ClusterRole" - name = kubernetes_cluster_role.spark_op_role.id + name = kubernetes_cluster_role.spark_argowf_role.id } depends_on = [module.eks_blueprints_addons] @@ -169,17 +213,17 @@ resource "kubernetes_role_binding" "admin_rolebinding_argoworkflows" { depends_on = [module.eks_blueprints_addons] } -# Grant data-team-a admin role -resource "kubernetes_role_binding" "admin_rolebinding_data_team_a" { +# Grant spark-team-a admin role +resource "kubernetes_role_binding" "admin_rolebinding_spark_team_a" { metadata { - name = "data-team-a-admin-rolebinding" - namespace = local.data_team + name = "spark-team-a-admin-rolebinding" + namespace = local.spark_team } subject { kind = "ServiceAccount" name = "default" - namespace = local.data_team + namespace = local.spark_team } role_ref { @@ -188,7 +232,7 @@ resource "kubernetes_role_binding" "admin_rolebinding_data_team_a" { name = "admin" } - depends_on = [resource.kubernetes_namespace_v1.data_team_a] + depends_on = [resource.kubernetes_namespace_v1.spark_team_a] } #--------------------------------------------------------------- diff --git a/schedulers/terraform/argo-workflow/variables.tf b/schedulers/terraform/argo-workflow/variables.tf index 503e243ce..6b8b6d30d 100644 --- a/schedulers/terraform/argo-workflow/variables.tf +++ b/schedulers/terraform/argo-workflow/variables.tf @@ -1,6 +1,6 @@ variable "name" { description = "Name of the VPC and EKS Cluster" - default = "argoworkflows-eks" + default = "doeks-spark-argo" type = string } @@ -16,12 +16,50 @@ variable "eks_cluster_version" { type = string } +# VPC variable "vpc_cidr" { - description = "VPC CIDR" + description = "VPC CIDR. 
This should be a valid private (RFC 1918) CIDR range" default = "10.1.0.0/16" type = string } +# Routable Public subnets with NAT Gateway and Internet Gateway. Not required for fully private clusters +variable "public_subnets" { + description = "Public Subnets CIDRs. 62 IPs per Subnet/AZ" + default = ["10.1.0.0/26", "10.1.0.64/26"] + type = list(string) +} + +# Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping overlapping CIDRs +variable "private_subnets" { + description = "Private Subnets CIDRs. 254 IPs per Subnet/AZ for Private NAT + NLB + Airflow + EC2 Jumphost etc." + default = ["10.1.1.0/24", "10.1.2.0/24"] + type = list(string) +} + +# RFC6598 range 100.64.0.0/10 +# Note you can only /16 range to VPC. You can add multiples of /16 if required +variable "secondary_cidr_blocks" { + description = "Secondary CIDR blocks to be attached to VPC" + default = ["100.64.0.0/16"] + type = list(string) +} + +# EKS Worker nodes and pods will be placed on these subnets. Each Private subnet can get 32766 IPs. +# RFC6598 range 100.64.0.0/10 +variable "eks_data_plane_subnet_secondary_cidr" { + description = "Secondary CIDR blocks. 32766 IPs per Subnet per Subnet/AZ for EKS Node and Pods" + default = ["100.64.0.0/17", "100.64.128.0/17"] + type = list(string) +} + +# Enable this for fully private clusters +variable "enable_vpc_endpoints" { + description = "Enable VPC Endpoints" + default = false + type = bool +} + variable "enable_amazon_prometheus" { description = "Enable AWS Managed Prometheus service" type = bool diff --git a/schedulers/terraform/argo-workflow/vpc.tf b/schedulers/terraform/argo-workflow/vpc.tf index 83cd6071b..21762f37c 100644 --- a/schedulers/terraform/argo-workflow/vpc.tf +++ b/schedulers/terraform/argo-workflow/vpc.tf @@ -1,16 +1,31 @@ +#--------------------------------------------------------------- +# Supporting Network Resources +#--------------------------------------------------------------- +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. +# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" name = local.name - cidr = local.vpc_cidr + cidr = var.vpc_cidr + azs = local.azs + + # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods + secondary_cidr_blocks = var.secondary_cidr_blocks - azs = local.azs - private_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k)] - public_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 10)] + # 1/ EKS Data Plane secondary CIDR blocks for two subnets across two AZs for EKS Control Plane ENI + Nodes + Pods + # 2/ Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB + Airflow + EC2 Jumphost etc. 
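+  # With the default variable values this yields four private subnets (two RFC1918 /24s and two RFC6598 /17s); eks.tf later keeps only the subnets whose CIDR starts with "100." for the EKS control plane ENIs and worker nodes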
+ private_subnets = concat(var.private_subnets, var.eks_data_plane_subnet_secondary_cidr) + # ------------------------------ + # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments + # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW + public_subnets = var.public_subnets enable_nat_gateway = true single_nat_gateway = true + #------------------------------- public_subnet_tags = { "kubernetes.io/role/elb" = 1 @@ -18,7 +33,70 @@ module "vpc" { private_subnet_tags = { "kubernetes.io/role/internal-elb" = 1 + # Tags subnets for Karpenter auto-discovery + "karpenter.sh/discovery" = local.name } tags = local.tags } + +module "vpc_endpoints_sg" { + source = "terraform-aws-modules/security-group/aws" + version = "~> 5.0" + + create = var.enable_vpc_endpoints + + name = "${local.name}-vpc-endpoints" + description = "Security group for VPC endpoint access" + vpc_id = module.vpc.vpc_id + + ingress_with_cidr_blocks = [ + { + rule = "https-443-tcp" + description = "VPC CIDR HTTPS" + cidr_blocks = join(",", module.vpc.private_subnets_cidr_blocks) + }, + ] + + egress_with_cidr_blocks = [ + { + rule = "https-443-tcp" + description = "All egress HTTPS" + cidr_blocks = "0.0.0.0/0" + }, + ] + + tags = local.tags +} + +module "vpc_endpoints" { + source = "terraform-aws-modules/vpc/aws//modules/vpc-endpoints" + version = "~> 5.0" + + create = var.enable_vpc_endpoints + + vpc_id = module.vpc.vpc_id + security_group_ids = [module.vpc_endpoints_sg.security_group_id] + + endpoints = merge({ + s3 = { + service = "s3" + service_type = "Gateway" + route_table_ids = module.vpc.private_route_table_ids + tags = { + Name = "${local.name}-s3" + } + } + }, + { for service in toset(["autoscaling", "ecr.api", "ecr.dkr", "ec2", "ec2messages", "elasticloadbalancing", "sts", "kms", "logs", "ssm", "ssmmessages"]) : + replace(service, ".", "_") => + { + service = service + subnet_ids = module.vpc.private_subnets + private_dns_enabled = true + tags = { Name = "${local.name}-${service}" } + } + }) + + tags = local.tags +} diff --git a/schedulers/terraform/argo-workflow/workflow-example/argo-spark-operator.yaml b/schedulers/terraform/argo-workflow/workflow-example/argo-spark-operator.yaml deleted file mode 100644 index b4eb4b17f..000000000 --- a/schedulers/terraform/argo-workflow/workflow-example/argo-spark-operator.yaml +++ /dev/null @@ -1,45 +0,0 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Workflow -metadata: - name: spark-operator - namespace: argo-workflows -spec: - arguments: {} - entrypoint: sparkapp-operator - templates: - - name: sparkapp-operator - resource: - action: create - manifest: | - apiVersion: "sparkoperator.k8s.io/v1beta2" - kind: SparkApplication - metadata: - name: pyspark-pi - namespace: argo-workflows - spec: - type: Python - pythonVersion: "3" - mode: cluster - image: "gcr.io/spark-operator/spark-py:v3.1.1" - imagePullPolicy: Always - mainApplicationFile: local:///opt/spark/examples/src/main/python/pi.py - sparkVersion: "3.1.1" - restartPolicy: - type: OnFailure - onFailureRetries: 3 - onFailureRetryInterval: 10 - onSubmissionFailureRetries: 5 - onSubmissionFailureRetryInterval: 20 - driver: - cores: 1 - coreLimit: "1200m" - memory: "512m" - labels: - version: 3.1.1 - serviceAccount: default - executor: - cores: 1 - instances: 2 - memory: "512m" - labels: - version: 3.1.1 diff --git a/schedulers/terraform/argo-workflow/workflow-examples/argo-spark-operator.yaml b/schedulers/terraform/argo-workflow/workflow-examples/argo-spark-operator.yaml new file mode 
100644 index 000000000..4970e3429 --- /dev/null +++ b/schedulers/terraform/argo-workflow/workflow-examples/argo-spark-operator.yaml @@ -0,0 +1,69 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Workflow +metadata: + name: spark-operator + namespace: argo-workflows +spec: + arguments: {} + entrypoint: sparkapp-operator + nodeSelector: + NodeGroupType: SparkComputeOptimized + karpenter.sh/capacity-type: "on-demand" + tolerations: + - key: "spark-compute-optimized" + operator: "Exists" + effect: "NoSchedule" + templates: + - name: sparkapp-operator + resource: + action: create + manifest: | + apiVersion: "sparkoperator.k8s.io/v1beta2" + kind: SparkApplication + metadata: + name: pyspark-pi-karpenter-compute + namespace: spark-team-a + spec: + type: Python + pythonVersion: "3" + mode: cluster + image: "public.ecr.aws/r1l5w1y9/spark-operator:3.2.1-hadoop-3.3.1-java-11-scala-2.12-python-3.8-latest" + imagePullPolicy: Always + mainApplicationFile: local:///opt/spark/examples/src/main/python/pi.py + sparkVersion: "3.1.1" + restartPolicy: + type: OnFailure + onFailureRetries: 1 + onFailureRetryInterval: 10 + onSubmissionFailureRetries: 5 + onSubmissionFailureRetryInterval: 20 + driver: + cores: 1 + coreLimit: "1200m" + memory: "4g" + labels: + version: 3.1.1 + serviceAccount: spark-team-a + # Using Karpenter provisioner nodeSelectors and tolerations + nodeSelector: + NodeGroupType: SparkComputeOptimized + karpenter.sh/capacity-type: "on-demand" + tolerations: + - key: "spark-compute-optimized" + operator: "Exists" + effect: "NoSchedule" + executor: + cores: 1 + instances: 4 + memory: "4g" + serviceAccount: spark-team-a + labels: + version: 3.1.1 + # Using Karpenter provisioner nodeSelectors and tolerations + nodeSelector: + NodeGroupType: SparkComputeOptimized + karpenter.sh/capacity-type: "spot" + tolerations: + - key: "spark-compute-optimized" + operator: "Exists" + effect: "NoSchedule" diff --git a/schedulers/terraform/argo-workflow/workflow-example/argo-spark.yaml b/schedulers/terraform/argo-workflow/workflow-examples/argo-spark.yaml similarity index 94% rename from schedulers/terraform/argo-workflow/workflow-example/argo-spark.yaml rename to schedulers/terraform/argo-workflow/workflow-examples/argo-spark.yaml index d99445dcd..821dd28cb 100644 --- a/schedulers/terraform/argo-workflow/workflow-example/argo-spark.yaml +++ b/schedulers/terraform/argo-workflow/workflow-examples/argo-spark.yaml @@ -16,7 +16,7 @@ spec: "/bin/sh", "-c", "/opt/spark/bin/spark-submit \ - --master k8s:// \ + --master k8s:// \ --deploy-mode cluster \ --name sparkapp \ --class org.apache.spark.examples.SparkPi \ diff --git a/schedulers/terraform/argo-workflow/workflow-examples/pyspark-taxi-trip.py b/schedulers/terraform/argo-workflow/workflow-examples/pyspark-taxi-trip.py new file mode 100644 index 000000000..4a0cf0e05 --- /dev/null +++ b/schedulers/terraform/argo-workflow/workflow-examples/pyspark-taxi-trip.py @@ -0,0 +1,66 @@ +import logging +import sys +from datetime import datetime + +from pyspark.sql import SparkSession +from pyspark.sql.functions import * +from pyspark.sql import functions as f + +# Logging configuration +formatter = logging.Formatter('[%(asctime)s] %(levelname)s @ line %(lineno)d: %(message)s') +handler = logging.StreamHandler(sys.stdout) +handler.setLevel(logging.INFO) +handler.setFormatter(formatter) +logger = logging.getLogger() +logger.setLevel(logging.INFO) +logger.addHandler(handler) + +dt_string = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") +AppName = "NewYorkTaxiData" + + +def main(args): + + 
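+    # The script expects two arguments: args[1] is the S3 (s3a://) prefix holding the raw taxi-trip parquet files, args[2] is the prefix where the transformed output is written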
raw_input_folder = args[1] + transform_output_folder = args[2] + + # Create Spark Session + spark = SparkSession \ + .builder \ + .appName(AppName + "_" + str(dt_string)) \ + .getOrCreate() + + spark.sparkContext.setLogLevel("INFO") + logger.info("Starting spark application") + + logger.info("Reading Parquet file from S3") + ny_taxi_df = spark.read.parquet(raw_input_folder) + + # Add additional columns to the DF + final_ny_taxi_df = ny_taxi_df.withColumn("current_date", f.lit(datetime.now())) + + logger.info("NewYork Taxi data schema preview") + final_ny_taxi_df.printSchema() + + logger.info("Previewing New York Taxi data sample") + final_ny_taxi_df.show(20, truncate=False) + + logger.info("Total number of records: " + str(final_ny_taxi_df.count())) + + logger.info("Write New York Taxi data to S3 transform table") + final_ny_taxi_df.repartition(2).write.mode("overwrite").parquet(transform_output_folder) + + logger.info("Ending spark application") + # end spark code + spark.stop() + + return None + + +if __name__ == "__main__": + print(len(sys.argv)) + if len(sys.argv) != 3: + print("Usage: spark-etl [input-folder] [output-folder]") + sys.exit(0) + + main(sys.argv) diff --git a/schedulers/terraform/argo-workflow/workflow-examples/sensor-sqs-sparkjobs.yaml b/schedulers/terraform/argo-workflow/workflow-examples/sensor-sqs-sparkjobs.yaml new file mode 100644 index 000000000..c1eea5f7a --- /dev/null +++ b/schedulers/terraform/argo-workflow/workflow-examples/sensor-sqs-sparkjobs.yaml @@ -0,0 +1,318 @@ +# Pre-requisite before running this job +# 1/ Open taxi-trip-execute.sh and update and +# 2/ Replace with your S3 bucket created by this blueprint(Check Terraform outputs) +# 3/ execute taxi-trip-execute.sh + + +apiVersion: argoproj.io/v1alpha1 +kind: Sensor +metadata: + name: aws-sqs-crossns-spark + namespace: argo-events +spec: + nodeSelector: + NodeGroupType: SparkComputeOptimized + karpenter.sh/capacity-type: "on-demand" + tolerations: + - key: "spark-compute-optimized" + operator: "Exists" + effect: "NoSchedule" + template: + serviceAccountName: operate-workflow-sa + dependencies: + - name: test-dep + eventSourceName: aws-sqs + eventName: sqs-spark-workflow + triggers: + - template: + name: sqs-spark-workflow + k8s: + operation: create + source: + resource: + apiVersion: argoproj.io/v1alpha1 + kind: Workflow + metadata: + generateName: aws-sqs-spark-workflow- + namespace: argo-workflows + spec: + arguments: {} + entrypoint: parallel-jobs + nodeSelector: + NodeGroupType: SparkComputeOptimized + karpenter.sh/capacity-type: "on-demand" + tolerations: + - key: "spark-compute-optimized" + operator: "Exists" + effect: "NoSchedule" + templates: + - name: parallel-jobs + steps: + - - name: helloworld-job1 + template: whalesay + arguments: + parameters: [{name: message, value: "spark-start!"}] + - - name: spark-operator-pi-job + template: sparkapp-operator-pi + - name: helloworld-job2 + template: whalesay + arguments: + parameters: [{name: message, value: "spark-done!"}] + - - name: spark-operator-taxi-job + template: sparkapp-operator-taxi + - name: whalesay + inputs: + parameters: + - name: message + container: + image: docker/whalesay + command: [cowsay] + args: ["{{inputs.parameters.message}}"] + - name: sparkapp-operator-pi + resource: + action: create + manifest: | + apiVersion: "sparkoperator.k8s.io/v1beta2" + kind: SparkApplication + metadata: + generateName: event-wf-sparkapp-pi-yunikorn- + namespace: spark-team-a + spec: + type: Python + pythonVersion: "3" + mode: cluster + image: 
"public.ecr.aws/r1l5w1y9/spark-operator:3.2.1-hadoop-3.3.1-java-11-scala-2.12-python-3.8-latest" + imagePullPolicy: Always + mainApplicationFile: "local:///opt/spark/examples/src/main/python/pi.py" + sparkVersion: "3.1.1" + restartPolicy: + type: Never + volumes: + - name: "test-volume" + hostPath: + path: "/tmp" + type: Directory + driver: + cores: 1 + coreLimit: "1200m" + memory: "4g" + memoryOverhead: "4g" + serviceAccount: spark-team-a + nodeSelector: + NodeGroupType: "SparkComputeOptimized" + karpenter.sh/capacity-type: "on-demand" + tolerations: + - key: "spark-compute-optimized" + operator: "Exists" + effect: "NoSchedule" + labels: + version: 3.1.1 + annotations: + yunikorn.apache.org/schedulingPolicyParameters: "placeholderTimeoutSeconds=30 gangSchedulingStyle=Hard" + yunikorn.apache.org/task-group-name: "spark-driver" + yunikorn.apache.org/task-groups: |- + [{ + "name": "spark-driver", + "minMember": 1, + "minResource": { + "cpu": "1200m", + "memory": "14Gi" + }, + "nodeSelector": { + "NodeGroupType": "SparkComputeOptimized", + "karpenter.sh/capacity-type": "on-demand" + }, + "tolerations": [{"key": "spark-compute-optimized", "operator": "Exists", "effect": "NoSchedule"}] + }, + { + "name": "spark-executor", + "minMember": 4, + "minResource": { + "cpu": "1200m", + "memory": "14Gi" + }, + "nodeSelector": { + "NodeGroupType": "SparkComputeOptimized", + "karpenter.sh/capacity-type": "spot" + }, + "tolerations": [{"key": "spark-compute-optimized", "operator": "Exists", "effect": "NoSchedule"}] + }] + volumeMounts: + - name: "test-volume" + mountPath: "/tmp" + readOnly: false + executor: + cores: 1 + instances: 4 + memory: "4g" + memoryOverhead: "4g" + serviceAccount: spark-team-a + nodeSelector: + NodeGroupType: "SparkComputeOptimized" + karpenter.sh/capacity-type: "spot" + tolerations: + - key: "spark-compute-optimized" + operator: "Exists" + effect: "NoSchedule" + labels: + version: 3.3.1 + annotations: + yunikorn.apache.org/task-group-name: "spark-executor" + volumeMounts: + - name: "test-volume" + mountPath: "/tmp" + readOnly: false + - name: sparkapp-operator-taxi + resource: + action: create + manifest: | + apiVersion: "sparkoperator.k8s.io/v1beta2" + kind: SparkApplication + metadata: + generateName: event-wf-sparkapp-taxi-yunikorn- + namespace: spark-team-a + spec: + type: Python + sparkVersion: "3.2.1" + pythonVersion: "3" + mode: cluster + image: "public.ecr.aws/r1l5w1y9/spark-operator:3.2.1-hadoop-3.3.1-java-11-scala-2.12-python-3.8-latest" + imagePullPolicy: IfNotPresent + mainApplicationFile: "s3a:///taxi-trip/scripts/pyspark-taxi-trip.py" # MainFile is the path to a bundled JAR, Python, or R file of the application + arguments: + - "s3a:///taxi-trip/input/" + - "s3a:///taxi-trip/output/" + hadoopConf: + "fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider" + "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem" + "mapreduce.fileoutputcommitter.algorithm.version": "2" + sparkConf: + "spark.local.dir": "/data1" + "spark.speculation": "false" + "spark.network.timeout": "2400" + "spark.hadoop.fs.s3a.connection.timeout": "1200000" + "spark.hadoop.fs.s3a.path.style.access": "true" + "spark.hadoop.fs.s3a.connection.maximum": "200" + "spark.hadoop.fs.s3a.fast.upload": "true" + "spark.hadoop.fs.s3a.readahead.range": "256K" + "spark.hadoop.fs.s3a.input.fadvise": "random" + "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem" + # Spark Event logs + "spark.eventLog.enabled": "true" + "spark.eventLog.dir": 
"s3a:///spark-event-logs" + "spark.eventLog.rolling.enabled": "true" + "spark.eventLog.rolling.maxFileSize": "64m" + # "spark.history.fs.eventLog.rolling.maxFilesToRetain": 100 + # Expose Spark metrics for Prometheus + "spark.ui.prometheus.enabled": "true" + "spark.executor.processTreeMetrics.enabled": "true" + "spark.kubernetes.driver.annotation.prometheus.io/scrape": "true" + "spark.kubernetes.driver.annotation.prometheus.io/path": "/metrics/executors/prometheus/" + "spark.kubernetes.driver.annotation.prometheus.io/port": "4040" + "spark.kubernetes.driver.service.annotation.prometheus.io/scrape": "true" + "spark.kubernetes.driver.service.annotation.prometheus.io/path": "/metrics/driver/prometheus/" + "spark.kubernetes.driver.service.annotation.prometheus.io/port": "4040" + "spark.metrics.conf.*.sink.prometheusServlet.class": "org.apache.spark.metrics.sink.PrometheusServlet" + "spark.metrics.conf.*.sink.prometheusServlet.path": "/metrics/driver/prometheus/" + "spark.metrics.conf.master.sink.prometheusServlet.path": "/metrics/master/prometheus/" + "spark.metrics.conf.applications.sink.prometheusServlet.path": "/metrics/applications/prometheus/" + restartPolicy: + type: OnFailure + onFailureRetries: 3 + onFailureRetryInterval: 10 + onSubmissionFailureRetries: 5 + onSubmissionFailureRetryInterval: 20 + volumes: # using NVMe instance storage mounted on /mnt/k8s-disks + - name: spark-local-dir-1 + hostPath: + path: /mnt/k8s-disks + type: Directory + driver: + volumeMounts: # Points to InstanceStore 150GB NVMe SSD for shuffle spill over from memory + - name: spark-local-dir-1 + mountPath: /data1 + readOnly: false + initContainers: + - name: volume-permissions + image: public.ecr.aws/y4g4v0z7/busybox + command: [ 'sh', '-c', 'chown -R 185 /mnt/k8s-disks' ] + volumeMounts: + - mountPath: "/mnt/k8s-disks" + name: "spark-local-dir-1" + cores: 1 + coreLimit: "1200m" + memory: "4g" + memoryOverhead: "4g" + serviceAccount: spark-team-a + nodeSelector: + NodeGroupType: "SparkComputeOptimized" + karpenter.sh/capacity-type: "on-demand" + tolerations: + - key: "spark-compute-optimized" + operator: "Exists" + effect: "NoSchedule" + labels: + version: 3.2.1 + annotations: + yunikorn.apache.org/schedulingPolicyParameters: "placeholderTimeoutSeconds=30 gangSchedulingStyle=Hard" + yunikorn.apache.org/task-group-name: "spark-driver" + # minMember should match with driver and executor instances + # minResource cpu and memory should match with driver and executor cpu and memory + yunikorn.apache.org/task-groups: |- + [{ + "name": "spark-driver", + "minMember": 1, + "minResource": { + "cpu": "1200m", + "memory": "14Gi" + }, + "nodeSelector": { + "NodeGroupType": "SparkComputeOptimized", + "karpenter.sh/capacity-type": "on-demand" + }, + "tolerations": [{"key": "spark-compute-optimized", "operator": "Exists", "effect": "NoSchedule"}] + }, + { + "name": "spark-executor", + "minMember": 4, + "minResource": { + "cpu": "1200m", + "memory": "14Gi" + }, + "nodeSelector": { + "NodeGroupType": "SparkComputeOptimized", + "karpenter.sh/capacity-type": "spot" + }, + "tolerations": [{"key": "spark-compute-optimized", "operator": "Exists", "effect": "NoSchedule"}] + }] + executor: + podSecurityContext: + fsGroup: 185 + volumeMounts: + - name: spark-local-dir-1 + mountPath: /data1 + readOnly: false + initContainers: + - name: volume-permissions + image: public.ecr.aws/y4g4v0z7/busybox + command: [ 'sh', '-c', 'chown -R 185 /mnt/k8s-disks' ] + volumeMounts: + - mountPath: "/mnt/k8s-disks" + name: "spark-local-dir-1" + cores: 1 + 
coreLimit: "1200m" + instances: 4 + memory: "4g" + memoryOverhead: "4g" + serviceAccount: spark-team-a + labels: + version: 3.2.1 + annotations: + yunikorn.apache.org/task-group-name: "spark-executor" + nodeSelector: + NodeGroupType: "SparkComputeOptimized" + karpenter.sh/capacity-type: "spot" + tolerations: + - key: "spark-compute-optimized" + operator: "Exists" + effect: "NoSchedule" diff --git a/schedulers/terraform/argo-workflow/workflow-examples/taxi-trip-execute.sh b/schedulers/terraform/argo-workflow/workflow-examples/taxi-trip-execute.sh new file mode 100755 index 000000000..a281fbc67 --- /dev/null +++ b/schedulers/terraform/argo-workflow/workflow-examples/taxi-trip-execute.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# This job copies Sample PySpark script and some test data to your S3 bucket which will enable you to run the following Spark Operator script + +# Prerequisites for running this shell script +# 1/ Execute the shell script with the required arguments +# ./your_script.sh +# 2/ Ensure is replaced in "workflow-examples/sensor-sqs-sparkjobs.yaml" file +# 3/ Execute the shell script which creates the input data in your S3 bucket +# 4/ Run `kubectl apply -f workflow-examples/sensor-sqs-sparkjobs.yaml` to schedule the Spark job +# 5/ Monitor the Spark job using "kubectl get pods -n spark-team-a -w" + +# Script usage ./taxi-trip-execute my-s3-bucket us-west-2 + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi + +s3_bucket="$1" +region="$2" + +INPUT_DATA_S3_PATH="s3://${s3_bucket}/taxi-trip/input/" + +# Create a local input folder +mkdir input + +# Copy PySpark Script to S3 bucket +aws s3 cp pyspark-taxi-trip.py s3://${s3_bucket}/taxi-trip/scripts/ --region ${region} + +# Copy Test Input data to S3 bucket +wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "input/yellow_tripdata_2022-0.parquet" + +# Making duplicate copies to increase the size of the data. +max=100 +for (( i=1; i <= $max; ++i )) +do + cp -rf "input/yellow_tripdata_2022-0.parquet" "input/yellow_tripdata_2022-${i}.parquet" +done + +aws s3 sync "input/" ${INPUT_DATA_S3_PATH} + +# Delete a local input folder +rm -rf input diff --git a/schedulers/terraform/managed-airflow-mwaa/eks.tf b/schedulers/terraform/managed-airflow-mwaa/eks.tf index 4a332bb34..b7178c1a0 100644 --- a/schedulers/terraform/managed-airflow-mwaa/eks.tf +++ b/schedulers/terraform/managed-airflow-mwaa/eks.tf @@ -5,6 +5,7 @@ module "eks" { cluster_name = local.name cluster_version = var.eks_cluster_version + #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing. cluster_endpoint_public_access = true # if true, Your cluster API server is accessible from the internet. You can, optionally, limit the CIDR blocks that can access the public endpoint. vpc_id = module.vpc.vpc_id diff --git a/schedulers/terraform/managed-airflow-mwaa/vpc.tf b/schedulers/terraform/managed-airflow-mwaa/vpc.tf index 37fa3f671..a81e8fec9 100644 --- a/schedulers/terraform/managed-airflow-mwaa/vpc.tf +++ b/schedulers/terraform/managed-airflow-mwaa/vpc.tf @@ -1,6 +1,9 @@ #--------------------------------------------------------------- # Supporting Resources #--------------------------------------------------------------- +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. 
+# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" diff --git a/schedulers/terraform/self-managed-airflow/eks.tf b/schedulers/terraform/self-managed-airflow/eks.tf index bbf7a8d7f..fe6fb6cca 100644 --- a/schedulers/terraform/self-managed-airflow/eks.tf +++ b/schedulers/terraform/self-managed-airflow/eks.tf @@ -8,6 +8,7 @@ module "eks" { cluster_name = local.name cluster_version = var.eks_cluster_version + #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing. cluster_endpoint_public_access = true # if true, Your cluster API server is accessible from the internet. You can, optionally, limit the CIDR blocks that can access the public endpoint. vpc_id = module.vpc.vpc_id diff --git a/schedulers/terraform/self-managed-airflow/helm-values/airflow-values.yaml b/schedulers/terraform/self-managed-airflow/helm-values/airflow-values.yaml index 396564a95..539b5f405 100644 --- a/schedulers/terraform/self-managed-airflow/helm-values/airflow-values.yaml +++ b/schedulers/terraform/self-managed-airflow/helm-values/airflow-values.yaml @@ -38,7 +38,7 @@ ingress: annotations: alb.ingress.kubernetes.io/group.name: dataengineering alb.ingress.kubernetes.io/target-type: instance - alb.ingress.kubernetes.io/scheme: internet-facing + alb.ingress.kubernetes.io/scheme: internal # Private Load Balancer can only be accessed within the VPC alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}]' alb.ingress.kubernetes.io/healthcheck-path: '/health' # Enable the following if you have public/internal domain e.g., https://mycompany.com/ diff --git a/schedulers/terraform/self-managed-airflow/vpc.tf b/schedulers/terraform/self-managed-airflow/vpc.tf index 540df8337..3fb5d4bfd 100644 --- a/schedulers/terraform/self-managed-airflow/vpc.tf +++ b/schedulers/terraform/self-managed-airflow/vpc.tf @@ -1,6 +1,9 @@ #--------------------------------------------------------------- # Supporting Network Resources #--------------------------------------------------------------- +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. 
+# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" diff --git a/streaming/flink/README.md b/streaming/flink/README.md index 9b84b5b36..876d564fb 100755 --- a/streaming/flink/README.md +++ b/streaming/flink/README.md @@ -70,7 +70,7 @@ |------|-------------|------|---------|:--------:| | [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.27"` | no | | [enable\_amazon\_prometheus](#input\_enable\_amazon\_prometheus) | Enable AWS Managed Prometheus service | `bool` | `true` | no | -| [enable\_vpc\_endpoints](#input\_enable\_vpc\_endpoints) | Enable VPC Endpoints | `string` | `false` | no | +| [enable\_vpc\_endpoints](#input\_enable\_vpc\_endpoints) | Enable VPC Endpoints | `bool` | `false` | no | | [enable\_yunikorn](#input\_enable\_yunikorn) | Enable Apache YuniKorn Scheduler | `bool` | `true` | no | | [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"flink-operator-doeks"` | no | | [private\_subnets](#input\_private\_subnets) | Private Subnets CIDRs. 32766 Subnet1 and 16382 Subnet2 IPs per Subnet | `list(string)` |
[ "10.1.0.0/17", "10.1.128.0/18" ]
| no | diff --git a/streaming/flink/helm-values/nginx-values.yaml b/streaming/flink/helm-values/nginx-values.yaml index 191bbc429..e0fab61e6 100644 --- a/streaming/flink/helm-values/nginx-values.yaml +++ b/streaming/flink/helm-values/nginx-values.yaml @@ -3,7 +3,7 @@ controller: # For more annotations https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.4/guide/service/annotations/ annotations: service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4 - service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Private Load Balancer can only be accessed within the VPC service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-proxy-protocol: "*" service.beta.kubernetes.io/aws-load-balancer-backend-protocol: tcp diff --git a/streaming/flink/variables.tf b/streaming/flink/variables.tf index 5b66877fc..50b553397 100644 --- a/streaming/flink/variables.tf +++ b/streaming/flink/variables.tf @@ -25,7 +25,7 @@ variable "vpc_cidr" { variable "enable_vpc_endpoints" { description = "Enable VPC Endpoints" default = false - type = string + type = bool } # Only two Subnets for with low IP range for internet access diff --git a/streaming/flink/vpc.tf b/streaming/flink/vpc.tf index ba382a0b3..c6767fa55 100644 --- a/streaming/flink/vpc.tf +++ b/streaming/flink/vpc.tf @@ -1,6 +1,9 @@ #--------------------------------------------------------------- # Supporting Network Resources #--------------------------------------------------------------- +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. +# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" diff --git a/streaming/kafka/examples/kafka-producers-consumers.yaml b/streaming/kafka/examples/kafka-producers-consumers.yaml new file mode 100644 index 000000000..bad91a6f4 --- /dev/null +++ b/streaming/kafka/examples/kafka-producers-consumers.yaml @@ -0,0 +1,110 @@ + +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: java-kafka-producer + name: java-kafka-producer + namespace: kafka +spec: + replicas: 1 + selector: + matchLabels: + app: java-kafka-producer + template: + metadata: + labels: + app: java-kafka-producer + spec: + containers: + - name: java-kafka-producer + image: quay.io/strimzi-examples/java-kafka-producer:latest + env: + - name: STRIMZI_TOPIC + value: my-topic + - name: STRIMZI_DELAY_MS + value: "1000" + - name: STRIMZI_LOG_LEVEL + value: "INFO" + - name: STRIMZI_MESSAGE_COUNT + value: "1000000" + - name: KAFKA_BOOTSTRAP_SERVERS + value: cluster-kafka-bootstrap:9092 + - name: KAFKA_KEY_SERIALIZER + value: "org.apache.kafka.common.serialization.StringSerializer" + - name: KAFKA_VALUE_SERIALIZER + value: "org.apache.kafka.common.serialization.StringSerializer" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: java-kafka-streams + name: java-kafka-streams + namespace: kafka +spec: + replicas: 1 + selector: + matchLabels: + app: java-kafka-streams + template: + metadata: + labels: + app: java-kafka-streams + spec: + containers: + - name: java-kafka-streams + image: quay.io/strimzi-examples/java-kafka-streams:latest + env: 
+ - name: STRIMZI_SOURCE_TOPIC + value: my-topic + - name: STRIMZI_TARGET_TOPIC + value: my-topic-reversed + - name: STRIMZI_LOG_LEVEL + value: "INFO" + - name: KAFKA_BOOTSTRAP_SERVERS + value: cluster-kafka-bootstrap:9092 + - name: KAFKA_APPLICATION_ID + value: java-kafka-streams + - name: KAFKA_DEFAULT_COMMIT_INTERVAL_MS + value: "5000" + - name: KAFKA_DEFAULT_KEY_SERDE + value: "org.apache.kafka.common.serialization.Serdes$StringSerde" + - name: KAFKA_DEFAULT_VALUE_SERDE + value: "org.apache.kafka.common.serialization.Serdes$StringSerde" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: java-kafka-consumer + name: java-kafka-consumer + namespace: kafka +spec: + replicas: 1 + selector: + matchLabels: + app: java-kafka-consumer + template: + metadata: + labels: + app: java-kafka-consumer + spec: + containers: + - name: java-kafka-consumer + image: quay.io/strimzi-examples/java-kafka-consumer:latest + env: + - name: STRIMZI_TOPIC + value: my-topic-reversed + - name: STRIMZI_LOG_LEVEL + value: "INFO" + - name: STRIMZI_MESSAGE_COUNT + value: "1000000" + - name: KAFKA_BOOTSTRAP_SERVERS + value: cluster-kafka-bootstrap:9092 + - name: KAFKA_GROUP_ID + value: java-kafka-consumer + - name: KAFKA_KEY_DESERIALIZER + value: "org.apache.kafka.common.serialization.StringDeserializer" + - name: KAFKA_VALUE_DESERIALIZER + value: "org.apache.kafka.common.serialization.StringDeserializer" diff --git a/streaming/kafka/examples/kafka-topics.yaml b/streaming/kafka/examples/kafka-topics.yaml index 55eaad49a..b11fd0734 100644 --- a/streaming/kafka/examples/kafka-topics.yaml +++ b/streaming/kafka/examples/kafka-topics.yaml @@ -1,10 +1,21 @@ -apiVersion: kafka.strimzi.io/v1beta1 +apiVersion: kafka.strimzi.io/v1beta2 kind: KafkaTopic metadata: - name: test-topic + name: my-topic namespace: kafka labels: strimzi.io/cluster: cluster spec: - partitions: 3 replicas: 3 + partitions: 12 +--- +apiVersion: kafka.strimzi.io/v1beta2 +kind: KafkaTopic +metadata: + name: my-topic-reversed + namespace: kafka + labels: + strimzi.io/cluster: cluster +spec: + replicas: 3 + partitions: 12 diff --git a/streaming/kafka/helm-values/strimzi-kafka-values.yaml b/streaming/kafka/helm-values/strimzi-kafka-values.yaml index 8731f4acb..2d755db84 100644 --- a/streaming/kafka/helm-values/strimzi-kafka-values.yaml +++ b/streaming/kafka/helm-values/strimzi-kafka-values.yaml @@ -10,7 +10,7 @@ watchAnyNamespace: true defaultImageRegistry: quay.io defaultImageRepository: strimzi -defaultImageTag: 0.35.0 +defaultImageTag: 0.38.0 nodeSelector: kubernetes.io/os: ${operating_system} diff --git a/streaming/kafka/install.sh b/streaming/kafka/install.sh index 5f2472cc6..83bc21076 100755 --- a/streaming/kafka/install.sh +++ b/streaming/kafka/install.sh @@ -12,6 +12,9 @@ targets=( "module.eks_data_addons" ) +# Initialize Terraform +terraform init -upgrade + # Apply modules in sequence for target in "${targets[@]}" do diff --git a/streaming/kafka/kafka-manifests/kafka-cluster.yaml b/streaming/kafka/kafka-manifests/kafka-cluster.yaml index 2444492c3..88752b9df 100644 --- a/streaming/kafka/kafka-manifests/kafka-cluster.yaml +++ b/streaming/kafka/kafka-manifests/kafka-cluster.yaml @@ -4,10 +4,9 @@ metadata: name: cluster namespace: kafka spec: - cruiseControl: {} kafka: - version: 3.4.0 - replicas: 3 + version: 3.6.0 + replicas: 4 listeners: - name: plain port: 9092 @@ -17,13 +16,15 @@ spec: port: 9093 type: internal tls: true + authentication: + type: tls config: offsets.topic.replication.factor: 3 
transaction.state.log.replication.factor: 3 transaction.state.log.min.isr: 2 default.replication.factor: 3 min.insync.replicas: 2 - inter.broker.protocol.version: "3.4" + inter.broker.protocol.version: "3.6" resources: requests: memory: 58Gi @@ -37,11 +38,11 @@ spec: storage: type: jbod volumes: - - id: 0 - type: persistent-claim - size: 1000Gi - class: gp3 - deleteClaim: false + - id: 0 + type: persistent-claim + size: 1000Gi + class: gp3 + deleteClaim: false template: pod: tolerations: @@ -54,10 +55,10 @@ spec: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - - key: NodeGroupType - operator: In - values: - - kafka + - key: NodeGroupType + operator: In + values: + - kafka metricsConfig: type: jmxPrometheusExporter valueFrom: @@ -66,7 +67,6 @@ spec: key: kafka-metrics-config.yml rack: topologyKey: topology.kubernetes.io/zone - kafkaExporter: {} zookeeper: replicas: 3 storage: @@ -83,6 +83,10 @@ spec: entityOperator: topicOperator: {} userOperator: {} + cruiseControl: {} + kafkaExporter: + topicRegex: ".*" + groupRegex: ".*" --- kind: ConfigMap @@ -248,3 +252,10 @@ data: labels: replicaId: "$2" memberType: "$3" + cruise-control-config.yml: | + # See https://github.com/prometheus/jmx_exporter for more info about JMX Prometheus Exporter metrics + lowercaseOutputName: true + rules: + - pattern: kafka.cruisecontrol<>(\w+) + name: kafka_cruisecontrol_$1_$2 + type: GAUGE diff --git a/streaming/kafka/kafka-manifests/kafka-rebalance.yaml b/streaming/kafka/kafka-manifests/kafka-rebalance.yaml new file mode 100644 index 000000000..f1bb6cce9 --- /dev/null +++ b/streaming/kafka/kafka-manifests/kafka-rebalance.yaml @@ -0,0 +1,22 @@ +apiVersion: kafka.strimzi.io/v1beta2 +kind: KafkaRebalance +metadata: + name: my-rebalance + namespace: kafka + labels: + strimzi.io/cluster: cluster +spec: + goals: + - RackAwareGoal + - ReplicaCapacityGoal + - DiskCapacityGoal + - NetworkInboundCapacityGoal + - NetworkOutboundCapacityGoal + - CpuCapacityGoal + - ReplicaDistributionGoal + - DiskUsageDistributionGoal + - NetworkInboundUsageDistributionGoal + - NetworkOutboundUsageDistributionGoal + - TopicReplicaDistributionGoal + - LeaderReplicaDistributionGoal + - LeaderBytesInDistributionGoal diff --git a/streaming/kafka/main.tf b/streaming/kafka/main.tf index 9fdba8c6f..30aa04e53 100644 --- a/streaming/kafka/main.tf +++ b/streaming/kafka/main.tf @@ -33,8 +33,9 @@ module "eks" { source = "terraform-aws-modules/eks/aws" version = "~> 19.15" - cluster_name = local.name - cluster_version = local.cluster_version + cluster_name = local.name + cluster_version = local.cluster_version + #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing. 
cluster_endpoint_public_access = true vpc_id = module.vpc.vpc_id @@ -74,7 +75,7 @@ module "eks" { name = "core-node-group" description = "EKS managed node group example launch template" - min_size = 1 + min_size = 3 max_size = 9 desired_size = 3 @@ -85,7 +86,7 @@ module "eks" { xvda = { device_name = "/dev/xvda" ebs = { - volume_size = 100 + volume_size = 1000 volume_type = "gp3" } } @@ -98,6 +99,7 @@ module "eks" { Name = "core-node-grp" } } + kafka_node_group = { name = "kafka-node-group" description = "EKS managed node group example launch template" @@ -107,13 +109,14 @@ module "eks" { desired_size = 5 instance_types = ["r6i.2xlarge"] - ebs_optimized = true + + ebs_optimized = true # This is the root filesystem Not used by the brokers block_device_mappings = { xvda = { device_name = "/dev/xvda" ebs = { - volume_size = 100 + volume_size = 1000 volume_type = "gp3" } } diff --git a/streaming/kafka/vpc.tf b/streaming/kafka/vpc.tf index 83cd6071b..64a53d0b7 100644 --- a/streaming/kafka/vpc.tf +++ b/streaming/kafka/vpc.tf @@ -1,3 +1,6 @@ +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. +# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" diff --git a/streaming/nifi/helm-values/nifi-values.yaml b/streaming/nifi/helm-values/nifi-values.yaml index 6db5ca060..5487ca377 100644 --- a/streaming/nifi/helm-values/nifi-values.yaml +++ b/streaming/nifi/helm-values/nifi-values.yaml @@ -38,7 +38,7 @@ ingress: annotations: alb.ingress.kubernetes.io/backend-protocol: HTTPS alb.ingress.kubernetes.io/certificate-arn: ${ssl_cert_arn} - alb.ingress.kubernetes.io/scheme: internet-facing + alb.ingress.kubernetes.io/scheme: internal # Private Load Balancer can only be accessed within the VPC alb.ingress.kubernetes.io/target-group-attributes: stickiness.enabled=true,stickiness.lb_cookie.duration_seconds=60 alb.ingress.kubernetes.io/target-type: ip external-dns.alpha.kubernetes.io/hostname: ${hostname} diff --git a/streaming/nifi/vpc.tf b/streaming/nifi/vpc.tf index 2715f853d..e684731e2 100644 --- a/streaming/nifi/vpc.tf +++ b/streaming/nifi/vpc.tf @@ -1,6 +1,9 @@ #--------------------------------------------------------------- # VPC #--------------------------------------------------------------- +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. 
+# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" diff --git a/tfsec.yaml b/tfsec.yaml index 73b3338dc..1fed7f24d 100644 --- a/tfsec.yaml +++ b/tfsec.yaml @@ -1,3 +1,4 @@ +--- exclude: - aws-iam-no-policy-wildcards # Wildcards required in addon IAM policies - aws-vpc-no-excessive-port-access # VPC settings left up to user implementation for recommended practices @@ -6,3 +7,18 @@ exclude: - aws-eks-no-public-cluster-access # Public access enabled for better example usability, users are recommended to disable if possible - aws-eks-encrypt-secrets # Module defaults to encrypting secrets with CMK, but this is not hardcoded and therefore a spurious error - aws-vpc-no-public-egress-sgr # Added in v1.22 + - aws-eks-enable-control-plane-logging # Control plane logging is not required for these blueprints + - aws-eks-enable-control-plane-audit # Control plane audit is not required for these blueprints + - aws-eks-no-public-cluster-endpoint-access # Public access enabled for better example usability, users are recommended to disable if possible + - aws-eks-no-public-cluster-endpoint-access-to-cidr # Public access enabled for better example usability, users are recommended to disable if possible + - aws-ec2-no-excessive-port-access # VPC settings left up to user implementation for recommended practices + - aws-ec2-no-public-ingress-acl # VPC settings left up to user implementation for recommended practices + - aws-ec2-no-public-egress-sgr # VPC settings left up to user implementation for recommended practices + - aws-s3-enable-bucket-logging # S3 Bucket Logging is not required for these blueprints + - aws-s3-no-public-buckets # Default behavior for S3 buckets, this exclusion acknowledges the intentional behavior + - aws-s3-ignore-public-acls # Default behavior for S3 buckets, this exclusion acknowledges the intentional behavior + - aws-s3-block-public-policy # Default behavior for S3 buckets, this exclusion acknowledges the intentional behavior + - aws-s3-block-public-acls # Default behavior for S3 buckets, this exclusion acknowledges the intentional behavior + - aws-s3-enable-versioning # Versioning is not required for these blueprints + - aws-s3-specify-public-access-block # Default behavior for S3 buckets, this exclusion acknowledges the intentional behavior + - aws-ec2-no-public-ip-subnet # Public IPs are required for some examples diff --git a/website/docs/bestpractices/networking/networking.md b/website/docs/bestpractices/networking/networking.md index c30836818..bcc6c2ab6 100644 --- a/website/docs/bestpractices/networking/networking.md +++ b/website/docs/bestpractices/networking/networking.md @@ -22,7 +22,7 @@ For example, your VPC may be limited to small subnets like below. In this VPC we ![Init VPC](init-vpc.png) -You can add additional VPC CIDRs from a range that is not routable across VPCs (such as the RFC 6598 range, `100.64.0.0/10`). In this case we added `100.64.0.0/16`, `100.65.0.0/16`, and `100.65.0.0/16` to the VPC (as this is the maximum CIDR size), then created new subnets with those CIDRs. +You can add additional VPC CIDRs from a range that is not routable across VPCs (such as the RFC 6598 range, `100.64.0.0/10`). 
In this case we added `100.64.0.0/16`, `100.65.0.0/16`, and `100.66.0.0/16` to the VPC (as this is the maximum CIDR size), then created new subnets with those CIDRs. Finally we recreated the node groups in the new subnets, leaving the existing EKS cluster control plane in place. ![expanded VPC](expanded-vpc.png) diff --git a/website/docs/blueprints/ai-ml/index.md b/website/docs/blueprints/ai-ml/index.md index ed6280ac3..32ddaf982 100644 --- a/website/docs/blueprints/ai-ml/index.md +++ b/website/docs/blueprints/ai-ml/index.md @@ -5,12 +5,13 @@ sidebar_label: Introduction # AI/ML Platforms on EKS -Running AI/ML platforms on Kubernetes can greatly simplify and automate the deployment, scaling, and management of these complex applications. There are a number of popular tools and technologies that have emerged to support this use case, including **TensorFlow**, **PyTorch** and **KubeFlow**. +Running AI/ML platforms on Kubernetes can greatly simplify and automate the deployment, scaling, and management of these complex applications. There are a number of popular tools and technologies that have emerged to support this use case, including **TensorFlow**, **PyTorch**, **Ray**, **MLFlow**, etc. + These tools make it easy to deploy AI/ML models in a containerized environment, and provide features such as automatic scaling, rolling updates, and self-healing capabilities to ensure high availability and reliability. By leveraging the power of Kubernetes, organizations can focus on building and training their AI/ML models, rather than worrying about the underlying infrastructure. With its robust ecosystem of tools and support for a wide range of use cases, Kubernetes is becoming an increasingly popular choice for running AI/ML platforms in production. The following Terraform templates are available to deploy. * [Ray on EKS](ray.md): This template deploys [RayCluster](https://docs.ray.io/en/latest/cluster/getting-started.html) on EKS. -* [Kubeflow on AWS](kubeflow.md): This template deploys the [Kubeflow on AWS](https://awslabs.github.io/kubeflow-manifests/) distribution on EKS. + * [EMR NVIDIA Spark-RAPIDS](emr-spark-rapids.md): This template deploys the EMR NVIDIA Spark-RAPIDS blueprint with NVIDIA GPU Operator. diff --git a/website/docs/blueprints/ai-ml/kubeflow.md b/website/docs/blueprints/ai-ml/kubeflow.md deleted file mode 100644 index cd57381de..000000000 --- a/website/docs/blueprints/ai-ml/kubeflow.md +++ /dev/null @@ -1,79 +0,0 @@ ---- -sidebar_position: 6 -sidebar_label: Kubeflow on AWS ---- -import CollapsibleContent from '../../../src/components/CollapsibleContent'; - -# Kubeflow on AWS - -## Introduction - -**Kubeflow on AWS** is an open source distribution of [Kubeflow](https://www.kubeflow.org/) that allows customers to build machine learning systems with ready-made AWS service integrations. Use **Kubeflow on AWS** to streamline data science tasks and build highly reliable, secure, and scalable machine learning systems with reduced operational overheads. - -The open source repository for the **Kubeflow on AWS** distribution is available under [awslabs](https://github.com/awslabs/kubeflow-manifests) GitHub organization. - -## Kubeflow - -Kubeflow is the machine learning toolkit for Kubernetes. It provides a set of tools that enable developers to build, deploy, and manage machine learning workflows at scale. 
The following diagram shows Kubeflow as a platform for arranging the components of your ML system on top of Kubernetes: - -![Kubeflow](img/kubeflow-overview-platform-diagram.svg) - -*Source: https://www.kubeflow.org/docs/started/architecture/* - -## AWS Features for Kubeflow - -### Architecture - -![KubeflowOnAws](img/ML-8280-image003.jpg) - -*Source: https://aws.amazon.com/blogs/machine-learning/build-and-deploy-a-scalable-machine-learning-system-on-kubernetes-with-kubeflow-on-aws/* - -Running **Kubeflow on AWS** gives you the following feature benefits and configuration options: - -### Manage AWS compute environments - -* Provision and manage your Amazon Elastic Kubernetes Service (EKS) clusters with eksctl and easily configure multiple compute and GPU node configurations. -* Use AWS-optimized container images, based on [AWS Deep Learning Containers](https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/what-is-dlc.html), with Kubeflow Notebooks. - -### CloudWatch Logs and Metrics - -* Integrate **Kubeflow on AWS** with [Amazon CloudWatch](https://aws.amazon.com/cloudwatch/) for persistent logging and metrics on EKS clusters and Kubeflow pods. -* Use [AWS Container Insights](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/ContainerInsights.html) to collect, aggregate, and summarize metrics and logs from your containerized applications and microservices. - -### Load balancing, certificates, and identity management - -* Manage external traffic with [AWS Application Load Balancer](https://docs.aws.amazon.com/elasticloadbalancing/latest/application/introduction.html). -* Get started with TLS authentication using [AWS Certificate Manager](https://aws.amazon.com/certificate-manager/) and [AWS Cognito](https://aws.amazon.com/cognito/). - -### AWS database and storage solutions - -* Integrate Kubeflow with [Amazon Relational Database Service (RDS)](https://aws.amazon.com/rds/) for a highly scalable pipelines and metadata store. -* Deploy Kubeflow with integrations for [Amazon S3](https://aws.amazon.com/s3/) for an easy-to-use pipeline artifacts store. -* Use Kubeflow with [Amazon Elastic File System (EFS)](https://aws.amazon.com/efs/) for a simple, scalabale, and serverless storage solution. -* Leverage the [Amazon FSx CSI driver](https://github.com/kubernetes-sigs/aws-fsx-csi-driver) to manage Lustre file systems which are optimized for compute-intensive workloads, such as high-performance computing and machine learning. [Amazon FSx for Lustre](https://aws.amazon.com/fsx/lustre/) can scale to hundreds of GBps of throughput and millions of IOPS. - -### Integrate with Amazon SageMaker - -* Use **Kubeflow on AWS** with [Amazon SageMaker](https://aws.amazon.com/sagemaker/) to create hybrid machine learning workflows. -* Train, tune, and deploy machine learning models in Amazon SageMaker without logging into the SageMaker console using [SageMaker Operators for Kubernetes (ACK)](https://github.com/aws-controllers-k8s/sagemaker-controller). -* Create a [Kubeflow Pipeline](https://www.kubeflow.org/docs/components/pipelines/v1/introduction/#what-is-kubeflow-pipelines) built entirely using [SageMaker Components for Kubeflow Pipelines](https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker), or integrate individual components into your workflow as needed. - - -## Deployment - -:::caution -Terraform deployment options mentioned below are still in preview. 
-::: - -:::caution -Please make sure to visit the [version compability](https://awslabs.github.io/kubeflow-manifests/docs/about/eks-compatibility/) page to ensure the Kubeflow version you are planning to run is compatible with the EKS version. -::: - -**Kubeflow on AWS** can be deployed on an existing EKS cluster using Kustomize or Helm. Additionally, terraform templates are also made available if an EKS cluster is not available and needs to be created. AWS provides various Kubeflow deployment options: - -* [Vanilla deployment](https://awslabs.github.io/kubeflow-manifests/docs/deployment/vanilla/) -* [Deployment with Amazon RDS and Amazon S3](https://awslabs.github.io/kubeflow-manifests/docs/deployment/rds-s3/) -* [Deployment with Amazon Cognito](https://awslabs.github.io/kubeflow-manifests/docs/deployment/cognito/) -* [Deployment with Amazon Cognito, Amazon RDS, and Amazon S3](https://awslabs.github.io/kubeflow-manifests/docs/deployment/cognito-rds-s3/) - -Please visit the [deployment](https://awslabs.github.io/kubeflow-manifests/docs/deployment/) documentation on the **Kubeflow on AWS** website for the deployment options available and steps for each of those options. diff --git a/website/docs/blueprints/ai-ml/monitoring-kubeflow.md b/website/docs/blueprints/ai-ml/monitoring-kubeflow.md deleted file mode 100644 index cc9818c0b..000000000 --- a/website/docs/blueprints/ai-ml/monitoring-kubeflow.md +++ /dev/null @@ -1,399 +0,0 @@ ---- -sidebar_position: 7 -sidebar_label: Observability for Kubeflow on EKS ---- -import CollapsibleContent from '../../../src/components/CollapsibleContent'; - -# Monitor Machine Learning workflows with Kubeflow on Amazon EKS - -As part of day 2 operations, customers want to monitor their Infrastructure, Amazon EKS clusters and application components. AWS customers use Amazon EKS to run machine learning workloads. Containerization allows machine learning engineers to package and distribute models easily, while Kubernetes helps in deploying, scaling, and improving. In addition to monitoring the behavior of the Amazon EKS clusters, it’s essential to monitor the behavior of machine learning workflows as well to ensure the operational resilience of workloads and platforms run by an organization. - -[Kubeflow](https://www.kubeflow.org/) is the open-source machine learning (ML) platform dedicated to making deployments of machine learning (ML) workflows on Kubernetes simple, portable and scalable. Kubeflow provides many components, including a central dashboard, multi-user Jupyter notebooks, Kubeflow Pipelines, KFServing, and Katib, as well as distributed training operators for [TensorFlow](https://www.tensorflow.org/), [PyTorch](https://pytorch.org/), [MXNet](https://mxnet.apache.org/versions/1.9.1/), and [XGBoost](https://xgboost.readthedocs.io/en/stable/). Kubeflow components export metrics which provides insights into the health and function of Kubeflow on [Amazon Elastic Kubernetes Service (EKS)](https://aws.amazon.com/eks/). - -[OpenTelemetry](https://opentelemetry.io/docs/concepts/what-is-opentelemetry/) is a set of APIs, SDKs, and tools that are designed for the creation and management of telemetry data such as traces, metrics, and logs. [AWS Distro for OpenTelemetry Collector (ADOT Collector)](https://github.com/aws-observability/aws-otel-collector) is an AWS-supported version of the upstream OpenTelemetry Collector that is fully compatible with AWS computing platforms, including EKS. 
It enables users to send telemetry data to AWS managed services such as Amazon CloudWatch, Amazon Managed Service for Prometheus, and AWS X-Ray. In this post, We’ll show how you can configure an [Amazon Elastic Kubernetes Service (Amazon EKS)](https://aws.amazon.com/eks/) cluster with Kubeflow, [Amazon Managed Service for Prometheus](https://aws.amazon.com/prometheus/), and[Amazon Managed Grafana](https://aws.amazon.com/grafana/) using [AWS Distro for OpenTelemetry (ADOT)](https://aws-otel.github.io/docs/introduction) for monitoring your Kubeflow machine learning workflows. - -## **Architecture** - -The following diagram shows the complete setup that we will walk through in this walk through: - -![Mon-Kubeflow](img/mon-kubeflow-1.jpg) - -## Solution Walkthrough - -### Prerequisites - -You will need the following to complete the steps in this post: - -* An [Ubuntu development environment](https://awslabs.github.io/kubeflow-manifests/docs/deployment/prerequisites/) with access to an AWS environment -* Install [awscurl](https://github.com/okigan/awscurl) which is a curl-like tool with AWS Signature Version 4 request signing on your environment - -First, Let’s start by setting a few environment variables: - -```bash -export KFL_EKS_CLUSTER=KFL-EKS-CLUSTER -export KFL_EKS_CLUSTER_V=1.25 -export KFL_ACCOUNT_ID=$(aws sts get-caller-identity --query 'Account' --output text) -export KFL_AWS_REGION=us-west-2 # Your AWS Region -export AWS_REGION=us-west-2 # Your AWS Region -export KFL_AMP_WORKSPACE_NAME=kubeflow-amp-workshop -export CLUSTER_NAME=KFL-EKS-CLUSTER -export CLUSTER_REGION=us-west-2 -export KUBEFLOW_RELEASE_VERSION=v1.7.0 -export AWS_RELEASE_VERSION=v1.7.0-aws-b1.0.1 -``` - -Next, let's start with installing prerequisites such as [AWS CLI version 2](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html), [eksctl](https://eksctl.io/introduction/#installation), [kubectl](https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html), [python3.8](https://www.python.org/downloads/release/python-389/), [yq](https://mikefarah.gitbook.io/yq/), [jq](https://stedolan.github.io/jq/download/), [awscurl,](https://github.com/okigan/awscurl)[kustomize version 5+](https://kubectl.docs.kubernetes.io/installation/kustomize/) required to run the demonstration. Clone the `awslabs/kubeflow-manifests` [repo](https://github.com/awslabs/kubeflow-manifests) and checkout a release. Substitute the value for `AWS_RELEASE_VERSION` with `v1.7.0-aws-b1.0.1` and run the following command. Read more about [releases and versioning](https://github.com/awslabs/kubeflow-manifests/blob/v1.3-branch/distributions/aws/examples/README.md#releases-and-versioning) policy to determine the right version for you for installing Kubeflow. - -```bash -git clone https://github.com/awslabs/kubeflow-manifests.git && cd kubeflow-manifests -git checkout ${AWS_RELEASE_VERSION} -git clone --branch ${KUBEFLOW_RELEASE_VERSION} https://github.com/kubeflow/manifests.git upstream -`make install``-``tools` -``` - -### Create an EKS Cluster - -Let’s create an Amazon EKS cluster using `eksctl`: - -```bash -## eksctl Cluster creation command for EKS cluster. 
-eksctl create cluster \ - --name $KFL_EKS_CLUSTER \ - --version $KFL_EKS_CLUSTER_V \ - --region $KFL_AWS_REGION \ - --nodegroup-name linux-nodes \ - --node-type m5.xlarge \ - --nodes 5 \ - --nodes-min 1 \ - --nodes-max 10 \ - --managed \ - --with-oidc -``` - -### Installing **Amazon Elastic Block Store (EBS) Container Storage Interface Driver** - -A [Container Storage Interface (CSI) driver](https://kubernetes.io/blog/2019/01/15/container-storage-interface-ga/) is needed in order to get your `PersisentVolumeClaims` served by a `PersistentVolume`. Please run the following commands to create Amazon EBS CSI driver IAM role and add EBS CSI add-on : - -```bash -eksctl create iamserviceaccount \ - --name ebs-csi-controller-sa \ - --namespace kube-system \ - --cluster $KFL_EKS_CLUSTER \ - --attach-policy-arn arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy \ - --approve \ - --role-only \ - --role-name AmazonEKS_EBS_CSI_DriverRole - -eksctl create addon \ - --name aws-ebs-csi-driver \ - --cluster $KFL_EKS_CLUSTER \ - --service-account-role-arn arn:aws:iam::$(aws sts get-caller-identity --query Account --output text):role/AmazonEKS_EBS_CSI_DriverRole \ - --force -``` - -### Installing Kubeflow - -You can install all Kubeflow official components (residing under `apps`) and all common services (residing under `common`) using the following command: - -```bash -make deploy-kubeflow INSTALLATION_OPTION=kustomize DEPLOYMENT_OPTION=vanilla -``` - -It takes around 5 minutes for all components to get installed. Once everything is installed successfully, you can access the Kubeflow Central Dashboard. [Kubeflow on AWS](https://awslabs.github.io/kubeflow-manifests/) page has more information for learning open source distribution of [Kubeflow](https://www.kubeflow.org/) on AWS. - -After installation, it will take some time for all Pods to become ready. Make sure all Pods are ready before trying to connect, otherwise you might get unexpected errors. To check that all Kubeflow-related Pods are ready, use the following command: - -```bash -kubectl get pods -A -o json | jq -r '.items[] | select(.metadata.namespace=="cert-manager" or "istio-system" or "auth" or "knative-eventing" or "knative-serving" or "kubeflow" or "kubeflow-user-example-com") | .metadata.namespace + "|" + .metadata.name + "|" + .status.phase' -``` - -### Accessing Kubeflow Central Dashboard - -Kubeflow can be accessed via port-forward and this enables you to get started quickly without imposing any requirements on your environment. Run the following to port-forward Istio's Ingress-Gateway to local port `8080`: - -```bash -kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80 -``` - -After running the command, you can access the Kubeflow Central Dashboard by doing the following: - -1. Dex is an OpenID Connect Identity (OIDC) with multiple authentication backends. Open your browser and visit `http://localhost:8080` and You should get the Dex login screen. -2. Login with the default user's credential. The default email address is `user@example.com` and the default password is `12341234`. - -![Mon-Kubeflow](img/mon-kubeflow-2.jpg) - -### Setup Amazon Managed Service for Prometheus - -A workspace in [Amazon Managed Service for Prometheus](https://aws.amazon.com/prometheus/) is a logical and isolated Prometheus server dedicated to Prometheus resources such as metrics. 
A workspace supports fine-grained access control for authorizing its management such as update, list, describe, delete, and the ingestion and querying of metrics. - -Please open a new terminal window and setup all environment variables as you did in start of the demonstration. Please use the below command to create an Amazon Managed Service for Prometheus workspace. - -```bash -aws amp create-workspace \ - --alias $KFL_AMP_WORKSPACE_NAME \ - --region $KFL_AWS_REGION -``` - -The Amazon Managed Service for Prometheus workspace should be created in just a few seconds. - -As a best practice, create a [VPC endpoint](https://docs.aws.amazon.com/vpc/latest/privatelink/create-interface-endpoint.html)for Amazon Managed Service for Prometheus in VPC running your Amazon EKS cluster. Please visit [Using Amazon Managed Service for Prometheus with interface VPC endpoints](https://docs.aws.amazon.com/prometheus/latest/userguide/AMP-and-interface-VPC.html) for more information. - -### Setting up the AWS Distro for OpenTelemetry (ADOT) Collector to Ingest Metrics - -Amazon Managed Service for Prometheus does not directly scrape operational metrics from containerized workloads in a Kubernetes or ECS cluster. It requires users to deploy a collection agent such as Prometheus server or an OpenTelemetry agent such as the AWS Distro for OpenTelemetry Collector in their cluster to perform this task. - -One of the easiest ways to collect Prometheus metrics from Amazon EKS workloads is by using the [AWS Distro for OpenTelemetry (ADOT) collector](https://aws-otel.github.io/docs/getting-started/collector). Customers can deploy the ADOT Collector in a variety of deployment models and easily manage configuration using the ADOT Operator. The [ADOT Operator is also available as an EKS Add-On](https://docs.aws.amazon.com/eks/latest/userguide/opentelemetry.html)for easier deployment and management. Read our [launch blog](https://aws.amazon.com/blogs/containers/metrics-and-traces-collection-using-amazon-eks-add-ons-for-aws-distro-for-opentelemetry/)to learn about this feature. - -The best way to provision permissions for resources running on EKS clusters is through [IRSA](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html). The command below will use AWS CloudFormation to create a K8s namespace called `prometheus`, create a K8s Service Account called `amp-iamproxy-ingest-role`, create a new IAM Role with the `AmazonPrometheusRemoteWriteAccess` policy attached to it. It will also create a trust policy between the EKS cluster's IAM OpenID Connect Provider (OIDC) and the created Service Account. See [this link](https://eksctl.io/usage/iamserviceaccounts/) to learn more about this command. - -```bash -kubectl create namespace prometheus -eksctl create iamserviceaccount \ - --name amp-iamproxy-ingest-role \ - --namespace prometheus \ - --cluster $KFL_EKS_CLUSTER \ - --attach-policy-arn arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess \ - --approve \--override-existing-serviceaccounts -``` - -Next, we will grant permissions to Amazon EKS add-ons to install ADOT and then we will installing the ADOT Add-on : - -```bash -kubectl apply -f https://amazon-eks.s3.amazonaws.com/docs/addons-otel-permissions.yaml -aws eks create-addon \ - --addon-name adot \ - --cluster-name $KFL_EKS_CLUSTER -``` - -Now, wait for 30 seconds and execute the following command. You should see `"ACTIVE"` as result indicating that the add-on is installed successfully. 
- -```bash -aws eks describe-addon \ - --addon-name adot \ - --cluster-name $KFL_EKS_CLUSTER | jq .addon.status -``` - -Next, we will Install the OTel Collector Custom Resource Definition(CRD) and then we will configure the ADOT collector to push metrics to Amazon Managed Service for Prometheus endpoint. - -```bash -KFL_WORKSPACE_ID=$(aws amp list-workspaces \ - --alias $KFL_AMP_WORKSPACE_NAME \ - --region=${KFL_AWS_REGION} \ - --query 'workspaces[0].[workspaceId]' \ - --output text) -KFL_AMP_ENDPOINT_URL=$(aws amp describe-workspace \ - --workspace-id $KFL_WORKSPACE_ID | jq .workspace.prometheusEndpoint -r) -KFL_AMP_REMOTE_WRITE_URL=${KFL_AMP_ENDPOINT_URL}api/v1/remote_write -curl -O https://raw.githubusercontent.com/aws-samples/one-observability-demo/main/PetAdoptions/cdk/pet_stack/resources/otel-collector-prometheus.yaml -sed -i -e s/AWS_REGION/$KFL_AWS_REGION/g otel-collector-prometheus.yaml -sed -i -e s^AMP_WORKSPACE_URL^$KFL_AMP_REMOTE_WRITE_URL^g otel-collector-prometheus.yaml -kubectl apply -f ./otel-collector-prometheus.yaml -``` - -Now, lets verify that the ADOT collector is running and you should see a result like the one below showing that the collector has been successfully installed and being ready. - -```bash -kubectl get all -n prometheus -``` - -``` -NAME READY STATUS RESTARTS AGEpod/observability-collector-5774bbc68d-7nj54 1/1 Running 0 59s - -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -service/observability-collector-monitoring ClusterIP 10.100.114.1 8888/TCP 59s - -NAME READY UP-TO-DATE AVAILABLE AGE -deployment.apps/observability-collector 1/1 1 1 59s - -NAME DESIRED CURRENT READY AGE -replicaset.apps/observability-collector-5774bbc68d 1 1 1 59s -``` - -Now you have successfully deployed the ADOT Collector to collect metrics from the EKS cluster and send it to the Amazon Managed Service for Prometheus workspace you created. To test whether Amazon Managed Service for Prometheus received the metrics, use `awscurl`. This tool enables you to send HTTP requests through the command line with AWS Sigv4 authentication, so you must have AWS credentials set up locally with the correct permissions to query from Amazon Managed Service for Prometheus. For instructions on installing awscurl, see [awscurl](https://github.com/okigan/awscurl). - -```bash -awscurl --service="aps" \ - --region="$KFL_AWS_REGION" "https://aps-workspaces.$KFL_AWS_REGION.amazonaws.com/workspaces/$KFL_WORKSPACE_ID/api/v1/query?query=istio_requests_total" -``` - -Your results should look similar to shown below: - -``` -{ - "status": "success", - "data": { - "resultType": "vector", - "result": [ - { - "metric": { - "__name__": "istio_requests_total", - "app": "istio-ingressgateway", - "chart": "gateways", - .................................... - .................................... - "version": "v1" - }, - "value": [ - 1647974689.212, - "1" - ] - } - ] - } -} -``` - -### Amazon Managed Grafana Setup - -Two steps are necessary for setting up AWS IAM Identity Center, setting up and logging in to Amazon Managed Grafana, and querying metrics from Amazon Managed Service for Prometheus workspace from the post. To set up Authentication and Authorization, follow the instructions in the [Amazon Managed Grafana User Guide](https://docs.aws.amazon.com/grafana/latest/userguide/AMG-manage-users-and-groups-AMG.html) for enabling AWS IAM Identity Center. Second, setup the data source for [Amazon Managed Service for Prometheus](https://docs.aws.amazon.com/grafana/latest/userguide/AMP-adding-AWS-config.html). 
You may also reference [Monitor Istio on EKS using Amazon Managed Prometheus and Amazon Managed Grafana](https://aws.amazon.com/blogs/mt/monitor-istio-on-eks-using-amazon-managed-prometheus-and-amazon-managed-grafana/#:~:text=AWS%20Single%20Sign%2DOn%20(SSO)) blog, starting from the AWS Single Sign-On (SSO) section for Amazon Managed Grafana setup. - -### Query Kubeflow Metrics - -Next lets navigate to Amazon Managed Grafana console and import Grafana dashboards which allows us to visualize metrics from Istio environment. Go to the `plus` sign on the left navigation bar and select `Import` as shown below: - -![Mon-Kubeflow](img/mon-kubeflow-3.jpg) - -In the Import screen, type `7630 (Istio Workload Dashboard)` in `Import via grafana.com` textbox and click `Load. `Select the Prometheus data source in the drop down at the bottom and click on `Import`. Once complete, you will be able to see the Grafana dashboard showing metrics from the `Istio Workload Dashboard` through Prometheus data source as shown below: - -![Mon-Kubeflow](img/mon-kubeflow-4.jpg) - -### Creating a sample Machine Learning pipeline in Kubeflow - -Now that we have configured Amazon Managed Grafana with the Prometheus data source within our cluster, we can initiate a Machine Learning pipeline in Kubeflow, and be able to display metrics on the Grafana dashboards. - -Before we create the notebook to use Kubeflow Pipelines SDK, we have to supply a token so that the notebook can authenticate with the Kubeflow Pipelines API. To do so, run the following command to create a Pod to mount a token volume: - -```yaml -cat < Clone a Repository` from the top navigation bar and paste `https://github.com/aws-samples/aws-deeplearning-labs` and press enter. -* Open the following notebook from the directory view in the left pane: `aws-deeplearning-labs/workshop/pytorch-distributed-training/STEP2_simple_xgboost_training_pipeline.ipynb`. -* Run all the cells of the model by selecting `Kernel -> Restart Kernel and Run All Cells` from the top menu - -![Mon-Kubeflow](img/mon-kubeflow-7.jpg) - -### Visualizing Machine Learning pipeline metrics on Amazon Managed Grafana - -Using the Amazon Managed Grafana, we can show the resource utilization from our Machine Learning Pipelines with the same method we used to look above: using the `Istio Workload Dashboard` (7630). Select the following to monitor your resources for this particular ML pipeline: - -* Datasource: `your prometheus workspace name` -* Namespace: `kubeflow-user-example-com` -* Workload: `ml-training-notebook` - -![Mon-Kubeflow](img/mon-kubeflow-8.jpg) - -### Alerting Kubeflow workflows with Amazon Managed Grafana - -As we configure workflows with Kubeflow, alerting is a mechanism we can employ to alert on specific situations. By quickly identifying unintended changes in your workflow and notifying the same using alerts, you can take actions to minimize disruptions to your services. Amazon Managed Grafana supports multiple notification channels such as SNS, Slack, PagerDuty etc to which you can send alerts notifications. [Alerts](https://docs.aws.amazon.com/grafana/latest/userguide/alerts-overview.html) page will show you more information on how to setup alerts in Amazon Managed Grafana. You learn about setting up alerts from Amazon Managed Grafana to [Slack](https://slack.com/) from our [Monitoring hybrid environments using Amazon Managed Grafana](https://aws.amazon.com/blogs/mt/monitoring-hybrid-environments-using-amazon-managed-service-for-grafana/) blog. 
Also check our Blog on [Monitor Istio on EKS using Amazon Managed Prometheus and Amazon Managed Grafana](https://aws.amazon.com/blogs/mt/monitor-istio-on-eks-using-amazon-managed-prometheus-and-amazon-managed-grafana/) which will show you on triggering Amazon Managed Grafana alerts to [PagerDuty](https://www.pagerduty.com/). - -## Clean-up - -Use the following commands to clean up the created AWS resources for this demonstration: - -```bash -# Clean up ADOT Collector and Prometheus. -kubectl delete -f https://amazon-eks.s3.amazonaws.com/docs/addons-otel-permissions.yaml -kubectl delete -f ./otel-collector-prometheus.yaml -rm -rf ./otel-collector-prometheus.yaml - -aws eks delete-addon \ - --addon-name adot \ - --cluster-name $KFL_EKS_CLUSTER - -aws amp delete-workspace \ - --workspace-id $KFL_WORKSPACE_ID \ - --region $KFL_AWS_REGION - -eksctl delete iamserviceaccount \ - --name amp-iamproxy-ingest-role \ - --namespace prometheus \ - --cluster $KFL_EKS_CLUSTER - -kubectl delete namespace prometheus - -# Cleaning up kubeflow installation components -make delete-kubeflow INSTALLATION_OPTION=kustomize DEPLOYMENT_OPTION=vanilla -cd .. -rm -rf kubeflow-manifests - -eksctl delete iamserviceaccount \ - --name ebs-csi-controller-sa \ - --namespace kube-system \ - --cluster $KFL_EKS_CLUSTER - -aws eks delete-addon \ - --addon-name aws-ebs-csi-driver \ - --cluster-name $KFL_EKS_CLUSTER - -# Cleaning up Amazon EKS Cluster. -eksctl delete cluster --region $AWS_REGION --name $KFL_EKS_CLUSTER -``` - -Finally navigate to Amazon Managed Grafana console to delete the created Grafana workspace. - -## Conclusion - -This post demonstrated the detailed steps on how you can setup Amazon EKS cluster with Kubeflow, Amazon Managed Service for Prometheus and Amazon Managed Grafana to monitor your Kubeflow machine learning workflows. - -It is also important to have a centralized incident management process to keep systems running smoothly. You can view more details on alerting in and various supported providers at [alert notifications](https://docs.aws.amazon.com/grafana/latest/userguide/alert-notifications.html) for Amazon Managed Grafana. You can also check out previous blogs posts [Amazon Managed Service for Prometheus Alert Manager to receive alerts with PagerDuty](https://aws.amazon.com/blogs/mt/using-amazon-managed-service-for-prometheus-alert-manager-to-receive-alerts-with-pagerduty/) and [how to integrate Amazon Managed Service for Prometheus with Slack](https://aws.amazon.com/blogs/mt/how-to-integrate-amazon-managed-service-for-prometheus-with-slack/) to see how you can setup alerting with Amazon Managed Service for Prometheus. - -For further reading on Kubeflow deployment and monitoring on Amazon EKS, check out [Build and deploy a scalable machine learning system on Kubernetes with Kubeflow on AWS](https://aws.amazon.com/blogs/machine-learning/build-and-deploy-a-scalable-machine-learning-system-on-kubernetes-with-kubeflow-on-aws/) and [CloudWatch add-on for Kubeflow.](https://awslabs.github.io/kubeflow-manifests/docs/deployment/add-ons/cloudwatch/guide/). 
diff --git a/website/docs/blueprints/data-analytics/datahub-on-eks.md b/website/docs/blueprints/data-analytics/datahub-on-eks.md index e1195c17e..87496cd04 100644 --- a/website/docs/blueprints/data-analytics/datahub-on-eks.md +++ b/website/docs/blueprints/data-analytics/datahub-on-eks.md @@ -78,7 +78,7 @@ chmod +x install.sh ### Verify Deployment -After the deployment completes, we can access the DataHub UI and test importing metadata from sample datasources. For demo purpose, this blueprint creates the Ingress object for the datahub FrontEnd UI with public LoadBalancer(internet-facing). For production workloads, you can modify datahub_values.yaml to use internal LB: +After the deployment completes, we can access the DataHub UI and test importing metadata from sample datasources. For demo purposes, this blueprint creates the Ingress object for the DataHub FrontEnd UI with an internal LoadBalancer, which can only be accessed from within the VPC. You can modify datahub_values.yaml if you need a different load balancer scheme: ``` datahub-frontend: @@ -90,7 +90,7 @@ datahub-frontend: enabled: true annotations: kubernetes.io/ingress.class: alb - alb.ingress.kubernetes.io/scheme: **internet-facing** + alb.ingress.kubernetes.io/scheme: **internal** # Private Load Balancer can only be accessed within the VPC alb.ingress.kubernetes.io/target-type: instance ``` diff --git a/website/docs/blueprints/distributed-databases/cloudnative-postgres.md b/website/docs/blueprints/distributed-databases/cloudnative-postgres.md index b20bfbdf3..4a9748288 100644 --- a/website/docs/blueprints/distributed-databases/cloudnative-postgres.md +++ b/website/docs/blueprints/distributed-databases/cloudnative-postgres.md @@ -55,7 +55,7 @@ Navigate into cloudnative-postgres folder and run `install.sh` script. By defaul ```bash cd data-on-eks/distributed-databases/cloudnative-postgres -./install .sh +./install.sh ``` ### Verify Deployment diff --git a/website/docs/blueprints/job-schedulers/argo-workflows-eks.md b/website/docs/blueprints/job-schedulers/argo-workflows-eks.md index 3d895c219..93e024b17 100644 --- a/website/docs/blueprints/job-schedulers/argo-workflows-eks.md +++ b/website/docs/blueprints/job-schedulers/argo-workflows-eks.md @@ -77,8 +77,8 @@ kubectl get ns # Output should look like below NAME STATUS AGE argo-events Active 7m45s -argo-workflows Active 8m25s -data-team-a Active 5m51s +argo-workflows Active 8m25s +spark-team-a Active 5m51s default Active 25m karpenter Active 21m kube-node-lease Active 25m @@ -91,24 +91,32 @@ yunikorn Active 5m44s ### Access Argo Workflow WebUI +Get the load balancer URL: + +```bash +kubectl -n argo-workflows get service argo-workflows-server -o jsonpath="{.status.loadBalancer.ingress[*].hostname}{'\n'}" +``` + +Copy and paste the result into your browser. +The initial username is `admin`.
The login token is autogenerated and you can get it by running the following command: + ```bash -kubectl -n argo-workflows port-forward deployment.apps/argo-workflows-server 2746:2746 argo auth token # get login token # result: Bearer k8s-aws-v1.aHR0cHM6Ly9zdHMudXMtd2VzdC0yLmFtYXpvbmF3cy5jb20vP0FjdGlvbj1HZXRDYWxsZXJJZGVudGl0eSZWZXJzaW9uPTIwMTEtMDYtMTUmWC1BbXotQWxnb3JpdGhtPUFXUzQtSE1BQy1TSEEyNTYmWC1BbXotQ3JlZGVudGlhbD1BS0lBVkNWNFhDV1dLUjZGVTRGMiUyRjIwMjIxMDEzJTJGdXMtd2VzdC0yJTJGc3RzJTJGYXdzNF9yZXF1ZXN0JlgtQW16LURhdGU9MjAyMjEwMTNUMDIyODAyWiZYLUFtei1FeHBpcmVzPTYwJlgtQW16LVNpZ25lZEhlYWRlcnM9aG9zdCUzQngtazhzLWF3cy1pZCZYLUFtei1TaWduYXR1cmU9NmZiNmMxYmQ0MDQyMWIwNTI3NjY4MzZhMGJiNmUzNjg1MTk1YmM0NDQzMjIyMTg5ZDNmZmE1YzJjZmRiMjc4OA ``` -Open browser and enter `http://localhost:2746/` and paste the token - ![argo-workflow-login](img/argo-workflow-login.png) ### Submit Spark Job with Argo Workflow -Modify `workflow-example/argo-spark.yaml` with your eks api server url +Export EKS API from `terraform output` ```bash -kubectl apply -f workflow-example/argo-spark.yaml +eks_api_url=https://ABCDEFG1234567890.yl4.eu-west-2.eks.amazonaws.com + +cat workflow-examples/argo-spark.yaml | sed "s//$eks_api_url/g" | kubectl apply -f - kubectl get wf -n argo-workflows NAME STATUS AGE MESSAGE @@ -122,7 +130,7 @@ You can also check the workflow status from Web UI ### Submit Spark Job with Spark Operator and Argo Workflow ```bash -kubectl apply -f workflow-example/argo-spark-operator.yaml +kubectl apply -f workflow-examples/argo-spark-operator.yaml kubectl get wf -n argo-workflows NAME STATUS AGE MESSAGE @@ -148,9 +156,9 @@ In this case, we configure a EventSource to license to the queue `test1` in regi ```bash queue_name=test1 -reqion_sqs=us-east-1 +region_sqs=us-east-1 -kubectl apply -f argo-events-manifests/eventsource-sqs.yaml +cat argo-events-manifests/eventsource-sqs.yaml | sed "s//$region_sqs/g;s//$queue_name/g" | kubectl apply -f - ``` Let's create that queue in your account. @@ -160,19 +168,33 @@ Let's create that queue in your account. 
queue_url=$(aws sqs create-queue --queue-name $queue_name --region $region_sqs --output text) # get your queue arn -aws sqs get-queue-attributes --queue-url $queue_url --attribute-names QueueArn --region $region_sqs +sqs_queue_arn=$(aws sqs get-queue-attributes --queue-url $queue_url --attribute-names QueueArn --region $region_sqs --query "Attributes.QueueArn" --output text) + +template=`cat argo-events-manifests/sqs-accesspolicy.json | sed -e "s||$sqs_queue_arn|g;s||$your_event_irsa_arn|g"` -#Replace the following values in argo-events/sqs-accesspolicy.json -# -# (you can get from terraform output) -aws sqs set-queue-attributes --queue-url $queue_url --attributes file://argo-events/sqs-accesspolicy.json --region $region_sqs +aws sqs set-queue-attributes --queue-url $queue_url --attributes $template --region $region_sqs ``` ### Deploy `sensor-rbac.yaml` and `sensor-sqs-spark-crossns.yaml` for triggering workflow ```bash kubectl apply -f argo-events-manifests/sensor-rbac.yaml -kubectl apply -f argo-events-manifests/sensor-sqs-sparkjobs.yaml +``` + +```bash +cd workflow-examples +``` + +Update the variables in the shell script, then execute it: + +```bash +./taxi-trip-execute.sh +``` + +Update the YAML file and run the command below: + +```bash +kubectl apply -f sensor-sqs-sparkjobs.yaml ``` ### Verify argo-events namespace @@ -250,10 +272,10 @@ NAMESPACE NAME STATUS AGE MESSAGE argo-workflows aws-sqs-spark-workflow-hh79p Running 11s ``` -Run the command below to check spark application driver pods and executor pods under data-team-a namespace. +Run the command below to check spark application driver pods and executor pods under spark-team-a namespace. ```bash -kubectl get po -n data-team-a +kubectl get po -n spark-team-a # Output should look like below NAME READY STATUS RESTARTS AGE @@ -276,7 +298,5 @@ To teardown and remove the resources created in this example: ```bash kubectl delete -f argo-events-manifests/. -terraform destroy -target="module.irsa_argo_events" -target="module.kubernetes_data_addons" -target="module.eks_blueprints_addons" -auto-approve -var region=$region -terraform destroy -target="module.eks" -auto-approve -var region=$region -terraform destroy -auto-approve -var region=$region +./cleanup.sh ``` diff --git a/website/docs/blueprints/job-schedulers/self-managed-airflow.md b/website/docs/blueprints/job-schedulers/self-managed-airflow.md index 137395cff..fcd77a86f 100644 --- a/website/docs/blueprints/job-schedulers/self-managed-airflow.md +++ b/website/docs/blueprints/job-schedulers/self-managed-airflow.md @@ -150,7 +150,7 @@ Amazon Postgres RDS database password can be fetched from the Secrets manager ### Login to Airflow Web UI -This deployment creates an Ingress object with public LoadBalancer(internet-facing) for demo purpose +This deployment creates an Ingress object with an internal LoadBalancer (a private Load Balancer that can only be accessed from within the VPC) for demo purposes. For production workloads, you can modify `airflow-values.yaml` to choose `internal` LB. In addition, it's also recommended to use Route53 for Airflow domain and ACM for generating certificates to access Airflow on HTTPS port.
Execute the following command to get the ALB DNS name diff --git a/website/docs/gen-ai/index.md b/website/docs/gen-ai/index.md new file mode 100644 index 000000000..fe479145b --- /dev/null +++ b/website/docs/gen-ai/index.md @@ -0,0 +1,19 @@ +--- +sidebar_position: 1 +sidebar_label: Overview +--- + +# Gen AI on EKS + +Welcome to Gen AI on Amazon Elastic Kubernetes Service (EKS), your gateway to harnessing the power of Large Language Models (LLMs) for a wide range of applications. This introduction page serves as your starting point to explore our offerings for Training, Fine-tuning, and Inference using various LLMs, including BERT-Large, Llama2, Stable Diffusion, and more. + +## [Training](https://awslabs.github.io/docs/category/training-on-eks) +Are you ready to dive into the world of LLMs and train models for your specific needs? Discover our comprehensive Training resources to get started. + +## [Fine-tuning](https://awslabs.github.io/docs/category/training-on-eks) +Fine-tuning LLMs is crucial for tailoring them to your specific tasks. Explore our Fine-tuning section to learn how to adapt LLMs to your unique requirements. + +## [Inference](https://awslabs.github.io/docs/category/inference-on-eks) +Unlock the potential of LLMs for powerful inference tasks. Our Inference resources will guide you through deploying LLMs effectively. + +Whether you're an experienced practitioner or new to the field, our Gen AI on EKS capabilities empower you to harness the latest advancements in language modeling. Dive into each section to begin your journey. diff --git a/website/docs/gen-ai/inference/Llama2.md b/website/docs/gen-ai/inference/Llama2.md new file mode 100644 index 000000000..7fc77accb --- /dev/null +++ b/website/docs/gen-ai/inference/Llama2.md @@ -0,0 +1,306 @@ +--- +title: Llama-2 on Inferentia +sidebar_position: 1 +--- +import CollapsibleContent from '../../../src/components/CollapsibleContent'; + + +:::danger + +Note: Use of this Llama-2 model is governed by the Meta license. +In order to download the model weights and tokenizer, please visit the [website](https://ai.meta.com/) and accept the license before requesting access. + +::: + +:::info + +We are actively enhancing this blueprint to incorporate improvements in observability, logging, and scalability aspects. + +::: + + +# Deploying Llama-2-13b Chat Model with Inferentia, Ray Serve and Gradio +Welcome to the comprehensive guide on deploying the [Meta Llama-2-13b chat](https://ai.meta.com/llama/#inside-the-model) model on Amazon Elastic Kubernetes Service (EKS) using [Ray Serve](https://docs.ray.io/en/latest/serve/index.html). +In this tutorial, you will not only learn how to harness the power of Llama-2, but also gain insights into the intricacies of deploying large language models (LLMs) efficiently, particularly on [trn1/inf2](https://aws.amazon.com/machine-learning/neuron/) (powered by AWS Trainium and Inferentia) instances, such as `inf2.24xlarge` and `inf2.48xlarge`, +which are optimized for deploying and scaling large language models. + +### What is Llama-2? +Llama-2 is a pretrained large language model (LLM) trained on 2 trillion tokens of text and code. It is one of the largest and most powerful LLMs available today. Llama-2 can be used for a variety of tasks, including natural language processing, text generation, and translation. + +#### Llama-2-chat +Llama-2 is a remarkable language model that has undergone a rigorous training process. It starts with pretraining using publicly available online data. 
An initial version of Llama-2-chat is then created through supervised fine-tuning. +Following that, `Llama-2-chat` undergoes iterative refinement using Reinforcement Learning from Human Feedback (`RLHF`), which includes techniques like rejection sampling and proximal policy optimization (`PPO`). +This process results in a highly capable and fine-tuned language model that we will guide you to deploy and utilize effectively on **Amazon EKS** with **Ray Serve**. + +Llama-2 is available in three different model sizes: + +- **Llama-2-70b:** This is the largest Llama-2 model, with 70 billion parameters. It is the most powerful Llama-2 model and can be used for the most demanding tasks. +- **Llama-2-13b:** This is a medium-sized Llama-2 model, with 13 billion parameters. It is a good balance between performance and efficiency, and can be used for a variety of tasks. +- **Llama-2-7b:** This is the smallest Llama-2 model, with 7 billion parameters. It is the most efficient Llama-2 model and can be used for tasks that do not require the highest level of performance. + +### **Which Llama-2 model size should I use?** +The best Llama-2 model size for you will depend on your specific needs. and it may not always be the largest model for achieving the highest performance. It's advisable to evaluate your needs and consider factors such as computational resources, response time, and cost-efficiency when selecting the appropriate Llama-2 model size. The decision should be based on a comprehensive assessment of your application's goals and constraints. + +## Inference on Trn1/Inf2 Instances: Unlocking the Full Potential of Llama-2 +**Llama-2** can be deployed on a variety of hardware platforms, each with its own set of advantages. However, when it comes to maximizing the efficiency, scalability, and cost-effectiveness of Llama-2, [AWS Trn1/Inf2 instances](https://aws.amazon.com/ec2/instance-types/inf2/) shine as the optimal choice. + +**Scalability and Availability** +One of the key challenges in deploying large language models (`LLMs`) like Llama-2 is the scalability and availability of suitable hardware. Traditional `GPU` instances often face scarcity due to high demand, making it challenging to provision and scale resources effectively. +In contrast, `Trn1/Inf2` instances, such as `trn1.32xlarge`, `trn1n.32xlarge`, `inf2.24xlarge` and `inf2.48xlarge`, are purpose built for high-performance deep learning (DL) training and inference of generative AI models, including LLMs. They offer both scalability and availability, ensuring that you can deploy and scale your `Llama-2` models as needed, without resource bottlenecks or delays. + +**Cost Optimization:** +Running LLMs on traditional GPU instances can be cost-prohibitive, especially given the scarcity of GPUs and their competitive pricing. +**Trn1/Inf2** instances provide a cost-effective alternative. By offering dedicated hardware optimized for AI and machine learning tasks, Trn1/Inf2 instances allow you to achieve top-notch performance at a fraction of the cost. +This cost optimization enables you to allocate your budget efficiently, making LLM deployment accessible and sustainable. + +**Performance Boost** +While Llama-2 can achieve high-performance inference on GPUs, Neuron accelerators take performance to the next level. Neuron accelerators are purpose-built for machine learning workloads, providing hardware acceleration that significantly enhances Llama-2's inference speeds. 
This translates to faster response times and improved user experiences when deploying Llama-2 on Trn1/Inf2 instances. + +### Model Specification +The following table provides information about the different sizes of Llama-2 models, their weights, and the hardware requirements for deploying them. This information can be used to design the infrastructure required to deploy any size of Llama-2 model. For example, if you want to deploy the `Llama-2-13b-chat` model, you will need to use an instance type with at least `26 GB` of total accelerator memory. + +| Model | Weights | Bytes | Parameter Size (Billions) | Total Accelerator Memory (GB) | Accelerator Memory Size for NeuronCore (GB) | Required Neuron Cores | Required Neuron Accelerators | Instance Type | tp_degree | +|-----------------|---------|-------|-----------------------------|------------------------------|---------------------------------------------|-----------------------|-----------------------------|-----------------|-----------| +| Meta/Llama-2-70b | float16 | 2 | 70 | 140 | 16 | 9 | 5 | inf2.48x | 24 | +| Meta/Llama-2-13b | float16 | 2 | 13 | 26 | 16 | 2 | 1 | inf2.24x | 12 | +| Meta/Llama-2-7b | float16 | 2 | 7 | 14 | 16 | 1 | 1 | inf2.24x | 12 | + +### Example use case +A company wants to deploy a Llama-2 chatbot to provide customer support. The company has a large customer base and expects to receive a high volume of chat requests at peak times. The company needs to design an infrastructure that can handle the high volume of requests and provide a fast response time. + +The company can use Inferentia2 instances to scale its Llama-2 chatbot efficiently. Inferentia2 instances are specialized hardware accelerators for machine learning tasks. They can provide up to 20x better performance and up to 7x lower cost than GPUs for machine learning workloads. + +The company can also use Ray Serve to horizontally scale its Llama-2 chatbot. Ray Serve is a distributed framework for serving machine learning models. It can automatically scale your models up or down based on demand. + +To scale its Llama-2 chatbot, the company can deploy multiple Inferentia2 instances and use Ray Serve to distribute the traffic across the instances. This will allow the company to handle a high volume of requests and provide a fast response time. + +## Solution Architecture +In this section, we will delve into the architecture of our solution, which combines the Llama-2 model, [Ray Serve](https://docs.ray.io/en/latest/serve/index.html), and [Inferentia2](https://aws.amazon.com/ec2/instance-types/inf2/) on Amazon EKS. + +![Llama-2-inf2](img/llama2-inf2.png) + +## Deploying the Solution +To get started with deploying `Llama-2-13b chat` on [Amazon EKS](https://aws.amazon.com/eks/), we will cover the necessary prerequisites and guide you through the deployment process step by step. +This includes setting up the infrastructure, deploying the **Ray cluster**, and creating the [Gradio](https://www.gradio.app/) WebUI app. + +### Prerequisites +Before we begin, ensure you have all the prerequisites in place to make the deployment process smooth and hassle-free. +Ensure that you have installed the following tools on your machine. + +1. [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) +2. [kubectl](https://Kubernetes.io/docs/tasks/tools/) +3.
[terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) + +### Deploy + +Clone the repository + +```bash +git clone https://github.com/awslabs/data-on-eks.git +``` + +Navigate into one of the example directories and run the `install.sh` script + +**Important Note:** Ensure that you update the region in the `variables.tf` file before deploying the blueprint. +Additionally, confirm that your local region setting matches the specified region to prevent any discrepancies. +For example, set your `export AWS_DEFAULT_REGION=""` to the desired region: + +```bash +cd data-on-eks/ai-ml/trainium-inferentia/ && chmod +x install.sh +./install.sh +``` + +### Verify the resources + +Verify the Amazon EKS Cluster + +```bash +aws eks --region us-west-2 describe-cluster --name trainium-inferentia +``` + +```bash +# Creates k8s config file to authenticate with EKS +aws eks --region us-west-2 update-kubeconfig --name trainium-inferentia + +kubectl get nodes # Output shows the EKS Managed Node group nodes +``` + + + +## Deploying the Ray Cluster with Llama-2-Chat Model +Once the `Trainium on EKS` Cluster is deployed, you can proceed to use `kubectl` to deploy the `ray-service-Llama-2.yaml`. + +In this step, we will deploy the Ray Serve cluster, which comprises one `Head Pod` on `x86 CPU` instances and `Ray workers` on `Inf2.48xlarge` instances, both autoscaled by [Karpenter](https://karpenter.sh/). + +Let's take a closer look at the key files used in this deployment and understand their functionalities before proceeding: + +- **ray_serve_Llama-2.py:** +This script uses FastAPI, Ray Serve, and PyTorch-based Hugging Face Transformers to create an efficient API for text generation using the [NousResearch/Llama-2-13b-chat-hf](https://huggingface.co/NousResearch/Llama-2-13b-chat-hf) language model. +Alternatively, users have the flexibility to switch to the [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) model. The script establishes an endpoint that accepts input sentences and efficiently generates text outputs, benefiting from Neuron acceleration for enhanced performance. With its high configurability, users can fine-tune model parameters to suit a wide range of natural language processing applications, including chatbots and text generation tasks. A simplified sketch of this serving pattern is shown after this list. + +- **ray-service-Llama-2.yaml:** +This Ray Serve YAML file serves as a Kubernetes configuration for deploying the Ray Serve service, facilitating efficient text generation using the `Llama-2-13b-chat` model. +It defines a Kubernetes namespace named `Llama-2` to isolate resources. Within the configuration, the `RayService` specification, named `Llama-2-service`, is created and hosted within the `Llama-2` namespace. The `RayService` specification leverages the Python script `ray_serve_Llama-2.py` (copied into the Dockerfile located within the same folder) to create the Ray Serve service. +The Docker image used in this example is publicly available on Amazon Elastic Container Registry (ECR) for ease of deployment. +Users can also modify the Dockerfile to suit their specific requirements and push it to their own ECR repository, referencing it in the YAML file. 
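+To make the serving pattern concrete, here is a minimal, illustrative sketch of how a Ray Serve application for a Neuron-backed Llama-2 endpoint can be structured. This is not the blueprint's `ray_serve_Llama-2.py`; the class name, model path, resource request, and generation parameters below are assumptions for illustration only, and the script shipped in the repository remains the source of truth.
+
+```python
+# Illustrative sketch only: resource names and parameter values (tp_degree,
+# sequence_length, "neuron_cores") are assumptions, not the blueprint's values.
+from fastapi import FastAPI
+from ray import serve
+
+app = FastAPI()
+
+
+@serve.deployment(ray_actor_options={"resources": {"neuron_cores": 12}})
+@serve.ingress(app)
+class LlamaChat:
+    def __init__(self):
+        from transformers import AutoTokenizer
+        from transformers_neuronx.llama.model import LlamaForSampling
+
+        # Assumes the Llama-2-13b-chat weights were downloaded and split
+        # beforehand (e.g. with transformers_neuronx's save_pretrained_split).
+        model_path = "/model/Llama-2-13b-chat-hf-split"  # hypothetical path
+        self.tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-13b-chat-hf")
+        # tp_degree shards the model across NeuronCores; see the model
+        # specification table above for per-model sizing.
+        self.model = LlamaForSampling.from_pretrained(
+            model_path, batch_size=1, tp_degree=12, amp="f16"
+        )
+        self.model.to_neuron()  # compile the graph for Inferentia2
+
+    @app.get("/infer")
+    def infer(self, sentence: str) -> str:
+        import torch
+
+        input_ids = self.tokenizer.encode(sentence, return_tensors="pt")
+        with torch.inference_mode():
+            generated = self.model.sample(input_ids, sequence_length=2048)
+        return self.tokenizer.decode(generated[0], skip_special_tokens=True)
+
+
+# The RayService manifest references this bound application via its Serve
+# import path and schedules the workers on the Inf2 node group.
+entrypoint = LlamaChat.bind()
+```
+
+The sketch only shows the shape of the application code; the actual blueprint wires it into the `RayService` spec described above and packages it in the published container image.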
+ +### Deploy the Llama-2-Chat Model + +**Ensure the cluster is configured locally** +```bash +aws eks --region us-west-2 update-kubeconfig --name trainium-inferentia +``` + +**Deploy the RayServe Cluster** + +```bash +cd ai-ml/trainium-inferentia/examples/ray-serve/llama2-inf2 +kubectl apply -f ray-service-llama2.yaml +``` + +Verify the deployment by running the following commands: + +:::info + +The deployment process may take up to 10 minutes. The Head Pod is expected to be ready within 2 to 3 minutes, while the Ray Serve worker pod may take up to 10 minutes for image retrieval and model deployment from Hugging Face. + +::: + +```text +$ kubectl get all -n llama2 + +NAME READY STATUS RESTARTS AGE +pod/llama2-service-raycluster-smqrl-head-4wlbb 0/1 ContainerCreating 0 77s +pod/service-raycluster-smqrl-worker-inf2-worker-group-wjxqq 0/1 Init:0/1 0 77s + +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +service/llama2-service NodePort 172.20.246.48 <none> 8000:32138/TCP,52365:32653/TCP,8080:32604/TCP,6379:32739/TCP,8265:32288/TCP,10001:32419/TCP 78s + +$ kubectl get ingress -n llama2 + +NAME CLASS HOSTS ADDRESS PORTS AGE +llama2-ingress nginx * k8s-ingressn-ingressn-randomid-randomid.elb.us-west-2.amazonaws.com 80 2m4s + +``` + +Now you can access the Ray Dashboard from the Load Balancer URL below. + + http:///dashboard/#/serve + +If you don't have access to a public Load Balancer, you can use port-forwarding and browse the Ray Dashboard using localhost with the following command: + +```bash +kubectl port-forward svc/llama2-service 8265:8265 -n llama2 + +# Open the link in the browser +http://localhost:8265/ + +``` + +From this webpage, you will be able to monitor the progress of the model deployment, as shown in the image below: + +![Ray Dashboard](img/ray-dashboard.png) + +### To Test the Llama-2-Chat Model +Once the model deployment status is `running`, you can start using Llama-2-chat. + +You can use the following URL with a query appended at the end: + + http:///serve/infer?sentence=what is data parallelism and tensor parallelism and the differences + +You will see an output like this in your browser: + +![Chat Output](img/llama-2-chat-ouput.png) + +## Deploying the Gradio WebUI App +Discover how to create a user-friendly chat interface using [Gradio](https://www.gradio.app/) that integrates seamlessly with deployed models. + +Let's deploy the Gradio app locally on your machine to interact with the Llama-2-Chat model deployed using RayServe. + +:::info + +The Gradio app interacts with the locally exposed service created solely for this demonstration. Alternatively, you can deploy the Gradio app on EKS as a Pod with an Ingress and Load Balancer for wider accessibility. + +::: + +### Execute Port Forward to the llama2 Ray Service +First, execute a port forward to the Llama-2 Ray Service using kubectl: + +```bash +kubectl port-forward svc/llama2-service 8000:8000 -n llama2 +``` + +### Deploy Gradio WebUI Locally + +#### Create a Virtual Environment +Create a Python virtual environment on your machine for the Gradio application: + +```bash +cd ai-ml/trainium-inferentia/examples/gradio-ui +python3 -m venv .venv +source .venv/bin/activate +``` + +#### Install the Gradio ChatBot app +Install all the Gradio WebUI app dependencies with pip: + +```bash +pip install gradio requests +``` + +#### Invoke the WebUI +Run the Gradio WebUI using the following command: + +NOTE: `gradio-app.py` points to the port-forward URL, 
e.g., `service_name = "http://localhost:8000"` + +```bash +python gradio-app.py +``` + +You should see output similar to the following: + +```text +Using cache from ~/data-on-eks/ai-ml/trainium-inferentia/examples/gradio-ui/gradio_cached_examples/16' directory. If method or examples have changed since last caching, delete this folder to clear cache. + +Running on local URL: http://127.0.0.1:7860 + +To create a public link, set `share=True` in `launch()`. +``` + +#### Access the WebUI from Your Browser +Open your web browser and access the Gradio WebUI by navigating to the following URL: + +http://127.0.0.1:7860 + +You should now be able to interact with the Gradio application from your local machine. + +![Gradio Llama-2 AI Chat](img/gradio-llama-ai-chat.png) + +## Conclusion +You have now successfully deployed the **Llama-2-13b chat** model on EKS with Ray Serve and created a ChatGPT-style chat web UI using Gradio. +This opens up exciting possibilities for natural language processing and chatbot development. + +In summary, when it comes to deploying and scaling Llama-2, AWS Trn1/Inf2 instances offer a compelling advantage. +They provide the scalability, cost optimization, and performance boost needed to make running large language models efficient and accessible, all while overcoming the challenges associated with the scarcity of GPUs. +Whether you're building chatbots, natural language processing applications, or any other LLM-driven solution, Trn1/Inf2 instances empower you to harness the full potential of Llama-2 on the AWS cloud. + +## Cleanup +Finally, we'll provide instructions for cleaning up and deprovisioning the resources when they are no longer needed. + +**Step 1:** Cancel the execution of `python gradio-app.py` + +**Step 2:** Delete the Ray Cluster + +```bash +cd ai-ml/trainium-inferentia/examples/ray-serve/llama2-inf2 +kubectl delete -f ray-service-llama2.yaml +``` + +**Step 3:** Clean up the EKS Cluster +This script cleans up the environment using the `-target` option to ensure all the resources are deleted in the correct order. + +```bash +export AWS_DEFAULT_REGION="<DEPLOYED_EKS_CLUSTER_REGION>" +cd data-on-eks/ai-ml/trainium-inferentia/ && chmod +x cleanup.sh +./cleanup.sh +``` diff --git a/website/docs/gen-ai/inference/StableDiffusion.md b/website/docs/gen-ai/inference/StableDiffusion.md new file mode 100644 index 000000000..f26334b04 --- /dev/null +++ b/website/docs/gen-ai/inference/StableDiffusion.md @@ -0,0 +1,12 @@ +--- +title: Stable Diffusion on GPUs +sidebar_position: 2 +--- + +:::info + +COMING SOON + +Please note that this section is currently a work in progress and will serve as a comprehensive collection of resources for running data and ML workloads on EKS. 
+ +::: diff --git a/website/docs/gen-ai/inference/_category_.json b/website/docs/gen-ai/inference/_category_.json new file mode 100644 index 000000000..7c1602672 --- /dev/null +++ b/website/docs/gen-ai/inference/_category_.json @@ -0,0 +1,7 @@ +{ + "label": "Inference on EKS", + "position": 1, + "link": { + "type": "generated-index" + } +} diff --git a/website/docs/gen-ai/inference/img/gradio-llama-ai-chat.png b/website/docs/gen-ai/inference/img/gradio-llama-ai-chat.png new file mode 100644 index 000000000..4e667cf2a Binary files /dev/null and b/website/docs/gen-ai/inference/img/gradio-llama-ai-chat.png differ diff --git a/website/docs/gen-ai/inference/img/llama-2-chat-ouput.png b/website/docs/gen-ai/inference/img/llama-2-chat-ouput.png new file mode 100644 index 000000000..b98458b69 Binary files /dev/null and b/website/docs/gen-ai/inference/img/llama-2-chat-ouput.png differ diff --git a/website/docs/gen-ai/inference/img/llama2-inf2.png b/website/docs/gen-ai/inference/img/llama2-inf2.png new file mode 100644 index 000000000..673c3c2bf Binary files /dev/null and b/website/docs/gen-ai/inference/img/llama2-inf2.png differ diff --git a/website/docs/gen-ai/inference/img/ray-dashboard.png b/website/docs/gen-ai/inference/img/ray-dashboard.png new file mode 100644 index 000000000..b47bf7b1c Binary files /dev/null and b/website/docs/gen-ai/inference/img/ray-dashboard.png differ diff --git a/website/docs/gen-ai/training/BERT-Large.md b/website/docs/gen-ai/training/BERT-Large.md new file mode 100644 index 000000000..216118ab2 --- /dev/null +++ b/website/docs/gen-ai/training/BERT-Large.md @@ -0,0 +1,12 @@ +--- +title: BERT-Large on Trainium +sidebar_position: 1 +--- + +:::info + +COMING SOON + +Please note that this section is currently a work in progress and will serve as a comprehensive collection of resources for running data and ML workloads on EKS. + +::: diff --git a/website/docs/gen-ai/training/_category_.json b/website/docs/gen-ai/training/_category_.json new file mode 100644 index 000000000..38d3dbcb7 --- /dev/null +++ b/website/docs/gen-ai/training/_category_.json @@ -0,0 +1,7 @@ +{ + "label": "Training on EKS", + "position": 2, + "link": { + "type": "generated-index" + } +} diff --git a/website/docs/workshop/intro.md b/website/docs/workshop/intro.md deleted file mode 100644 index 4ada5c471..000000000 --- a/website/docs/workshop/intro.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -sidebar_position: 1 -sidebar_label: Introduction ---- - -# Introduction - -:::info - -COMING SOON - -Please note that this section is currently a work in progress and will serve as a comprehensive workshop for hands-on learning with data and ML workloads on EKS. -It will include step-by-step instructions, code samples, exercises, and other resources to help you gain practical experience and deepen your understanding of running data and ML workloads on EKS. 
- -::: diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index bec1cf8c6..c65b4d0d2 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -61,6 +61,12 @@ const config = { position: 'left', label: 'Introduction', }, + { + type: 'doc', + docId: 'gen-ai/index', + position: 'left', + label: 'Gen AI' + }, { type: 'doc', docId: 'blueprints/amazon-emr-on-eks/index', @@ -79,12 +85,6 @@ const config = { position: 'left', label: 'Benchmarks' }, - { - type: 'doc', - docId: 'workshop/intro', - position: 'left', - label: 'Workshop' - }, { type: 'doc', docId: 'resources/intro', diff --git a/website/package-lock.json b/website/package-lock.json index 44f408daa..d9f79df02 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -194,11 +194,12 @@ } }, "node_modules/@babel/code-frame": { - "version": "7.21.4", - "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.21.4.tgz", - "integrity": "sha512-LYvhNKfwWSPpocw8GI7gpK2nq3HSDuEPC/uSYaALSJu9xjsalaaYFOq0Pwt5KmVqwEbZlDu81aLXwBOmD/Fv9g==", + "version": "7.22.13", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.22.13.tgz", + "integrity": "sha512-XktuhWlJ5g+3TJXc5upd9Ks1HutSArik6jf2eAjYFyIOf4ej3RN+184cZbzDvbPnuTJIUhPKKJE3cIsYTiAT3w==", "dependencies": { - "@babel/highlight": "^7.18.6" + "@babel/highlight": "^7.22.13", + "chalk": "^2.4.2" }, "engines": { "node": ">=6.9.0" @@ -260,11 +261,11 @@ } }, "node_modules/@babel/generator": { - "version": "7.21.4", - "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.21.4.tgz", - "integrity": "sha512-NieM3pVIYW2SwGzKoqfPrQsf4xGs9M9AIG3ThppsSRmO+m7eQhmI6amajKMUeIO37wFfsvnvcxQFx6x6iqxDnA==", + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.23.0.tgz", + "integrity": "sha512-lN85QRR+5IbYrMWM6Y4pE/noaQtg4pNiqeNGX60eqOfo6gtEj6uw/JagelB8vVztSd7R6M5n1+PQkDbHbBRU4g==", "dependencies": { - "@babel/types": "^7.21.4", + "@babel/types": "^7.23.0", "@jridgewell/gen-mapping": "^0.3.2", "@jridgewell/trace-mapping": "^0.3.17", "jsesc": "^2.5.1" @@ -367,9 +368,9 @@ } }, "node_modules/@babel/helper-environment-visitor": { - "version": "7.18.9", - "resolved": "https://registry.npmjs.org/@babel/helper-environment-visitor/-/helper-environment-visitor-7.18.9.tgz", - "integrity": "sha512-3r/aACDJ3fhQ/EVgFy0hpj8oHyHpQc+LPtJoY9SzTThAsStm4Ptegq92vqKoE3vD706ZVFWITnMnxucw+S9Ipg==", + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/helper-environment-visitor/-/helper-environment-visitor-7.22.20.tgz", + "integrity": "sha512-zfedSIzFhat/gFhWfHtgWvlec0nqB9YEIVrpuwjruLlXfUSnA8cJB0miHKwqDnQ7d32aKo2xt88/xZptwxbfhA==", "engines": { "node": ">=6.9.0" } @@ -386,23 +387,23 @@ } }, "node_modules/@babel/helper-function-name": { - "version": "7.21.0", - "resolved": "https://registry.npmjs.org/@babel/helper-function-name/-/helper-function-name-7.21.0.tgz", - "integrity": "sha512-HfK1aMRanKHpxemaY2gqBmL04iAPOPRj7DxtNbiDOrJK+gdwkiNRVpCpUJYbUT+aZyemKN8brqTOxzCaG6ExRg==", + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/helper-function-name/-/helper-function-name-7.23.0.tgz", + "integrity": "sha512-OErEqsrxjZTJciZ4Oo+eoZqeW9UIiOcuYKRJA4ZAgV9myA+pOXhhmpfNCKjEH/auVfEYVFJ6y1Tc4r0eIApqiw==", "dependencies": { - "@babel/template": "^7.20.7", - "@babel/types": "^7.21.0" + "@babel/template": "^7.22.15", + "@babel/types": "^7.23.0" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-hoist-variables": { - "version": "7.18.6", - "resolved": 
"https://registry.npmjs.org/@babel/helper-hoist-variables/-/helper-hoist-variables-7.18.6.tgz", - "integrity": "sha512-UlJQPkFqFULIcyW5sbzgbkxn2FKRgwWiRexcuaR8RNJRy8+LLveqPjwZV/bwrLZCN0eUHD/x8D0heK1ozuoo6Q==", + "version": "7.22.5", + "resolved": "https://registry.npmjs.org/@babel/helper-hoist-variables/-/helper-hoist-variables-7.22.5.tgz", + "integrity": "sha512-wGjk9QZVzvknA6yKIUURb8zY3grXCcOZt+/7Wcy8O2uctxhplmUPkOdlgoNhmdVee2c92JXbf1xpMtVNbfoxRw==", "dependencies": { - "@babel/types": "^7.18.6" + "@babel/types": "^7.22.5" }, "engines": { "node": ">=6.9.0" @@ -523,28 +524,28 @@ } }, "node_modules/@babel/helper-split-export-declaration": { - "version": "7.18.6", - "resolved": "https://registry.npmjs.org/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.18.6.tgz", - "integrity": "sha512-bde1etTx6ZyTmobl9LLMMQsaizFVZrquTEHOqKeQESMKo4PlObf+8+JA25ZsIpZhT/WEd39+vOdLXAFG/nELpA==", + "version": "7.22.6", + "resolved": "https://registry.npmjs.org/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.22.6.tgz", + "integrity": "sha512-AsUnxuLhRYsisFiaJwvp1QF+I3KjD5FOxut14q/GzovUe6orHLesW2C7d754kRm53h5gqrz6sFl6sxc4BVtE/g==", "dependencies": { - "@babel/types": "^7.18.6" + "@babel/types": "^7.22.5" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-string-parser": { - "version": "7.19.4", - "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.19.4.tgz", - "integrity": "sha512-nHtDoQcuqFmwYNYPz3Rah5ph2p8PFeFCsZk9A/48dPc/rGocJ5J3hAAZ7pb76VWX3fZKu+uEr/FhH5jLx7umrw==", + "version": "7.22.5", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.22.5.tgz", + "integrity": "sha512-mM4COjgZox8U+JcXQwPijIZLElkgEpO5rsERVDJTc2qfCDfERyob6k5WegS14SX18IIjv+XD+GrqNumY5JRCDw==", "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-validator-identifier": { - "version": "7.19.1", - "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.19.1.tgz", - "integrity": "sha512-awrNfaMtnHUr653GgGEs++LlAvW6w+DcPrOliSMXWCKo597CwL5Acf/wWdNkf/tfEQE3mjkeD1YOVZOUV/od1w==", + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.20.tgz", + "integrity": "sha512-Y4OZ+ytlatR8AI+8KZfKuL5urKp7qey08ha31L8b3BwewJAoJamTzyvxPR/5D+KkdJCGPq/+8TukHBlY10FX9A==", "engines": { "node": ">=6.9.0" } @@ -585,12 +586,12 @@ } }, "node_modules/@babel/highlight": { - "version": "7.18.6", - "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.18.6.tgz", - "integrity": "sha512-u7stbOuYjaPezCuLj29hNW1v64M2Md2qupEKP1fHc7WdOA3DgLh37suiSrZYY7haUB7iBeQZ9P1uiRF359do3g==", + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.22.20.tgz", + "integrity": "sha512-dkdMCN3py0+ksCgYmGG8jKeGA/8Tk+gJwSYYlFGxG5lmhfKNoAy004YpLxpS1W2J8m/EK2Ew+yOs9pVRwO89mg==", "dependencies": { - "@babel/helper-validator-identifier": "^7.18.6", - "chalk": "^2.0.0", + "@babel/helper-validator-identifier": "^7.22.20", + "chalk": "^2.4.2", "js-tokens": "^4.0.0" }, "engines": { @@ -598,9 +599,9 @@ } }, "node_modules/@babel/parser": { - "version": "7.21.4", - "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.21.4.tgz", - "integrity": "sha512-alVJj7k7zIxqBZ7BTRhz0IqJFxW1VJbm6N8JbcYhQ186df9ZBPbZBmWSqAMXwHGsCJdYks7z/voa3ibiS5bCIw==", + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.23.0.tgz", + 
"integrity": "sha512-vvPKKdMemU85V9WE/l5wZEmImpCtLqbnTvqDS2U1fJ96KrxoW7KrXhNsNCblQlg8Ck4b85yxdTyelsMUgFUXiw==", "bin": { "parser": "bin/babel-parser.js" }, @@ -1832,31 +1833,31 @@ } }, "node_modules/@babel/template": { - "version": "7.20.7", - "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.20.7.tgz", - "integrity": "sha512-8SegXApWe6VoNw0r9JHpSteLKTpTiLZ4rMlGIm9JQ18KiCtyQiAMEazujAHrUS5flrcqYZa75ukev3P6QmUwUw==", + "version": "7.22.15", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.22.15.tgz", + "integrity": "sha512-QPErUVm4uyJa60rkI73qneDacvdvzxshT3kksGqlGWYdOTIUOwJ7RDUL8sGqslY1uXWSL6xMFKEXDS3ox2uF0w==", "dependencies": { - "@babel/code-frame": "^7.18.6", - "@babel/parser": "^7.20.7", - "@babel/types": "^7.20.7" + "@babel/code-frame": "^7.22.13", + "@babel/parser": "^7.22.15", + "@babel/types": "^7.22.15" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/traverse": { - "version": "7.21.4", - "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.21.4.tgz", - "integrity": "sha512-eyKrRHKdyZxqDm+fV1iqL9UAHMoIg0nDaGqfIOd8rKH17m5snv7Gn4qgjBoFfLz9APvjFU/ICT00NVCv1Epp8Q==", - "dependencies": { - "@babel/code-frame": "^7.21.4", - "@babel/generator": "^7.21.4", - "@babel/helper-environment-visitor": "^7.18.9", - "@babel/helper-function-name": "^7.21.0", - "@babel/helper-hoist-variables": "^7.18.6", - "@babel/helper-split-export-declaration": "^7.18.6", - "@babel/parser": "^7.21.4", - "@babel/types": "^7.21.4", + "version": "7.23.2", + "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.23.2.tgz", + "integrity": "sha512-azpe59SQ48qG6nu2CzcMLbxUudtN+dOM9kDbUqGq3HXUJRlo7i8fvPoxQUzYgLZ4cMVmuZgm8vvBpNeRhd6XSw==", + "dependencies": { + "@babel/code-frame": "^7.22.13", + "@babel/generator": "^7.23.0", + "@babel/helper-environment-visitor": "^7.22.20", + "@babel/helper-function-name": "^7.23.0", + "@babel/helper-hoist-variables": "^7.22.5", + "@babel/helper-split-export-declaration": "^7.22.6", + "@babel/parser": "^7.23.0", + "@babel/types": "^7.23.0", "debug": "^4.1.0", "globals": "^11.1.0" }, @@ -1865,12 +1866,12 @@ } }, "node_modules/@babel/types": { - "version": "7.21.4", - "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.21.4.tgz", - "integrity": "sha512-rU2oY501qDxE8Pyo7i/Orqma4ziCOrby0/9mvbDUGEfvZjb279Nk9k19e2fiCxHbRRpY2ZyrgW1eq22mvmOIzA==", + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.23.0.tgz", + "integrity": "sha512-0oIyUfKoI3mSqMvsxBdclDwxXKXAUA8v/apZbc+iSyARYou1o8ZGDxbUYyLFoW2arqS2jDGqJuZvv1d/io1axg==", "dependencies": { - "@babel/helper-string-parser": "^7.19.4", - "@babel/helper-validator-identifier": "^7.19.1", + "@babel/helper-string-parser": "^7.22.5", + "@babel/helper-validator-identifier": "^7.22.20", "to-fast-properties": "^2.0.0" }, "engines": { @@ -11217,9 +11218,9 @@ } }, "node_modules/postcss": { - "version": "8.4.25", - "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.25.tgz", - "integrity": "sha512-7taJ/8t2av0Z+sQEvNzCkpDynl0tX3uJMCODi6nT3PfASC7dYCWV9aQ+uiCf+KBD4SEFcu+GvJdGdwzQ6OSjCw==", + "version": "8.4.31", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.31.tgz", + "integrity": "sha512-PS08Iboia9mts/2ygV3eLpY5ghnUcfLV/EXTOW1E2qYxJKGGBUtNjN76FYHnMs36RmARn41bC0AZmn+rR0OVpQ==", "funding": [ { "type": "opencollective", diff --git a/website/sidebars.js b/website/sidebars.js index 7dc8c0bd4..e69016bab 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -15,12 +15,11 @@ const sidebars = { // By default, 
Docusaurus generates a sidebar from the docs folder structure // docs: [{type: 'autogenerated', dirName: '.'}], - // But you can create a sidebar manually + genai: [{type: 'autogenerated', dirName: 'gen-ai'}], blueprints: [{type: 'autogenerated', dirName: 'blueprints'}], bestpractices: [{type: 'autogenerated', dirName: 'bestpractices'}], benchmarks: [{type: 'autogenerated', dirName: 'benchmarks'}], - workshop: [{type: 'autogenerated', dirName: 'workshop'}], resources: [{type: 'autogenerated', dirName: 'resources'}], }; diff --git a/workshop/emr-eks/modules/vpc/main.tf b/workshop/emr-eks/modules/vpc/main.tf index 5940b3c6f..8a96a22ce 100644 --- a/workshop/emr-eks/modules/vpc/main.tf +++ b/workshop/emr-eks/modules/vpc/main.tf @@ -11,6 +11,9 @@ locals { data "aws_availability_zones" "available" {} +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. +# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" diff --git a/workshop/examples/emr-eks/fsx-for-lustre/fsx-dynamic-pvc-shuffle-storage/driver-pod-template.yaml b/workshop/examples/emr-eks/fsx-for-lustre/fsx-dynamic-pvc-shuffle-storage/driver-pod-template.yaml index 1d4a43e47..76a5b9825 100644 --- a/workshop/examples/emr-eks/fsx-for-lustre/fsx-dynamic-pvc-shuffle-storage/driver-pod-template.yaml +++ b/workshop/examples/emr-eks/fsx-for-lustre/fsx-dynamic-pvc-shuffle-storage/driver-pod-template.yaml @@ -27,7 +27,7 @@ spec: volumeMounts: - name: spark-local-dir-1 mountPath: /dynamic # FSx Scratch 1 filesystem for executors scratch space - command: ["sh", "-c", "chmod 777 /dynamic", "chown -hR +999:+1000 /dynamic"] + command: ["sh", "-c", "chmod 744 /dynamic", "chown -hR +999:+1000 /dynamic"] tolerations: - key: "spark-compute-optimized" diff --git a/workshop/examples/emr-eks/fsx-for-lustre/fsx-static-pvc-shuffle-storage/driver-pod-template.yaml b/workshop/examples/emr-eks/fsx-for-lustre/fsx-static-pvc-shuffle-storage/driver-pod-template.yaml index c4b357911..7de1a22af 100644 --- a/workshop/examples/emr-eks/fsx-for-lustre/fsx-static-pvc-shuffle-storage/driver-pod-template.yaml +++ b/workshop/examples/emr-eks/fsx-for-lustre/fsx-static-pvc-shuffle-storage/driver-pod-template.yaml @@ -25,7 +25,7 @@ spec: volumeMounts: - name: spark-local-dir-1 mountPath: /static - command: ["sh", "-c", "chmod -R 777 /static", "chown -hR +999:+1000 /static/data"] + command: ["sh", "-c", "chmod -R 744 /static", "chown -hR +999:+1000 /static/data"] tolerations: - key: "spark-compute-optimized"