From b25881ef2ff289aa17b20c6c3c4aa536e6710464 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Sun, 30 May 2021 20:26:45 +0200 Subject: [PATCH 01/43] deployer: accept provider=none for already setup k8s cluster access --- deployer/cluster.py | 2 ++ deployer/cluster.schema.yaml | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/deployer/cluster.py b/deployer/cluster.py index 29d6366b64..91de692529 100644 --- a/deployer/cluster.py +++ b/deployer/cluster.py @@ -31,6 +31,8 @@ def auth(self): yield from self.auth_azure() elif self.spec["provider"] == "kubeconfig": yield from self.auth_kubeconfig() + elif self.spec['provider'] == 'none': + yield else: raise ValueError(f'Provider {self.spec["provider"]} not supported') diff --git a/deployer/cluster.schema.yaml b/deployer/cluster.schema.yaml index e574dea7e6..19b2f58de2 100644 --- a/deployer/cluster.schema.yaml +++ b/deployer/cluster.schema.yaml @@ -27,7 +27,8 @@ properties: Cloud provider this cluster is running on. Used to perform authentication against the cluster. Currently supports gcp, aws, azure, and raw kubeconfig files. - enum: + enum: + - none - gcp - kubeconfig - aws From fbc0ddcf3e0de4ee7378c1517b281f2a2ffa768f Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 31 May 2021 03:14:35 +0200 Subject: [PATCH 02/43] jmte: add 2i2c config for jmte deployment --- shared/deployer/jmte.cluster.yaml | 210 ++++++++++++++++++++++++++++++ 1 file changed, 210 insertions(+) create mode 100644 shared/deployer/jmte.cluster.yaml diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml new file mode 100644 index 0000000000..36efe2bc81 --- /dev/null +++ b/shared/deployer/jmte.cluster.yaml @@ -0,0 +1,210 @@ +name: jmte +provider: none +# kubeconfig: +# file: secrets/jmte.yaml +hubs: + - name: prod + domain: hub.jupytearth.org + template: daskhub + auth0: + connection: github + config: &config + + basehub: + # Cloudformation: The EFS filesystem was created by cloudformation. + # + nfsPVC: + enabled: true + nfs: + # mountOptions from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html + mountOptions: + - rsize=1048576 + - wsize=1048576 + - timeo=600 + - soft # We pick soft over hard, so NFS lockups don't lead to hung processes + - retrans=2 + - noresvport + serverIP: fs-01707b06.efs.us-west-2.amazonaws.com + # baseShareName is required to be just "/" so that we can create + # various sub folders in the filesystem that our PV to access the + # NFS server can reference successfully as it isn't supported to + # access a not yet existing folder. This creation is automated by + # the nfs-share-creator resource part of the basehub Helm chart. + baseShareName: / + + + + jupyterhub: + custom: + homepage: + templateVars: + org: + name: Jupyter meets the Earth + logo_url: https://pangeo-data.github.io/jupyter-earth/_static/jupyter-earth.png + url: https://jupytearth.org + designed_by: + name: 2i2c + url: https://2i2c.org + operated_by: + name: 2i2c + url: https://2i2c.org + funded_by: + name: Jupyter meets the Earth + url: https://jupytearth.org + + singleuser: + # Eksctl: The service account was created by eksctl. + # + serviceAccountName: &user-sa s3-full-access + + # cmd: I've experimented with these settings to get a JupyterLab RTC + # setup functioning. It currently is, but is this what makes + # sense to get it to function? 
+ # + # ref: https://github.com/jupyterlab-contrib/jupyterlab-link-share/issues/10#issuecomment-851899758 + # ref: https://github.com/jupyterlab/jupyterlab/blob/1c8ff104a99e294265e6cf476dcb46279b0c3593/binder/jupyter_notebook_config.py#L39 + # + # Note the default in z2jh is jupyterhub-singleuser. + cmd: + - jupyterhub-singleuser + - --LabApp.collaborative=True + - --ServerApp.allow_remote_access=True + + extraEnv: + # SCRATCH_BUCKET / PANGEO_SCRATCH are environment variables that + # help users write notebooks and such referencing this environment + # variable in a way that will work between users. + # + # $(ENV_VAR) will by evaluated by k8s automatically + # + # Cloudformation: The s3 bucket was created by cloudformation. + # + SCRATCH_BUCKET: s3://jmte-scratch/$(JUPYTERHUB_USER) + PANGEO_SCRATCH: s3://jmte-scratch/$(JUPYTERHUB_USER) + + initContainers: + # Need to explicitly fix ownership here, since EFS doesn't do anonuid + - name: volume-mount-ownership-fix + image: busybox + command: ["sh", "-c", "id && chown 1000:1000 /home/jovyan && ls -lhd /home/jovyan"] + securityContext: + runAsUser: 0 + volumeMounts: + - name: home + mountPath: /home/jovyan + subPath: "{username}" + + image: + name: pangeo/pangeo-notebook + tag: "2021.05.15" # https://hub.docker.com/r/pangeo/pangeo-notebook/tags + + profileList: + - display_name: "16th of Medium: 0.25-4 CPU, 1-16 GB" + kubespawner_override: + cpu_guarantee: 0.225 + mem_guarantee: 0.875G + mem_limit: null + node_selector: { 2i2c.org/node-cpu: "4" } + - display_name: "4th of Medium: 1-4 CPU, 4-16 GB" + kubespawner_override: + cpu_guarantee: 0.875 + mem_guarantee: 3.5G + mem_limit: null + node_selector: { 2i2c.org/node-cpu: "4" } + - display_name: "Medium: 4 CPU, 16 GB" + kubespawner_override: + cpu_guarantee: 3.5 + mem_guarantee: 14G + mem_limit: null + node_selector: { 2i2c.org/node-cpu: "4" } + - display_name: "Large: 16 CPU, 64 GB" + kubespawner_override: + mem_guarantee: 56G + mem_limit: null + node_selector: { 2i2c.org/node-cpu: "16" } + - display_name: "Massive: 64 CPU, 256 GB" + kubespawner_override: + mem_guarantee: 224G + mem_limit: null + node_selector: { 2i2c.org/node-cpu: "64" } + + proxy: + # proxy notes: + # + # - Revert basehubs overrides as we don't install ingress-nginx and + # cert-manager yet, and therefore should use + # service.type=LoadBalancer instead of service.type=ClusterIP. + # Along with this, we also make use of the autohttps system that + # requires us to configure an letsencrypt email. + # + service: + type: LoadBalancer + https: + enabled: true + type: letsencrypt + letsencrypt: + contactEmail: erik@sundellopensource.se + + hub: + config: + Authenticator: + allowed_users: &users + - abbyazari # Abby Azari + - andersy005 # Anderson Banihirwe + - consideratio # Erik Sundell + - elliesch # Ellie Abrahams + - EMscience # Edom Moges + - espg # Shane Grigsby + - facusapienza21 # Facundo Sapienza + - fperez # Fernando Pérez + - kmpaul # Kevin Paul + - lrennels # Lisa Rennels + - mrsiegfried # Matthew Siegfried + - tsnow03 # Tasha Snow + - whyjz # Whyjay Zheng + - yuvipanda # Yuvi Panda + admin_users: *users + allowNamedServers: true + networkPolicy: + # FIXME: Required for dask gateway 0.9.0. It is fixed but a Helm + # chart of newer version is not yet released. + enabled: false + + + + dask-gateway: + # dask-gateway notes: + # + # - Explicitly unset daskhub's nodeSelectors for all pods except the + # worker pods. 
The tolerations applied in the basehub config to all + # non-worker pods in dask-gateway will provide a preferred affinity + # towards suitable nodes without needing to have a label on them. Then + # we use the node label "k8s.dask.org/node-purpose: worker" + # specifically for enforce workers to schedule on such nodes. + # + traefik: + nodeSelector: null + controller: + nodeSelector: null + gateway: + nodeSelector: null + backend: + scheduler: + extraPodConfig: + nodeSelector: + hub.jupyter.org/node-purpose: user + serviceAccountName: *user-sa + worker: + extraPodConfig: + nodeSelector: + k8s.dask.org/node-purpose: worker + serviceAccountName: *user-sa + + extraConfig: + idle: | + # timeout after 30 minutes of inactivity + c.KubeClusterConfig.idle_timeout = 1800 + limits: | + # per Dask cluster limits. + c.ClusterConfig.cluster_max_cores = 256 + c.ClusterConfig.cluster_max_memory = "1028G" From 4e5051e2ca92f806eaecd4113e740b8dc55efb81 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 31 May 2021 03:17:06 +0200 Subject: [PATCH 03/43] jmte: add aws infra (eksctl/cloudformation) --- eksctl/README.md | 74 ++++++++ eksctl/cloudformation-extras.yaml | 293 ++++++++++++++++++++++++++++++ eksctl/eksctl-cluster-config.yaml | 274 ++++++++++++++++++++++++++++ 3 files changed, 641 insertions(+) create mode 100644 eksctl/README.md create mode 100644 eksctl/cloudformation-extras.yaml create mode 100644 eksctl/eksctl-cluster-config.yaml diff --git a/eksctl/README.md b/eksctl/README.md new file mode 100644 index 0000000000..7a08315ee3 --- /dev/null +++ b/eksctl/README.md @@ -0,0 +1,74 @@ +### Setup of k8s cluster via eksctl + +TODO describe... + +### Setup of extras via cloudformation + +TODO describe... + +### Setup of cluster-autoscaler in the k8s cluster + +`eksctl` doesn't automatically install a cluster-autoscaler and it is not part +of a EKS based k8s cluster by itself, so it needs to be manually installed. The +cluster-autoscaler will need permissions to do its job though, and for that we +use some flags in our eksctl config file and then we install it with a Helm +chart. + +#### eksctl configuration for cluster-autoscaler + +We need our eksctl-cluster-config.yaml to: + +1. Declare `nodeGroups.*.iam.withAddonPolicies.autoScaler=true`. + + I believe doing so is what makes the following tags automatically be applied + on node groups, which is required by the cluster-autoscaler to detect them. + + ``` + k8s.io/cluster-autoscaler/ + k8s.io/cluster-autoscaler/enabled + ``` + +2. Declare additional tags for labels/taints. + + ```yaml + nodeGroups: + - name: worker-xlarge + labels: + k8s.dask.org/node-purpose: worker + taints: + k8s.dask.org_dedicated: worker:NoSchedule + + # IMPORTANT: we also provide these tags alongside the labels/taints + # to help the cluster-autoscaler do its job. + # + tags: + k8s.io/cluster-autoscaler/node-template/label/k8s.dask.org/node-purpose: worker + k8s.io/cluster-autoscaler/node-template/taint/k8s.dask.org_dedicated: worker:NoSchedule + ``` + + +#### Installation of cluster-autoscaler + +We rely on the [cluster-autoscaler Helm chart](https://github.com/kubernetes/autoscaler/tree/master/charts/cluster-autoscaler) to manage the k8s resources for the cluster-autoscaler we need to manually complement the k8s cluster with. 
+ +``` +helm upgrade cluster-autocaler cluster-autoscaler \ + --install \ + --repo https://kubernetes.github.io/autoscaler \ + --version 9.9.2 \ + --namespace kube-system \ + --set autoDiscovery.clusterName=jmte \ + --set awsRegion=us-west-2 +``` + +### Misc + +- Create a auth0 application for github +- Update dns record ([jupytearth.org is managed on GCP by Erik](https://console.cloud.google.com/net-services/dns/zones/jupytearth-org/details?folder=&organizationId=&project=domains-sos)) + +### FIXME: Open questions + +- How is cluster-autoscaler acquiring the permissions it needs? Is it by being + located on the node where we have + `nodeGroups.*.iam.withAddonPolicies.autoScaler=true`? Then we have ended up + granting permission to all pods on all nodes that are too high. diff --git a/eksctl/cloudformation-extras.yaml b/eksctl/cloudformation-extras.yaml new file mode 100644 index 0000000000..c80f541be8 --- /dev/null +++ b/eksctl/cloudformation-extras.yaml @@ -0,0 +1,293 @@ +# Cloudformation is like Terraform but specific to AWS, in other words, it +# allows you to declare some cloud infrastructure in configuration files that +# you can then request be setup on AWS by a CLI (aws cloudformation deploy). A +# quick intro is available here: https://www.youtube.com/watch?v=Omppm_YUG2g +# +# This cloudformation configuration contain what we need to complement the +# eksctl created k8s cluster for the deployer script to run in our CI system. + +# Goals: +# +# 1. For us maintainers to be able to encrypt/decrypt secret content with +# mozilla/sops directly, but also let hubploy use mozilla/sops to decrypt +# them using a AWS service account. This will require AWS KMS to be setup. +# 2. To enable hubploy to build and push docker images to our default AWS +# container registry (.dkr.ecr..amazonaws.com). +# +# Required AWS infrastructure to create: +# +# 1. A dedicated service account (AWS::IAM::User), with an associated +# AccessKey (AWS::IAM::AccessKey). +# 2. A KMS service (AWS::KMS::Key), and permissions to use it to the dedicated +# service account. +# 3. Permissions for the dedicated service account to push to the default +# container registry. +# 4. Permissions for the dedicated service account to work against the k8s +# cluster created by eksctl, which use cloudformation under the hood. +# +# 5. FUTURE: s3 stuff? +# 6. FUTURE: EFS stuff? 
+# + +# Operations: +# +# Create/Update: +# aws cloudformation deploy --stack-name=jmte-extras --template-file=./cloudformation-extras.yaml --capabilities=CAPABILITY_NAMED_IAM +# +# Inspect: +# aws cloudformation describe-stacks --stack-name=jmte-extras +# +# Delete: +# aws cloudformation delete-stack --stack-name=jmte-extras +# + +# References: +# +# AWS Cloudformation console: +# https://console.aws.amazon.com/cloudformation/home +# +# AWS Cloudformation intro: +# https://www.youtube.com/watch?v=Omppm_YUG2g +# +# AWS IAM intro: +# https://www.youtube.com/watch?v=3A5hRIT8zdo +# +# The starting point for me: +# https://medium.com/mercos-engineering/secrets-as-a-code-with-mozilla-sops-and-aws-kms-d069c45ae1b9 +# +# Reference on !Join: +# https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/intrinsic-function-reference-join.html +# + +# The parameters we need to provide to create this cloudformation stack +Parameters: + # ref: https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/parameters-section-structure.html + EksClusterName: + Type: String + Default: jmte + EcrRepositoryName: + Type: String + Default: jmte/user-env + IamUserName: + Type: String + Default: ci + IamRoleNameEcr: + Type: String + Default: ci-ecr + IamRoleNameEks: + Type: String + Default: ci-eks + + +# The resources we want to be created as part of this cloudformation stack +Resources: + # ref: https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-iam-user.html + IamUser: + Type: AWS::IAM::User + Properties: + UserName: !Ref IamUserName + + # ref: https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-iam-accesskey.html + IamAccessKey: + Type: AWS::IAM::AccessKey + Properties: + UserName: !Ref IamUser + + # ref: https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-iam-role.html + IamRoleEcr: + Type: AWS::IAM::Role + Properties: + RoleName: !Ref IamRoleNameEcr + Policies: + - PolicyName: EcrAccess + PolicyDocument: + Version: 2012-10-17 + Statement: + # I have failed restricting this further... 
+ - Effect: Allow + Action: + - ecr:* + Resource: "*" + AssumeRolePolicyDocument: + Version: 2012-10-17 + Statement: + - Sid: AllowRoleToBeAssumedByOurUser + Effect: Allow + Principal: + AWS: !Join + - '' + - - 'arn:aws:iam::' + - !Ref AWS::AccountId + - :user/ + - !Ref IamUser + Action: + - sts:AssumeRole + IamRoleEks: + Type: AWS::IAM::Role + Properties: + RoleName: !Ref IamRoleNameEks + Policies: + - PolicyName: EksAccess + PolicyDocument: + Version: 2012-10-17 + Statement: + - Effect: Allow + Action: + - eks:DescribeCluster + Resource: !Join + - '' + - - 'arn:aws:eks:' + - !Ref AWS::Region + - ':' + - !Ref AWS::AccountId + - ':cluster/' + - !Ref EksClusterName + AssumeRolePolicyDocument: + Version: 2012-10-17 + Statement: + - Sid: AllowRoleToBeAssumedByOurUser + Effect: Allow + Principal: + AWS: !Join + - '' + - - 'arn:aws:iam::' + - !Ref AWS::AccountId + - :user/ + - !Ref IamUser + Action: + - sts:AssumeRole + + # ref: https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-efs-filesystem.html + # + EfsFileSystem: + Type: AWS::EFS::FileSystem + Properties: + BackupPolicy: + Status: ENABLED + Encrypted: true + EfsMountTarget0: + Type: AWS::EFS::MountTarget + Properties: + FileSystemId: !GetAtt EfsFileSystem.FileSystemId + SecurityGroups: + - {"Fn::ImportValue": {"Fn::Sub": "eksctl-${EksClusterName}-cluster::SharedNodeSecurityGroup"}} + SubnetId: { "Fn::Select": [0, { "Fn::Split": [",", {"Fn::ImportValue": {"Fn::Sub": "eksctl-${EksClusterName}-cluster::SubnetsPublic"}}]}] } + EfsMountTarget1: + Type: AWS::EFS::MountTarget + Properties: + FileSystemId: !GetAtt EfsFileSystem.FileSystemId + SecurityGroups: + - {"Fn::ImportValue": {"Fn::Sub" : "eksctl-${EksClusterName}-cluster::SharedNodeSecurityGroup"}} + SubnetId: { "Fn::Select": [1, { "Fn::Split": [",", {"Fn::ImportValue": {"Fn::Sub": "eksctl-${EksClusterName}-cluster::SubnetsPublic"}}]}] } + EfsMountTarget2: + Type: AWS::EFS::MountTarget + Properties: + FileSystemId: !GetAtt EfsFileSystem.FileSystemId + SecurityGroups: + - {"Fn::ImportValue": {"Fn::Sub" : "eksctl-${EksClusterName}-cluster::SharedNodeSecurityGroup"}} + SubnetId: { "Fn::Select": [2, { "Fn::Split": [",", {"Fn::ImportValue": {"Fn::Sub": "eksctl-${EksClusterName}-cluster::SubnetsPublic"}}]}] } + + # ref: https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-s3-bucket.html + # + S3Bucket: + Type: AWS::S3::Bucket + Properties: + AccessControl: Private + BucketName: !Sub ${EksClusterName}-scratch + + # ref: https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-ecr-repository.html + # + EcrRepository: + Type: AWS::ECR::Repository + Properties: + RepositoryName: !Ref EcrRepositoryName + RepositoryPolicyText: + Version: 2008-10-17 + Statement: + - Sid: Allow pull for who are authenticated with our account + Effect: Allow + Principal: + AWS: !Ref AWS::AccountId + Action: + - ecr:GetDownloadUrlForLayer + - ecr:BatchGetImage + + # ref: https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-kms-key.html + # + KmsKey: + Type: AWS::KMS::Key + Properties: + Description: Enables mozilla/sops to encrypt/decrypt secrets just in time. 
+ KeyPolicy: + Version: 2012-10-17 + Statement: + - Sid: Enable Root IAM User Permissions + Effect: Allow + Principal: + AWS: !Join + - '' + - - 'arn:aws:iam::' + - !Ref AWS::AccountId + - :root + Action: 'kms:*' + Resource: '*' + - Sid: Enable User Permissions + Effect: Allow + Principal: + AWS: !Join + - '' + - - 'arn:aws:iam::' + - !Ref AWS::AccountId + - :user/ + - !Ref IamUser + Action: + - "kms:DescribeKey" + - "kms:Encrypt" + - "kms:Decrypt" + - "kms:ReEncrypt*" + - "kms:GenerateDataKey" + - "kms:GenerateDataKeyWithoutPlaintext" + Resource: '*' + + +# The relevant information from the created resources. +Outputs: + # A Role to control the k8s cluster + IamRoleEksArn: + Value: !GetAtt IamRoleEks.Arn + Description: The role with permission to work against k8s. + + # A role to control the docker registry + IamRoleEcrArn: + Value: !GetAtt IamRoleEcr.Arn + Description: | + The Role with permission to push to our image registry. + EcrRepository: + Value: !Join + - '' + - - !Ref AWS::AccountId + - .dkr.ecr. + - !Ref AWS::Region + - .amazonaws.com/ + - !Ref EcrRepositoryName + Description: The image repository for the user environment image. + + S3Bucket: + Value: !Ref S3Bucket + Description: An S3 private scratch bucket that all users share read/write permission to. + + EfsFileSystemId: + Value: !GetAtt EfsFileSystem.FileSystemId + + # The KMS system is not in use currently! Instead we use the 2i2c centralized + # Google KMS keychain instead to have one less account to manage. + KmsKeyArn: + Value: !GetAtt KmsKey.Arn + Description: Use this to set creation_rules[0].kms in .sops.yaml + + AwsAccessKeyId: + Value: !Ref IamAccessKey + Description: Use this to set AWS_ACCESS_KEY_ID as a GitHub project secret + AwsSecretAccessKey: + Value: !GetAtt IamAccessKey.SecretAccessKey + Description: Use this to set AWS_SECRET_ACCESS_KEY as a GitHub project secret diff --git a/eksctl/eksctl-cluster-config.yaml b/eksctl/eksctl-cluster-config.yaml new file mode 100644 index 0000000000..787c9e9fd7 --- /dev/null +++ b/eksctl/eksctl-cluster-config.yaml @@ -0,0 +1,274 @@ +# This eksctl configuration file represents the cluster and node groups for use +# by the cluster. +# ref: https://eksctl.io/usage/schema/ +# +# Cluster operations: +# ref: https://eksctl.io/usage/cluster-upgrade/ +# +# create: eksctl create cluster --config-file=eksctl-cluster-config.yaml --set-kubeconfig-context +# upgrade: eksctl upgrade cluster --config-file=eksctl-cluster-config.yaml +# delete: eksctl delete cluster --config-file=eksctl-cluster-config.yaml +# +# Node group operations: +# ref: https://eksctl.io/usage/managing-nodegroups/ +# +# eksctl get nodegroups --cluster jmte +# +# eksctl delete nodegroup --config-file=eksctl-cluster-config.yaml --include "user-a-*,worker-a-*" --approve +# eksctl create nodegroup --config-file=eksctl-cluster-config.yaml --include "user-a-*,worker-a-*" +# eksctl delete nodegroup --cluster jmte --name core-a +# eksctl create nodegroup --cluster jmte --name core-a +# +# eksctl delete nodegroup --config-file=eksctl-cluster-config.yaml --include "user-a-*,worker-a-*" --approve && eksctl create nodegroup --config-file=eksctl-cluster-config.yaml --include "user-a-*,worker-a-*" +# +# Attribution: this was based on @yuvipanda's work in 2i2c! 
<3 +# ref: https://github.com/2i2c-org/pangeo-hubs/blob/8e552bc198d8339efe8c003cb847849255e8f8ed/aws/eksctl-config.yaml +# + + + +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +metadata: + name: jmte + # region: + # The region was chosen to to us-west-2 (Oregon) to be close to a CMIP-6 + # dataset. + # + region: us-west-2 + version: "1.19" + tags: + 2i2c.org/project: jmte + +# availabilityZones: +# For the EKS control plane, arbitrary chosen but made explicit to ensure we +# can locate the node pool on an AZ where the EKS control plane exist as +# required. +# +availabilityZones: [us-west-2d, us-west-2b, us-west-2a] + + + +# This section will create additional k8s ServiceAccount's that are coupled with +# AWS Role's. By declaring pods to use them, you can grant these pods the +# associated permissions. For this deployment, we create a k8s ServiceAccount +# with Full S3 credentials which we then also declare user pods and dask worker +# pods will make use of. +# +iam: + withOIDC: true # https://eksctl.io/usage/security/#withoidc + # serviceAccounts like nodeGroups etc can be managed directly with eksctl, for + # more information, see: https://eksctl.io/usage/iamserviceaccounts/ + # + # eksctl create iamserviceaccount --config-file=eksctl-cluster-config.yaml + # + serviceAccounts: + - metadata: + name: s3-full-access + namespace: prod + labels: + aws-usage: application + attachPolicyARNs: + - arn:aws:iam::aws:policy/AmazonS3FullAccess + - metadata: + name: s3-full-access + namespace: staging + labels: + aws-usage: application + attachPolicyARNs: + - arn:aws:iam::aws:policy/AmazonS3FullAccess + + + +# Choose the type of node group? +# - nodeGroups cannot be updated but must be recreated on changes: +# https://eksctl.io/usage/managing-nodegroups/#nodegroup-immutability +# - managedNodeGroups cannot scale to zero: +# https://github.com/aws/containers-roadmap/issues/724 +# +# Choosing instance type? +# - Maximum pods: https://github.com/awslabs/amazon-eks-ami/blob/master/files/eni-max-pods.txt +# - Node specs: https://aws.amazon.com/ec2/instance-types/ +# - Cost: https://ec2pricing.net/ +# +# Management advice: +# - Always use a suffix for node group names that you can replace with something +# to create a new node group and delete the old. You will run into issues if +# you name it "core" and "core-a" instead of "core-a" and "core-b", such as +# when deleting "core" you end up draining both node groups. 
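#
# As a concrete example of that advice (a sketch reusing the commands listed
# at the top of this file): to replace "core-a" you would first rename it to
# "core-b" in this file, create the new node group, and only delete the old
# one once the new one is up, e.g.:
#
#   eksctl create nodegroup --config-file=eksctl-cluster-config.yaml --include "core-b"
#   eksctl delete nodegroup --cluster jmte --name core-a --approve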
+# +nodeGroups: + - name: core-a + availabilityZones: [us-west-2d] # aws ec2 describe-availability-zones --region + instanceType: m5.large # 28 pods, 2 cpu, 8 GB + minSize: 0 + maxSize: 2 + desiredCapacity: 1 + volumeSize: 80 + labels: + hub.jupyter.org/node-purpose: core + tags: + k8s.io/cluster-autoscaler/node-template/label/hub.jupyter.org/node-purpose: core + iam: + withAddonPolicies: + autoScaler: true + efs: true + + # 57 pods, 4 cpu, 16 GB (Intel, 10 GBits network) + - name: user-a-4 + availabilityZones: &user-availabilityZones [us-west-2d] + instanceType: &user-instanceType m5.xlarge + minSize: &user-minSize 0 + maxSize: &user-maxSize 4 + desiredCapacity: &user-desiredCapacity 0 + volumeSize: &user-volumeSize 80 + labels: + hub.jupyter.org/node-purpose: user + 2i2c.org/node-cpu: "4" + taints: + hub.jupyter.org_dedicated: user:NoSchedule + tags: + k8s.io/cluster-autoscaler/node-template/label/hub.jupyter.org/node-purpose: user + k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-cpu: "4" + k8s.io/cluster-autoscaler/node-template/taint/hub.jupyter.org_dedicated: user:NoSchedule + iam: &user-iam + withAddonPolicies: + autoScaler: true + efs: true + + # 233 pods, 16 cpu, 64 GB (Intel, 10 GBits network) + - name: user-a-16 + availabilityZones: *user-availabilityZones + instanceType: m5.4xlarge + minSize: *user-minSize + maxSize: *user-maxSize + desiredCapacity: *user-desiredCapacity + volumeSize: *user-volumeSize + labels: + hub.jupyter.org/node-purpose: user + 2i2c.org/node-cpu: "16" + taints: + hub.jupyter.org_dedicated: user:NoSchedule + tags: + k8s.io/cluster-autoscaler/node-template/label/hub.jupyter.org/node-purpose: user + k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-cpu: "16" + k8s.io/cluster-autoscaler/node-template/taint/hub.jupyter.org_dedicated: user:NoSchedule + iam: *user-iam + + # 736 pods, 64 cpu, 256 GB (Intel, 20 GBits network) + - name: user-a-64 + availabilityZones: *user-availabilityZones + instanceType: m5.16xlarge + minSize: *user-minSize + maxSize: *user-maxSize + desiredCapacity: *user-desiredCapacity + volumeSize: *user-volumeSize + labels: + hub.jupyter.org/node-purpose: user + 2i2c.org/node-cpu: "64" + taints: + hub.jupyter.org_dedicated: user:NoSchedule + tags: + k8s.io/cluster-autoscaler/node-template/label/hub.jupyter.org/node-purpose: user + k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-cpu: "64" + k8s.io/cluster-autoscaler/node-template/taint/hub.jupyter.org_dedicated: user:NoSchedule + iam: *user-iam + + + + # Worker node pools using cheaper spot instances that are temporary. + # + # References: + # - About spotAllocationStrategy: https://aws.amazon.com/blogs/compute/introducing-the-capacity-optimized-allocation-strategy-for-amazon-ec2-spot-instances/ + # - About instancesDistribution: https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-autoscaling-autoscalinggroup-instancesdistribution.html + # + # Note: instance types with different capacity (CPU/Memory) must have + # different node pools for the cluster autoscaler to work properly. + # + # "Due to the Cluster Autoscaler’s limitations (more on that in the next + # section) on which Instance type to expand, it’s important to choose + # instances of the same size (vCPU and memory) for each InstanceGroup." 
+ # + # ref: https://medium.com/riskified-technology/run-kubernetes-on-aws-ec2-spot-instances-with-zero-downtime-f7327a95dea + # + # Note: use of YAML merge below (<<) would be great, but it is not supported + # and was just part of YAML 1.1 but not 1.0 or 1.2. + # + - name: worker-a-4 + availabilityZones: &worker-availabilityZones [us-west-2d, us-west-2b, us-west-2a] + minSize: &worker-minSize 0 + maxSize: &worker-maxSize 8 + desiredCapacity: &worker-desiredCapacity 0 + volumeSize: &worker-volumeSize 80 + labels: + k8s.dask.org/node-purpose: worker + 2i2c.org/node-cpu: "4" + taints: + k8s.dask.org_dedicated: worker:NoSchedule + tags: + k8s.io/cluster-autoscaler/node-template/label/k8s.dask.org/node-purpose: worker + k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-cpu: "4" + k8s.io/cluster-autoscaler/node-template/taint/k8s.dask.org_dedicated: worker:NoSchedule + iam: &worker-iam + withAddonPolicies: + autoScaler: true + efs: true + # Spot instance specific configuration + instancesDistribution: + instanceTypes: + - m5a.xlarge # 57 pods, 4 cpu, 16 GB (AMD, 10 GBits network, 100% cost) + - m5.xlarge # 57 pods, 4 cpu, 16 GB (Intel, 10 GBits network, ~112% cost) + # - m5n.xlarge # 57 pods, 4 cpu, 16 GB (Intel, 25 GBits network, ~139% cost) + onDemandBaseCapacity: &worker-onDemandBaseCapacity 0 + onDemandPercentageAboveBaseCapacity: &worker-onDemandPercentageAboveBaseCapacity 0 + spotAllocationStrategy: &worker-spotAllocationStrategy capacity-optimized + + - name: worker-a-16 + availabilityZones: *worker-availabilityZones + minSize: *worker-minSize + maxSize: *worker-maxSize + desiredCapacity: *worker-desiredCapacity + volumeSize: *worker-volumeSize + labels: + k8s.dask.org/node-purpose: worker + 2i2c.org/node-cpu: "16" + taints: + k8s.dask.org_dedicated: worker:NoSchedule + tags: + k8s.io/cluster-autoscaler/node-template/label/k8s.dask.org/node-purpose: worker + k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-cpu: "16" + k8s.io/cluster-autoscaler/node-template/taint/k8s.dask.org_dedicated: worker:NoSchedule + iam: *worker-iam + instancesDistribution: + instanceTypes: + - m5a.4xlarge # 233 pods, 16 cpu, 64 GB (AMD, 10 GBits network, 100% cost) + - m5.4xlarge # 233 pods, 16 cpu, 64 GB (Intel, 10 GBits network, ~112% cost) + # - m5n.4xlarge # 233 pods, 16 cpu, 64 GB (Intel, 25 GBits network, ~139% cost) + onDemandBaseCapacity: *worker-onDemandBaseCapacity + onDemandPercentageAboveBaseCapacity: *worker-onDemandPercentageAboveBaseCapacity + spotAllocationStrategy: *worker-spotAllocationStrategy + + - name: worker-a-64 + availabilityZones: *worker-availabilityZones + minSize: *worker-minSize + maxSize: *worker-maxSize + desiredCapacity: *worker-desiredCapacity + volumeSize: *worker-volumeSize + labels: + k8s.dask.org/node-purpose: worker + 2i2c.org/node-cpu: "64" + taints: + k8s.dask.org_dedicated: worker:NoSchedule + tags: + k8s.io/cluster-autoscaler/node-template/label/k8s.dask.org/node-purpose: worker + k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-cpu: "64" + k8s.io/cluster-autoscaler/node-template/taint/k8s.dask.org_dedicated: worker:NoSchedule + iam: *worker-iam + instancesDistribution: + instanceTypes: + - m5a.16xlarge # 736 pods, 64 cpu, 256 GB (AMD, 12 GBits network, 100% cost) + - m5.16xlarge # 736 pods, 64 cpu, 256 GB (Intel, 20 GBits network, ~112% cost) + # - m5n.16xlarge # 736 pods, 64 cpu, 256 GB (Intel, 75 GBits network, ~139% cost) + onDemandBaseCapacity: *worker-onDemandBaseCapacity + onDemandPercentageAboveBaseCapacity: 
*worker-onDemandPercentageAboveBaseCapacity + spotAllocationStrategy: *worker-spotAllocationStrategy From 5dca036e289c6c45fb96bf6d9798a3d48da20037 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Wed, 2 Jun 2021 01:15:52 +0200 Subject: [PATCH 04/43] jmte: add volume hack to chown /shared folder --- shared/deployer/jmte.cluster.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml index 36efe2bc81..cf06a31ecd 100644 --- a/shared/deployer/jmte.cluster.yaml +++ b/shared/deployer/jmte.cluster.yaml @@ -86,13 +86,16 @@ hubs: # Need to explicitly fix ownership here, since EFS doesn't do anonuid - name: volume-mount-ownership-fix image: busybox - command: ["sh", "-c", "id && chown 1000:1000 /home/jovyan && ls -lhd /home/jovyan"] + command: ["sh", "-c", "id && chown 1000:1000 /home/jovyan /home/jovyan/shared && ls -lhd /home/jovyan"] securityContext: runAsUser: 0 volumeMounts: - name: home mountPath: /home/jovyan subPath: "{username}" + - name: home + mountPath: /home/jovyan/shared + subPath: _shared image: name: pangeo/pangeo-notebook From 9f912fba49a0f4f49ee672048b6631c5f79990b6 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Wed, 2 Jun 2021 02:10:14 +0200 Subject: [PATCH 05/43] jmte: add choldgraf --- shared/deployer/jmte.cluster.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml index cf06a31ecd..01524e5d1d 100644 --- a/shared/deployer/jmte.cluster.yaml +++ b/shared/deployer/jmte.cluster.yaml @@ -155,6 +155,7 @@ hubs: - abbyazari # Abby Azari - andersy005 # Anderson Banihirwe - consideratio # Erik Sundell + - choldgraf # Chris Holdgraf - elliesch # Ellie Abrahams - EMscience # Edom Moges - espg # Shane Grigsby From f68460a988c1a318d335937ac833d2d567fbb50f Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Wed, 2 Jun 2021 02:10:32 +0200 Subject: [PATCH 06/43] jmte: opt out of default tolerations --- shared/deployer/jmte.cluster.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml index 01524e5d1d..c6d7863e90 100644 --- a/shared/deployer/jmte.cluster.yaml +++ b/shared/deployer/jmte.cluster.yaml @@ -197,6 +197,7 @@ hubs: extraPodConfig: nodeSelector: hub.jupyter.org/node-purpose: user + k8s.dask.org/node-purpose: null serviceAccountName: *user-sa worker: extraPodConfig: From 10fee3873c170e6221a00635dbca8c77b2d46e7f Mon Sep 17 00:00:00 2001 From: Fernando Perez Date: Wed, 2 Jun 2021 13:36:19 -0700 Subject: [PATCH 07/43] jmte: add @jonathan-taylor as allowed user --- shared/deployer/jmte.cluster.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml index c6d7863e90..1b5d49736c 100644 --- a/shared/deployer/jmte.cluster.yaml +++ b/shared/deployer/jmte.cluster.yaml @@ -167,6 +167,7 @@ hubs: - tsnow03 # Tasha Snow - whyjz # Whyjay Zheng - yuvipanda # Yuvi Panda + - jonathan-taylor # Jonathan Taylor admin_users: *users allowNamedServers: true networkPolicy: From 51e3815e1286285cf62ffd0f6311ed1012cd0893 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Sat, 10 Jul 2021 23:01:03 +0200 Subject: [PATCH 08/43] jmte: set default profile list option and add descriptions --- shared/deployer/jmte.cluster.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml index 1b5d49736c..cdb8310ac8 100644 --- a/shared/deployer/jmte.cluster.yaml 
+++ b/shared/deployer/jmte.cluster.yaml @@ -103,29 +103,35 @@ hubs: profileList: - display_name: "16th of Medium: 0.25-4 CPU, 1-16 GB" + default: True + description: "A shared machine, the recommended option until you experience a limitation." kubespawner_override: cpu_guarantee: 0.225 mem_guarantee: 0.875G mem_limit: null node_selector: { 2i2c.org/node-cpu: "4" } - display_name: "4th of Medium: 1-4 CPU, 4-16 GB" + description: "A shared machine." kubespawner_override: cpu_guarantee: 0.875 mem_guarantee: 3.5G mem_limit: null node_selector: { 2i2c.org/node-cpu: "4" } - display_name: "Medium: 4 CPU, 16 GB" + description: "A dedicated machine for you." kubespawner_override: cpu_guarantee: 3.5 mem_guarantee: 14G mem_limit: null node_selector: { 2i2c.org/node-cpu: "4" } - display_name: "Large: 16 CPU, 64 GB" + description: "A dedicated machine for you." kubespawner_override: mem_guarantee: 56G mem_limit: null node_selector: { 2i2c.org/node-cpu: "16" } - display_name: "Massive: 64 CPU, 256 GB" + description: "A dedicated machine for you." kubespawner_override: mem_guarantee: 224G mem_limit: null From ea69e42b900070dbfd2473e44f8356a4e63b707d Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Sun, 18 Jul 2021 22:03:56 +0200 Subject: [PATCH 09/43] jmte: add jupyterhub-ssh --- helm-charts/daskhub/Chart.yaml | 3 ++ helm-charts/daskhub/values.schema.yaml | 6 +++ shared/deployer/jmte.cluster.yaml | 75 +++++++++++++++++++++++++- 3 files changed, 82 insertions(+), 2 deletions(-) diff --git a/helm-charts/daskhub/Chart.yaml b/helm-charts/daskhub/Chart.yaml index fd104fbdbc..e3ab544756 100644 --- a/helm-charts/daskhub/Chart.yaml +++ b/helm-charts/daskhub/Chart.yaml @@ -13,3 +13,6 @@ dependencies: - name: dask-gateway version: "2022.10.0" repository: "https://helm.dask.org/" + - name: jupyterhub-ssh + version: 0.0.1-n114.h3c48a9f + repository: https://yuvipanda.github.io/jupyterhub-ssh/ diff --git a/helm-charts/daskhub/values.schema.yaml b/helm-charts/daskhub/values.schema.yaml index ccf3cd201d..228289adc1 100644 --- a/helm-charts/daskhub/values.schema.yaml +++ b/helm-charts/daskhub/values.schema.yaml @@ -27,6 +27,12 @@ properties: dask-gateway: type: object additionalProperties: true + # jupyterhub-ssh is a dependent helm chart, we rely on its schema validation + # for values passed to it and are not imposing restrictions on them in this + # helm chart. + jupyterhub-ssh: + type: object + additionalProperties: true global: type: object additionalProperties: true diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml index cdb8310ac8..8fb271e32e 100644 --- a/shared/deployer/jmte.cluster.yaml +++ b/shared/deployer/jmte.cluster.yaml @@ -146,14 +146,75 @@ hubs: # Along with this, we also make use of the autohttps system that # requires us to configure an letsencrypt email. # - service: - type: LoadBalancer https: enabled: true type: letsencrypt letsencrypt: contactEmail: erik@sundellopensource.se + service: + # jupyterhub-ssh/sftp integration part 1/3: + # + # We must accept traffic to the k8s Service (proxy-public) receiving traffic + # from the internet. Port 22 is typically used for both SSH and SFTP, but we + # can't use the same port for both so we use 2222 for SFTP in this example. + # + extraPorts: + - name: ssh + port: 22 + targetPort: ssh + - name: sftp + port: 2222 + targetPort: sftp + traefik: + # jupyterhub-ssh/sftp integration part 2/3: + # + # We must accept traffic arriving to the autohttps pod (traefik) from the + # proxy-public service. 
Expose a port and update the NetworkPolicy + # to tolerate incoming (ingress) traffic on the exposed port. + # + extraPorts: + - name: ssh + containerPort: 8022 + - name: sftp + containerPort: 2222 + networkPolicy: + allowedIngressPorts: [http, https, ssh, sftp] + # jupyterhub-ssh/sftp integration part 3/3: + # + # We must let traefik know it should listen for traffic (traefik entrypoint) + # and route it (traefik router) onwards to the jupyterhub-ssh k8s Service + # (traefik service). + # + extraStaticConfig: + entryPoints: + ssh-entrypoint: + address: :8022 + sftp-entrypoint: + address: :2222 + extraDynamicConfig: + tcp: + services: + ssh-service: + loadBalancer: + servers: + - address: jupyterhub-ssh:22 + sftp-service: + loadBalancer: + servers: + - address: jupyterhub-sftp:22 + routers: + ssh-router: + entrypoints: [ssh-entrypoint] + rule: HostSNI(`*`) + service: ssh-service + sftp-router: + entrypoints: [sftp-entrypoint] + rule: HostSNI(`*`) + service: sftp-service + + + hub: config: Authenticator: @@ -220,3 +281,13 @@ hubs: # per Dask cluster limits. c.ClusterConfig.cluster_max_cores = 256 c.ClusterConfig.cluster_max_memory = "1028G" + + + jupyterhub-ssh: + hubUrl: http://proxy-http:8000 + + ssh: + enabled: true + + sftp: + enabled: false From 291d12d4aa2d7ba93612558778c804cfe7149956 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Wed, 11 Aug 2021 23:58:38 +0200 Subject: [PATCH 10/43] jmte: enable jupyterhub-sftp --- helm-charts/daskhub/Chart.yaml | 2 +- shared/deployer/jmte.cluster.yaml | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/helm-charts/daskhub/Chart.yaml b/helm-charts/daskhub/Chart.yaml index e3ab544756..df30291825 100644 --- a/helm-charts/daskhub/Chart.yaml +++ b/helm-charts/daskhub/Chart.yaml @@ -14,5 +14,5 @@ dependencies: version: "2022.10.0" repository: "https://helm.dask.org/" - name: jupyterhub-ssh - version: 0.0.1-n114.h3c48a9f + version: 0.0.1-n142.h402a3d6 repository: https://yuvipanda.github.io/jupyterhub-ssh/ diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml index 8fb271e32e..8e52330f6c 100644 --- a/shared/deployer/jmte.cluster.yaml +++ b/shared/deployer/jmte.cluster.yaml @@ -153,6 +153,9 @@ hubs: contactEmail: erik@sundellopensource.se service: + # Revert an unwanted basehub default + type: LoadBalancer + # jupyterhub-ssh/sftp integration part 1/3: # # We must accept traffic to the k8s Service (proxy-public) receiving traffic @@ -282,7 +285,9 @@ hubs: c.ClusterConfig.cluster_max_cores = 256 c.ClusterConfig.cluster_max_memory = "1028G" - + # jupyterhub-ssh values.yaml reference: + # https://github.com/yuvipanda/jupyterhub-ssh/blob/main/helm-chart/jupyterhub-ssh/values.yaml + # jupyterhub-ssh: hubUrl: http://proxy-http:8000 @@ -290,4 +295,7 @@ hubs: enabled: true sftp: - enabled: false + enabled: true + pvc: + enabled: true + name: home-nfs From b19b6ac91c6ffd6a747462fe3d8dbe82b523a569 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 3 Sep 2021 04:06:15 +0200 Subject: [PATCH 11/43] jmte: add GPUs --- eksctl/eksctl-cluster-config.yaml | 81 +++++++++++++++++++++++++++++++ shared/deployer/jmte.cluster.yaml | 24 +++++++++ 2 files changed, 105 insertions(+) diff --git a/eksctl/eksctl-cluster-config.yaml b/eksctl/eksctl-cluster-config.yaml index 787c9e9fd7..04a4ca00a8 100644 --- a/eksctl/eksctl-cluster-config.yaml +++ b/eksctl/eksctl-cluster-config.yaml @@ -90,6 +90,9 @@ iam: # - Maximum pods: https://github.com/awslabs/amazon-eks-ami/blob/master/files/eni-max-pods.txt # - Node specs: 
https://aws.amazon.com/ec2/instance-types/ # - Cost: https://ec2pricing.net/ +# - Instance availability in zone: +# - https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-discovery.html +# - aws ec2 describe-instance-type-offerings --location-type "availability-zone" --filters Name=location,Values=us-west-2d --region us-west-2 | grep g4dn # # Management advice: # - Always use a suffix for node group names that you can replace with something @@ -97,6 +100,14 @@ iam: # you name it "core" and "core-a" instead of "core-a" and "core-b", such as # when deleting "core" you end up draining both node groups. # +# Common gotcha: +# - AWS quotas may stop you from scaling up. The symptoms for this will be that +# you observe that a scale up request has been made by the cluster-autoscaler +# but no new node ever comes online. If that happens, you should visit +# https://.console.aws.amazon.com/ec2autoscaling/home, click +# on the auto scaling group (ASG), then go to the activity tab and verify that +# you have run into a quota issue. Following that, you make a request to AWS using provided link: https://aws.amazon.com/contact-us/ec2-request +# nodeGroups: - name: core-a availabilityZones: [us-west-2d] # aws ec2 describe-availability-zones --region @@ -174,6 +185,76 @@ nodeGroups: k8s.io/cluster-autoscaler/node-template/taint/hub.jupyter.org_dedicated: user:NoSchedule iam: *user-iam + # GPU Nodes. + # + # g4dn was chosen based on input from Shane in this comment + # https://github.com/pangeo-data/jupyter-earth/issues/77#issuecomment-910864707. + # + # For reference of the available choices, see + # https://aws.amazon.com/ec2/instance-types/#Accelerated_Computing. + # + # For reference on the GPU device plugin that needs to be installed, but is + # installed automatically by eksctl, see: + # https://eksctl.io/usage/gpu-support/#gpu-support + # + # The machine nodes AMI (what is installed when it starts) for GPU nodes may + # require you to subscribe to the AMI and accept some license. For more info, + # see: + # https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/deep-learning-containers-eks-setup.html#deep-learning-containers-eks-setup-licensing + # + # Note that we opted for us-west-2b here because g4dn machines were not + # available in us-west-2d. 
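#
# Availability per zone can be checked with the same command referenced
# further up in this file, just pointed at another zone, for example:
#
#   aws ec2 describe-instance-type-offerings \
#     --region us-west-2 \
#     --location-type "availability-zone" \
#     --filters Name=location,Values=us-west-2b | grep g4dn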
+ # + # 57 pods, 4 cpu, 16 GB (Intel, 25 GBits network), 1 T4 Tensor Core GPU + - name: user-gpu-a-4 + availabilityZones: &user-gpu-availabilityZones [us-west-2b] + instanceType: g4dn.xlarge + minSize: *user-minSize + maxSize: *user-maxSize + desiredCapacity: *user-desiredCapacity + volumeSize: *user-volumeSize + labels: + hub.jupyter.org/node-purpose: user + 2i2c.org/node-cpu: "4" + 2i2c.org/node-gpu: "1" + taints: + hub.jupyter.org_dedicated: user:NoSchedule + tags: + k8s.io/cluster-autoscaler/node-template/label/hub.jupyter.org/node-purpose: user + k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-cpu: "4" + k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-gpu: "1" + k8s.io/cluster-autoscaler/node-template/taint/hub.jupyter.org_dedicated: user:NoSchedule + k8s.io/cluster-autoscaler/node-template/taint/nvidia.com/gpu: NoSchedule + iam: &user-iam + withAddonPolicies: + autoScaler: true + efs: true + + # 233 pods, 16 cpu, 64 GB (Intel, 25 GBits network), 1 T4 Tensor Core GPU + - name: user-gpu-a-16 + availabilityZones: *user-gpu-availabilityZones + instanceType: g4dn.4xlarge + minSize: *user-minSize + maxSize: *user-maxSize + desiredCapacity: *user-desiredCapacity + volumeSize: *user-volumeSize + labels: + hub.jupyter.org/node-purpose: user + 2i2c.org/node-cpu: "16" + 2i2c.org/node-gpu: "1" + taints: + hub.jupyter.org_dedicated: user:NoSchedule + tags: + k8s.io/cluster-autoscaler/node-template/label/hub.jupyter.org/node-purpose: user + k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-cpu: "16" + k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-gpu: "1" + k8s.io/cluster-autoscaler/node-template/taint/hub.jupyter.org_dedicated: user:NoSchedule + k8s.io/cluster-autoscaler/node-template/taint/nvidia.com/gpu: NoSchedule + iam: &user-iam + withAddonPolicies: + autoScaler: true + efs: true + # Worker node pools using cheaper spot instances that are temporary. diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml index 8e52330f6c..69ecd09591 100644 --- a/shared/deployer/jmte.cluster.yaml +++ b/shared/deployer/jmte.cluster.yaml @@ -110,6 +110,7 @@ hubs: mem_guarantee: 0.875G mem_limit: null node_selector: { 2i2c.org/node-cpu: "4" } + extra_resource_limits: {} - display_name: "4th of Medium: 1-4 CPU, 4-16 GB" description: "A shared machine." kubespawner_override: @@ -117,6 +118,7 @@ hubs: mem_guarantee: 3.5G mem_limit: null node_selector: { 2i2c.org/node-cpu: "4" } + extra_resource_limits: {} - display_name: "Medium: 4 CPU, 16 GB" description: "A dedicated machine for you." kubespawner_override: @@ -124,18 +126,38 @@ hubs: mem_guarantee: 14G mem_limit: null node_selector: { 2i2c.org/node-cpu: "4" } + extra_resource_limits: {} - display_name: "Large: 16 CPU, 64 GB" description: "A dedicated machine for you." kubespawner_override: mem_guarantee: 56G mem_limit: null node_selector: { 2i2c.org/node-cpu: "16" } + extra_resource_limits: {} - display_name: "Massive: 64 CPU, 256 GB" description: "A dedicated machine for you." kubespawner_override: mem_guarantee: 224G mem_limit: null node_selector: { 2i2c.org/node-cpu: "64" } + extra_resource_limits: {} + - display_name: "Medium GPU: 4 CPU, 16 GB, 1 T4 Tensor Core GPU" + description: "A dedicated machine for you with one GPU attached." 
+ kubespawner_override: + cpu_guarantee: 3.5 + mem_guarantee: 14G + mem_limit: null + node_selector: { 2i2c.org/node-cpu: "4", 2i2c.org/node-gpu: "1" } + extra_resource_limits: + nvidia.com/gpu: "1" + - display_name: "Large GPU: 16 CPU, 64 GB, 1 T4 Tensor Core GPU" + description: "A dedicated machine for you with one GPU attached." + kubespawner_override: + mem_guarantee: 56G + mem_limit: null + node_selector: { 2i2c.org/node-cpu: "16", 2i2c.org/node-gpu: "1" } + extra_resource_limits: + nvidia.com/gpu: "1" proxy: # proxy notes: @@ -155,6 +177,8 @@ hubs: service: # Revert an unwanted basehub default type: LoadBalancer + annotations: + service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600" # jupyterhub-ssh/sftp integration part 1/3: # From 28f30068e8eb55374225a934fe1f288baf4e4077 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 1 Oct 2021 16:36:11 +0200 Subject: [PATCH 12/43] jmte: increase start timeout to handle edge cases --- shared/deployer/jmte.cluster.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml index 69ecd09591..df34a6e4be 100644 --- a/shared/deployer/jmte.cluster.yaml +++ b/shared/deployer/jmte.cluster.yaml @@ -70,6 +70,11 @@ hubs: - --LabApp.collaborative=True - --ServerApp.allow_remote_access=True + # Increased as we have experienced a too slow image pull at least + # once. Our pods can take ~6-7 minutes to start on a new node it + # seems, so this gives us some margin. + startTimeout: 900 + extraEnv: # SCRATCH_BUCKET / PANGEO_SCRATCH are environment variables that # help users write notebooks and such referencing this environment From b388c0489c25a4d176c0975589500d2137762e07 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 1 Oct 2021 16:36:34 +0200 Subject: [PATCH 13/43] jmte: increase user and worker node's disk volumes --- eksctl/eksctl-cluster-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/eksctl/eksctl-cluster-config.yaml b/eksctl/eksctl-cluster-config.yaml index 04a4ca00a8..f2b111c09f 100644 --- a/eksctl/eksctl-cluster-config.yaml +++ b/eksctl/eksctl-cluster-config.yaml @@ -115,7 +115,7 @@ nodeGroups: minSize: 0 maxSize: 2 desiredCapacity: 1 - volumeSize: 80 + volumeSize: 250 labels: hub.jupyter.org/node-purpose: core tags: @@ -132,7 +132,7 @@ nodeGroups: minSize: &user-minSize 0 maxSize: &user-maxSize 4 desiredCapacity: &user-desiredCapacity 0 - volumeSize: &user-volumeSize 80 + volumeSize: &user-volumeSize 500 labels: hub.jupyter.org/node-purpose: user 2i2c.org/node-cpu: "4" @@ -280,7 +280,7 @@ nodeGroups: minSize: &worker-minSize 0 maxSize: &worker-maxSize 8 desiredCapacity: &worker-desiredCapacity 0 - volumeSize: &worker-volumeSize 80 + volumeSize: &worker-volumeSize 500 labels: k8s.dask.org/node-purpose: worker 2i2c.org/node-cpu: "4" From cd3e206e6d3765fdf0340629846b71482dfe8e9e Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Thu, 14 Oct 2021 04:30:22 +0200 Subject: [PATCH 14/43] jmte: configure x1.16xlarge nodes --- eksctl/eksctl-cluster-config.yaml | 43 +++++++++++++++++++++++++++++++ shared/deployer/jmte.cluster.yaml | 7 +++++ 2 files changed, 50 insertions(+) diff --git a/eksctl/eksctl-cluster-config.yaml b/eksctl/eksctl-cluster-config.yaml index f2b111c09f..9b74aec009 100644 --- a/eksctl/eksctl-cluster-config.yaml +++ b/eksctl/eksctl-cluster-config.yaml @@ -185,6 +185,49 @@ nodeGroups: k8s.io/cluster-autoscaler/node-template/taint/hub.jupyter.org_dedicated: user:NoSchedule iam: *user-iam + # High memory nodes. 
+ # + # The local SSD storage available on these high memory nodes is not exposed by + # default in some easy way but is rather quite tricky to make use of in k8s. + # To make that happen, one needs to have a daemonset installed to prepare the + # nodes that has local storage to make it exposed. + # + # A discussion on how this is done is made in + # https://github.com/pangeo-data/jupyter-earth/issues/88. + # + # To figure out what availability zones we could use, I used the command below + # and took the union of that output with the zones of the EKS control plane + # configured in the root level of this config. I'm not sure if I could use + # nodes in other availability zones. + # + # aws ec2 describe-instance-type-offerings \ + # --region us-west-2 \ + # --filter Name=instance-type,Values=x1.16xlarge \ + # --location-type=availability-zone + # + # 233 pods, 64 cpu, 976 GB, 1,920 GB local SSD storage, (Intel, 10 GBits + # network) + - name: user-highmem-a-64 + availabilityZones: &user-highmem-availabilityZones [us-west-2b, us-west-2a] + instanceType: x1.16xlarge + minSize: *user-minSize + maxSize: *user-maxSize + desiredCapacity: *user-desiredCapacity + volumeSize: *user-volumeSize + labels: + hub.jupyter.org/node-purpose: user + 2i2c.org/node-highmem-cpu: "64" + taints: + hub.jupyter.org_dedicated: user:NoSchedule + tags: + k8s.io/cluster-autoscaler/node-template/label/hub.jupyter.org/node-purpose: user + k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-highmem-cpu: "64" + k8s.io/cluster-autoscaler/node-template/taint/hub.jupyter.org_dedicated: user:NoSchedule + iam: &user-iam + withAddonPolicies: + autoScaler: true + efs: true + # GPU Nodes. # # g4dn was chosen based on input from Shane in this comment diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml index df34a6e4be..313da55c7a 100644 --- a/shared/deployer/jmte.cluster.yaml +++ b/shared/deployer/jmte.cluster.yaml @@ -146,6 +146,13 @@ hubs: mem_limit: null node_selector: { 2i2c.org/node-cpu: "64" } extra_resource_limits: {} + - display_name: "Massive high-memory: 64 CPU, 976 GB" + description: "A dedicated machine for you." + kubespawner_override: + mem_guarantee: 900G + mem_limit: null + node_selector: { 2i2c.org/node-highmem-cpu: "64" } + extra_resource_limits: {} - display_name: "Medium GPU: 4 CPU, 16 GB, 1 T4 Tensor Core GPU" description: "A dedicated machine for you with one GPU attached." 
kubespawner_override: From 8d7ac1f13a6525b7e89dced7697db2cf76afcafd Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Thu, 28 Oct 2021 19:51:36 +0200 Subject: [PATCH 15/43] jmte: tweak configuration related to GPU --- debug-pod.yaml | 93 +++++++++++++++++++++++++++++++ eksctl/eksctl-cluster-config.yaml | 37 ++++++++++-- shared/deployer/jmte.cluster.yaml | 26 +++++++++ 3 files changed, 151 insertions(+), 5 deletions(-) create mode 100644 debug-pod.yaml diff --git a/debug-pod.yaml b/debug-pod.yaml new file mode 100644 index 0000000000..b456001eaa --- /dev/null +++ b/debug-pod.yaml @@ -0,0 +1,93 @@ +apiVersion: v1 +kind: Pod +metadata: + annotations: + hub.jupyter.org/username: fperez + labels: + app: jupyterhub + chart: jupyterhub-1.1.1 + component: singleuser-server + heritage: jupyterhub + hub.jupyter.org/network-access-hub: "true" + hub.jupyter.org/network-access-proxy-http: "true" + hub.jupyter.org/servername: "" + hub.jupyter.org/username: fperez + release: prod + name: jupyter-fperez-debugging + namespace: prod +spec: + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - preference: + matchExpressions: + - key: hub.jupyter.org/node-purpose + operator: In + values: + - user + weight: 100 + containers: + - args: + - jupyterhub-singleuser + - --ip=0.0.0.0 + - --port=8888 + - --SingleUserNotebookApp.default_url=/lab + image: 286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env:c6d9558 + name: notebook + ports: + - containerPort: 8888 + name: notebook-port + protocol: TCP + resources: + requests: + cpu: 225m + memory: "939524096" + initContainers: + - command: + - sh + - -c + - id && chown 1000:1000 /home/jovyan /home/jovyan/shared && ls -lhd /home/jovyan & sleep infinity + image: busybox + imagePullPolicy: Always + name: volume-mount-ownership-fix + securityContext: + runAsUser: 0 + volumeMounts: + - mountPath: /home/jovyan + name: home + subPath: fperez + - mountPath: /home/jovyan/shared + name: home + subPath: _shared + nodeSelector: + 2i2c.org/node-cpu: "4" + priority: 0 + priorityClassName: prod-default-priority + restartPolicy: OnFailure + schedulerName: prod-user-scheduler + securityContext: + fsGroup: 100 + serviceAccount: s3-full-access + serviceAccountName: s3-full-access + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoSchedule + key: hub.jupyter.org/dedicated + operator: Equal + value: user + - effect: NoSchedule + key: hub.jupyter.org_dedicated + operator: Equal + value: user + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 300 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 300 + volumes: + - name: home + persistentVolumeClaim: + claimName: home-nfs \ No newline at end of file diff --git a/eksctl/eksctl-cluster-config.yaml b/eksctl/eksctl-cluster-config.yaml index 9b74aec009..b97ddee8be 100644 --- a/eksctl/eksctl-cluster-config.yaml +++ b/eksctl/eksctl-cluster-config.yaml @@ -15,9 +15,10 @@ # eksctl get nodegroups --cluster jmte # # eksctl delete nodegroup --config-file=eksctl-cluster-config.yaml --include "user-a-*,worker-a-*" --approve -# eksctl create nodegroup --config-file=eksctl-cluster-config.yaml --include "user-a-*,worker-a-*" -# eksctl delete nodegroup --cluster jmte --name core-a -# eksctl create nodegroup --cluster jmte --name core-a +# eksctl create nodegroup --config-file=eksctl-cluster-config.yaml --include "user-a-*,worker-a-*" --install-nvidia-plugin=false +# eksctl create nodegroup 
--config-file=eksctl-cluster-config.yaml --include "user-gpu-a-*" --install-nvidia-plugin=false +# eksctl delete nodegroup --cluster jmte --name core-a --approve +# eksctl create nodegroup --cluster jmte --name core-a --install-nvidia-plugin=false # # eksctl delete nodegroup --config-file=eksctl-cluster-config.yaml --include "user-a-*,worker-a-*" --approve && eksctl create nodegroup --config-file=eksctl-cluster-config.yaml --include "user-a-*,worker-a-*" # @@ -238,7 +239,31 @@ nodeGroups: # # For reference on the GPU device plugin that needs to be installed, but is # installed automatically by eksctl, see: - # https://eksctl.io/usage/gpu-support/#gpu-support + # https://eksctl.io/usage/gpu-support/#gpu-support. With that said, the + # daemonset must still have a toleration set manually on it. + # + # Do a `kubectl edit ds -n kube-system nvidia-device-plugin-daemonset` and add + # the following entries under tolerations: + # + # - effect: NoSchedule + # key: hub.jupyter.org/dedicated + # operator: Equal + # value: user + # - effect: NoSchedule + # key: hub.jupyter.org_dedicated + # operator: Equal + # value: user + # + # Sadly, something is making this change reset. I don't know why, but I + # suspect it happens whenever I do something with eksctl - perhaps whenever I + # do something with the nodegroup realted to GPU nodes. I think it resets + # whenever a GPU based nodegroup is created unless + # --install-nvidia-plugin=false it passed to the `eksctl create nodegroup` + # command. + # + # It seems I may need to specify additional tags also, with associated value + # for the GPU of choice: + # https://github.com/kubernetes/autoscaler/blob/e80ab518340f88f364fe3ef063f8303755125971/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go#L40-L47 # # The machine nodes AMI (what is installed when it starts) for GPU nodes may # require you to subscribe to the AMI and accept some license. For more info, @@ -260,9 +285,12 @@ nodeGroups: hub.jupyter.org/node-purpose: user 2i2c.org/node-cpu: "4" 2i2c.org/node-gpu: "1" + k8s.amazonaws.com/accelerator: "nvidia-tesla-t4" taints: hub.jupyter.org_dedicated: user:NoSchedule + nvidia.com/gpu: NoSchedule tags: + k8s.io/cluster-autoscaler/node-template/label/k8s.amazonaws.com/accelerator: "nvidia-tesla-t4" k8s.io/cluster-autoscaler/node-template/label/hub.jupyter.org/node-purpose: user k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-cpu: "4" k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-gpu: "1" @@ -292,7 +320,6 @@ nodeGroups: k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-cpu: "16" k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-gpu: "1" k8s.io/cluster-autoscaler/node-template/taint/hub.jupyter.org_dedicated: user:NoSchedule - k8s.io/cluster-autoscaler/node-template/taint/nvidia.com/gpu: NoSchedule iam: &user-iam withAddonPolicies: autoScaler: true diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml index 313da55c7a..b1609f4e81 100644 --- a/shared/deployer/jmte.cluster.yaml +++ b/shared/deployer/jmte.cluster.yaml @@ -53,6 +53,16 @@ hubs: url: https://jupytearth.org singleuser: + # extraFiles ref: https://zero-to-jupyterhub.readthedocs.io/en/latest/resources/reference.html#singleuser-extrafiles + # + # Example: + # + # extraFiles: + # bash-extras: + # mountPath: /etc/test.txt + # stringData: | + # hello world! + # Eksctl: The service account was created by eksctl. 
# serviceAccountName: &user-sa s3-full-access @@ -258,6 +268,9 @@ hubs: config: Authenticator: allowed_users: &users + # This is just listing a few of the users/admins, a lot of + # users has been added manually, see: + # https://github.com/pangeo-data/jupyter-earth/issues/53 - abbyazari # Abby Azari - andersy005 # Anderson Banihirwe - consideratio # Erik Sundell @@ -301,6 +314,19 @@ hubs: nodeSelector: null backend: scheduler: + # IMPORTANT: We have experienced that the scheduler can fail with + # 1GB memory limit. This was observed "stream closed" + # from the python client working against the + # Dask-Gateway created DaskCluster. + # + # CommClosedError: in : Stream is closed + # + cores: + request: 1 + limit: 64 + memory: + request: 2G + limit: 500G extraPodConfig: nodeSelector: hub.jupyter.org/node-purpose: user From 351adc500043b5f465f3b4659575bd125b106d8a Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 3 Sep 2021 04:05:19 +0200 Subject: [PATCH 16/43] jmte: disable JupyterLab collaborative mode, awaiting critical bugfixes --- shared/deployer/jmte.cluster.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml index b1609f4e81..4156e4bb2d 100644 --- a/shared/deployer/jmte.cluster.yaml +++ b/shared/deployer/jmte.cluster.yaml @@ -67,18 +67,18 @@ hubs: # serviceAccountName: &user-sa s3-full-access - # cmd: I've experimented with these settings to get a JupyterLab RTC - # setup functioning. It currently is, but is this what makes - # sense to get it to function? - # - # ref: https://github.com/jupyterlab-contrib/jupyterlab-link-share/issues/10#issuecomment-851899758 - # ref: https://github.com/jupyterlab/jupyterlab/blob/1c8ff104a99e294265e6cf476dcb46279b0c3593/binder/jupyter_notebook_config.py#L39 - # - # Note the default in z2jh is jupyterhub-singleuser. + # cmd: Note the default in z2jh is jupyterhub-singleuser. cmd: - jupyterhub-singleuser - - --LabApp.collaborative=True - - --ServerApp.allow_remote_access=True + # FIXME: Collaborative mode is disabled due to critical issues + # reported in + # https://discourse.jupyter.org/t/plans-on-bringing-rtc-to-jupyterhub/9813/13 + # seem to remain according to Tasha Snow. + # + # These issues may be resolved by + # https://github.com/jupyterlab/jupyterlab/pull/11599. + # + # - --LabApp.collaborative=True # Increased as we have experienced a too slow image pull at least # once. Our pods can take ~6-7 minutes to start on a new node it From f7f749b70a16926d8f0a0ce7837408f18da4f4f2 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Wed, 26 Jan 2022 01:44:52 +0100 Subject: [PATCH 17/43] jmte: adjust to basehub values refactoring --- shared/deployer/jmte.cluster.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml index 4156e4bb2d..95397b83d8 100644 --- a/shared/deployer/jmte.cluster.yaml +++ b/shared/deployer/jmte.cluster.yaml @@ -13,9 +13,12 @@ hubs: basehub: # Cloudformation: The EFS filesystem was created by cloudformation. 
# - nfsPVC: + nfs: enabled: true - nfs: + shareCreator: + enabled: true + pv: + serverIP: fs-01707b06.efs.us-west-2.amazonaws.com # mountOptions from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html mountOptions: - rsize=1048576 @@ -24,7 +27,6 @@ hubs: - soft # We pick soft over hard, so NFS lockups don't lead to hung processes - retrans=2 - noresvport - serverIP: fs-01707b06.efs.us-west-2.amazonaws.com # baseShareName is required to be just "/" so that we can create # various sub folders in the filesystem that our PV to access the # NFS server can reference successfully as it isn't supported to From 314d146957e567093597b4b07351bc2bc32a0d05 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Wed, 26 Jan 2022 01:45:24 +0100 Subject: [PATCH 18/43] jmte: github-app-auth-user: add related gitconfig and env vars --- shared/deployer/jmte.cluster.yaml | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml index 95397b83d8..0a121ebc33 100644 --- a/shared/deployer/jmte.cluster.yaml +++ b/shared/deployer/jmte.cluster.yaml @@ -56,14 +56,22 @@ hubs: singleuser: # extraFiles ref: https://zero-to-jupyterhub.readthedocs.io/en/latest/resources/reference.html#singleuser-extrafiles - # - # Example: - # - # extraFiles: - # bash-extras: - # mountPath: /etc/test.txt - # stringData: | - # hello world! + extraFiles: + # github-app-user-auth requires: + # - Installed python package + # - GITHUB_APP_CLIENT_ID environment set + # - This configuration + # + # NOTE: an associated GitHub App has been created by Erik Sundell + # aka. @consideRatio and can be configured by him at: + # https://github.com/settings/apps/hub-jupytearth-org-github-integ + # + github-app-user-auth: + mountPath: /etc/gitconfig + stringData: | + [credential] + helper = store --file=/tmp/github-app-git-credentials + # Eksctl: The service account was created by eksctl. # @@ -98,7 +106,11 @@ hubs: # SCRATCH_BUCKET: s3://jmte-scratch/$(JUPYTERHUB_USER) PANGEO_SCRATCH: s3://jmte-scratch/$(JUPYTERHUB_USER) - + # GITHUB_APP_CLIENT_ID, see notes in singleuser.extraFiles about + # this environment variable. Two entries are created as I think + # the shorter may be deprecated soon. + GITHUB_APP_CLIENT_ID: Iv1.a073b1649637af12 + GITHUB_APP_USER_AUTH_CLIENT_ID: Iv1.a073b1649637af12 initContainers: # Need to explicitly fix ownership here, since EFS doesn't do anonuid - name: volume-mount-ownership-fix From b439dbfe7310bacc084f2c4fca6df10ad32cb694 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 28 Feb 2022 11:04:51 +0100 Subject: [PATCH 19/43] jmte: enable possibility to show hidden files --- shared/deployer/jmte.cluster.yaml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml index 0a121ebc33..af7b541699 100644 --- a/shared/deployer/jmte.cluster.yaml +++ b/shared/deployer/jmte.cluster.yaml @@ -66,12 +66,18 @@ hubs: # aka. 
@consideRatio and can be configured by him at: # https://github.com/settings/apps/hub-jupytearth-org-github-integ # - github-app-user-auth: + gitconfig: mountPath: /etc/gitconfig stringData: | [credential] helper = store --file=/tmp/github-app-git-credentials - + jupyter_notebook_config.json: + mountPath: /etc/jupyter/jupyter_notebook_config.json + data: + # Allow jupyterlab option to show hidden files in browser + # https://github.com/berkeley-dsep-infra/datahub/issues/3160 + ContentsManager: + allow_hidden: true # Eksctl: The service account was created by eksctl. # From 93d1acd993834ecabdf7fa3ca8dfe8c934ebb970 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 18 Mar 2022 18:08:19 +0100 Subject: [PATCH 20/43] jmte: adjust to deployer script rework --- config/clusters/jmte/cluster.yaml | 19 ++ config/clusters/jmte/common.values.yaml | 318 +++++++++++++++++++ config/clusters/jmte/prod.values.yaml | 90 ++++++ config/clusters/jmte/staging.values.yaml | 29 ++ debug-pod.yaml | 114 +++---- deployer/cluster.py | 2 +- deployer/cluster.schema.yaml | 2 +- eksctl/cloudformation-extras.yaml | 121 +++++-- eksctl/eksctl-cluster-config.yaml | 29 +- shared/deployer/jmte.cluster.yaml | 383 ----------------------- 10 files changed, 620 insertions(+), 487 deletions(-) create mode 100644 config/clusters/jmte/cluster.yaml create mode 100644 config/clusters/jmte/common.values.yaml create mode 100644 config/clusters/jmte/prod.values.yaml create mode 100644 config/clusters/jmte/staging.values.yaml delete mode 100644 shared/deployer/jmte.cluster.yaml diff --git a/config/clusters/jmte/cluster.yaml b/config/clusters/jmte/cluster.yaml new file mode 100644 index 0000000000..2b916d0f5b --- /dev/null +++ b/config/clusters/jmte/cluster.yaml @@ -0,0 +1,19 @@ +name: jmte +provider: none +hubs: + - name: staging + domain: staging.hub.jupytearth.org + helm_chart: daskhub + auth0: + connection: github + helm_chart_values_files: + - common.values.yaml + - staging.values.yaml + - name: prod + domain: hub.jupytearth.org + helm_chart: daskhub + auth0: + connection: github + helm_chart_values_files: + - common.values.yaml + - prod.values.yaml diff --git a/config/clusters/jmte/common.values.yaml b/config/clusters/jmte/common.values.yaml new file mode 100644 index 0000000000..1aa0faf34f --- /dev/null +++ b/config/clusters/jmte/common.values.yaml @@ -0,0 +1,318 @@ +basehub: + # Cloudformation: The EFS filesystem was created by cloudformation. + # + nfs: + # enabled is adjusted by staging/prod values + # enabled: true + shareCreator: + enabled: true + pv: + serverIP: fs-01707b06.efs.us-west-2.amazonaws.com + # mountOptions from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html + mountOptions: + - rsize=1048576 + - wsize=1048576 + - timeo=600 + - soft # We pick soft over hard, so NFS lockups don't lead to hung processes + - retrans=2 + - noresvport + # baseShareName is required to be just "/" so that we can create + # various sub folders in the filesystem that our PV to access the + # NFS server can reference successfully as it isn't supported to + # access a not yet existing folder. This creation is automated by + # the nfs-share-creator resource part of the basehub Helm chart. 
+ baseShareName: / + + jupyterhub: + custom: + homepage: + templateVars: + org: + name: Jupyter meets the Earth + logo_url: https://pangeo-data.github.io/jupyter-earth/_static/jupyter-earth.png + url: https://jupytearth.org + designed_by: + name: 2i2c + url: https://2i2c.org + operated_by: + name: 2i2c + url: https://2i2c.org + funded_by: + name: Jupyter meets the Earth + url: https://jupytearth.org + + scheduling: + userScheduler: + # Revert basehubs default that relies on GKE's built in scheduler that + # is optimized to pack pods into busy nodes. This is a AWS EKS based + # hub without such default scheduler. + enabled: true + + singleuser: + # extraFiles ref: https://zero-to-jupyterhub.readthedocs.io/en/latest/resources/reference.html#singleuser-extrafiles + extraFiles: + # github-app-user-auth requires: + # - Installed python package + # - GITHUB_APP_CLIENT_ID environment set + # - This configuration + # + # NOTE: an associated GitHub App has been created by Erik Sundell + # aka. @consideRatio and can be configured by him at: + # https://github.com/settings/apps/hub-jupytearth-org-github-integ + # + gitconfig: + mountPath: /etc/gitconfig + stringData: | + [credential] + helper = store --file=/tmp/github-app-git-credentials + jupyter_notebook_config.json: + mountPath: /etc/jupyter/jupyter_notebook_config.json + data: + # Allow jupyterlab option to show hidden files in browser + # https://github.com/berkeley-dsep-infra/datahub/issues/3160 + ContentsManager: + allow_hidden: true + + # Eksctl: The service account was created by eksctl. + # + # serviceAccountName is added to prod values + # serviceAccountName: &user-sa s3-full-access + + # Increased as we have experienced a too slow image pull at least + # once. Our pods can take ~6-7 minutes to start on a new node it + # seems, so this gives us some margin. + startTimeout: 900 + + extraEnv: + # GITHUB_APP_CLIENT_ID, see notes in singleuser.extraFiles about + # this environment variable. Two entries are created as I think + # the shorter may be deprecated soon. + GITHUB_APP_CLIENT_ID: Iv1.a073b1649637af12 + GITHUB_APP_USER_AUTH_CLIENT_ID: Iv1.a073b1649637af12 + + image: + # NOTE: We use the jupyterhub-configurator so this image/tag is not + # relevant. Visit its UI to configure the hub. + # + # staging: https://staging.hub.jupytearth.org/services/configurator/ + # prod: https://hub.jupytearth.org/services/configurator/ + pullPolicy: Always + name: 286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env + tag: "latest" + + profileList: + - display_name: "16th of Medium: 0.25-4 CPU, 1-16 GB" + default: True + description: "A shared machine, the recommended option until you experience a limitation." + kubespawner_override: + cpu_guarantee: 0.225 + mem_guarantee: 0.875G + mem_limit: null + node_selector: { 2i2c.org/node-cpu: "4" } + extra_resource_limits: {} + - display_name: "4th of Medium: 1-4 CPU, 4-16 GB" + description: "A shared machine." + kubespawner_override: + cpu_guarantee: 0.875 + mem_guarantee: 3.5G + mem_limit: null + node_selector: { 2i2c.org/node-cpu: "4" } + extra_resource_limits: {} + - display_name: "Medium: 4 CPU, 16 GB" + description: "A dedicated machine for you." + kubespawner_override: + cpu_guarantee: 3.5 + mem_guarantee: 14G + mem_limit: null + node_selector: { 2i2c.org/node-cpu: "4" } + extra_resource_limits: {} + - display_name: "Large: 16 CPU, 64 GB" + description: "A dedicated machine for you." 
+ kubespawner_override: + mem_guarantee: 56G + mem_limit: null + node_selector: { 2i2c.org/node-cpu: "16" } + extra_resource_limits: {} + - display_name: "Massive: 64 CPU, 256 GB" + description: "A dedicated machine for you." + kubespawner_override: + mem_guarantee: 224G + mem_limit: null + node_selector: { 2i2c.org/node-cpu: "64" } + extra_resource_limits: {} + - display_name: "Massive high-memory: 64 CPU, 976 GB" + description: "A dedicated machine for you." + kubespawner_override: + mem_guarantee: 900G + mem_limit: null + node_selector: { 2i2c.org/node-highmem-cpu: "64" } + extra_resource_limits: {} + - display_name: "Medium GPU: 4 CPU, 16 GB, 1 T4 Tensor Core GPU" + description: "A dedicated machine for you with one GPU attached." + kubespawner_override: + cpu_guarantee: 3.5 + mem_guarantee: 14G + mem_limit: null + node_selector: { 2i2c.org/node-cpu: "4", 2i2c.org/node-gpu: "1" } + extra_resource_limits: + nvidia.com/gpu: "1" + - display_name: "Large GPU: 16 CPU, 64 GB, 1 T4 Tensor Core GPU" + description: "A dedicated machine for you with one GPU attached." + kubespawner_override: + mem_guarantee: 56G + mem_limit: null + node_selector: { 2i2c.org/node-cpu: "16", 2i2c.org/node-gpu: "1" } + extra_resource_limits: + nvidia.com/gpu: "1" + + proxy: + # proxy notes: + # + # - Revert basehubs overrides as we don't install ingress-nginx and + # cert-manager yet, and therefore should use + # service.type=LoadBalancer instead of service.type=ClusterIP. + # Along with this, we also make use of the autohttps system that + # requires us to configure an letsencrypt email. + # + https: + enabled: true + type: letsencrypt + letsencrypt: + contactEmail: erik@sundellopensource.se + + service: + # Revert an unwanted basehub default + type: LoadBalancer + annotations: + service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600" + + # jupyterhub-ssh/sftp integration part 1/3: + # + # We must accept traffic to the k8s Service (proxy-public) receiving traffic + # from the internet. Port 22 is typically used for both SSH and SFTP, but we + # can't use the same port for both so we use 2222 for SFTP in this example. + # + extraPorts: + - name: ssh + port: 22 + targetPort: ssh + - name: sftp + port: 2222 + targetPort: sftp + traefik: + # jupyterhub-ssh/sftp integration part 2/3: + # + # We must accept traffic arriving to the autohttps pod (traefik) from the + # proxy-public service. Expose a port and update the NetworkPolicy + # to tolerate incoming (ingress) traffic on the exposed port. 
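      # For context, end users reach these ports with ordinary SSH/SFTP
      # clients, logging in with their hub username and (typically) a
      # JupyterHub API token as the password. A rough sketch using paramiko,
      # assuming it is installed on the user's machine; the hostname and the
      # ports match the proxy-public service configured above:
      #
      #   import paramiko
      #
      #   client = paramiko.SSHClient()
      #   client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
      #   client.connect(
      #       "hub.jupytearth.org",
      #       port=22,
      #       username="<hub-username>",
      #       password="<jupyterhub-api-token>",
      #   )
      #   stdin, stdout, stderr = client.exec_command("ls ~")
      #   print(stdout.read().decode())
      #   # SFTP uses port 2222 instead, e.g. via paramiko.Transport.
      #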
+ # + extraPorts: + - name: ssh + containerPort: 8022 + - name: sftp + containerPort: 2222 + networkPolicy: + allowedIngressPorts: [http, https, ssh, sftp] + # jupyterhub-ssh/sftp integration part 3/3: + # + # extraStaticConfig is adjusted by staging/prod values + # extraDynamicConfig is adjusted by staging/prod values + + hub: + config: + Authenticator: + allowed_users: &users + # This is just listing a few of the users/admins, a lot of + # users has been added manually, see: + # https://github.com/pangeo-data/jupyter-earth/issues/53 + - abbyazari # Abby Azari + - andersy005 # Anderson Banihirwe + - consideratio # Erik Sundell + - choldgraf # Chris Holdgraf + - elliesch # Ellie Abrahams + - EMscience # Edom Moges + - espg # Shane Grigsby + - facusapienza21 # Facundo Sapienza + - fperez # Fernando Pérez + - kmpaul # Kevin Paul + - lrennels # Lisa Rennels + - mrsiegfried # Matthew Siegfried + - tsnow03 # Tasha Snow + - whyjz # Whyjay Zheng + - yuvipanda # Yuvi Panda + - jonathan-taylor # Jonathan Taylor + admin_users: *users + allowNamedServers: true + networkPolicy: + # FIXME: Required for dask gateway 0.9.0. It is fixed but a Helm + # chart of newer version is not yet released. + enabled: false + +dask-gateway: + # dask-gateway notes: + # + # - Explicitly unset daskhub's nodeSelectors for all pods except the + # worker pods. The tolerations applied in the basehub config to all + # non-worker pods in dask-gateway will provide a preferred affinity + # towards suitable nodes without needing to have a label on them. Then + # we use the node label "k8s.dask.org/node-purpose: worker" + # specifically for enforce workers to schedule on such nodes. + # + traefik: + nodeSelector: null + controller: + nodeSelector: null + gateway: + nodeSelector: null + backend: + scheduler: + # IMPORTANT: We have experienced that the scheduler can fail with + # 1GB memory limit. This was observed "stream closed" + # from the python client working against the + # Dask-Gateway created DaskCluster. + # + # CommClosedError: in : Stream is closed + # + cores: + request: 1 + limit: 64 + memory: + request: 2G + limit: 500G + extraPodConfig: + nodeSelector: + hub.jupyter.org/node-purpose: user + k8s.dask.org/node-purpose: null + # serviceAccountName is adjusted by staging/prod values + # serviceAccountName: *user-sa + worker: + extraPodConfig: + nodeSelector: + k8s.dask.org/node-purpose: worker + # serviceAccountName is adjusted by staging/prod values + # serviceAccountName: *user-sa + + extraConfig: + idle: | + # timeout after 30 minutes of inactivity + c.KubeClusterConfig.idle_timeout = 1800 + limits: | + # per Dask cluster limits. + c.ClusterConfig.cluster_max_cores = 256 + c.ClusterConfig.cluster_max_memory = "1028G" + +# jupyterhub-ssh values.yaml reference: +# https://github.com/yuvipanda/jupyterhub-ssh/blob/main/helm-chart/jupyterhub-ssh/values.yaml +# +jupyterhub-ssh: + hubUrl: http://proxy-http:8000 + + ssh: + enabled: true + + sftp: + # enabled is adjusted by staging/prod values + # enabled: true + pvc: + enabled: true + name: home-nfs diff --git a/config/clusters/jmte/prod.values.yaml b/config/clusters/jmte/prod.values.yaml new file mode 100644 index 0000000000..9ff43eca6a --- /dev/null +++ b/config/clusters/jmte/prod.values.yaml @@ -0,0 +1,90 @@ +basehub: + nfs: + enabled: true + + jupyterhub: + singleuser: + # Eksctl: The service account was created by eksctl. 
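      # The s3-full-access service account referenced just below is what gives
      # user (and dask) pods S3 access via IRSA without mounting explicit
      # credentials, so the SCRATCH_BUCKET / PANGEO_SCRATCH variables defined
      # further down can be used directly. A minimal sketch from a user
      # server, assuming s3fs is available in the image:
      #
      #   import os
      #   import s3fs
      #
      #   scratch = os.environ["SCRATCH_BUCKET"]  # s3://jmte-scratch/<username>
      #   fs = s3fs.S3FileSystem()                # picks up the IRSA credentials
      #   with fs.open(f"{scratch}/hello.txt", "w") as f:
      #       f.write("hello from the hub")
      #   print(fs.ls(scratch))
      #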
+ # + serviceAccountName: &user-sa s3-full-access + + extraEnv: + # SCRATCH_BUCKET / PANGEO_SCRATCH are environment variables that + # help users write notebooks and such referencing this environment + # variable in a way that will work between users. + # + # $(ENV_VAR) will by evaluated by k8s automatically + # + # Cloudformation: The s3 bucket was created by cloudformation. + # + SCRATCH_BUCKET: s3://jmte-scratch/$(JUPYTERHUB_USER) + PANGEO_SCRATCH: s3://jmte-scratch/$(JUPYTERHUB_USER) + + initContainers: + # Need to explicitly fix ownership here, since EFS doesn't do anonuid + - name: volume-mount-ownership-fix + image: busybox + command: + [ + "sh", + "-c", + "id && chown 1000:1000 /home/jovyan /home/jovyan/shared && ls -lhd /home/jovyan", + ] + securityContext: + runAsUser: 0 + volumeMounts: + - name: home + mountPath: /home/jovyan + subPath: "{username}" + - name: home + mountPath: /home/jovyan/shared + subPath: _shared + + proxy: + traefik: + # jupyterhub-ssh/sftp integration part 3/3: + # + # We must let traefik know it should listen for traffic (traefik entrypoint) + # and route it (traefik router) onwards to the jupyterhub-ssh k8s Service + # (traefik service). + # + extraStaticConfig: + entryPoints: + ssh-entrypoint: + address: :8022 + sftp-entrypoint: + address: :2222 + extraDynamicConfig: + tcp: + services: + ssh-service: + loadBalancer: + servers: + - address: jupyterhub-ssh:22 + sftp-service: + loadBalancer: + servers: + - address: jupyterhub-sftp:22 + routers: + ssh-router: + entrypoints: [ssh-entrypoint] + rule: HostSNI(`*`) + service: ssh-service + sftp-router: + entrypoints: [sftp-entrypoint] + rule: HostSNI(`*`) + service: sftp-service + +dask-gateway: + gateway: + backend: + scheduler: + extraPodConfig: + serviceAccountName: *user-sa + worker: + extraPodConfig: + serviceAccountName: *user-sa + +jupyterhub-ssh: + sftp: + enabled: true diff --git a/config/clusters/jmte/staging.values.yaml b/config/clusters/jmte/staging.values.yaml new file mode 100644 index 0000000000..570bd8ebe2 --- /dev/null +++ b/config/clusters/jmte/staging.values.yaml @@ -0,0 +1,29 @@ +basehub: + nfs: + enabled: false + + jupyterhub: + custom: + singleuserAdmin: + extraVolumeMounts: [] + + singleuser: + storage: + type: none + extraVolumeMounts: [] + + # cmd: Note the default in z2jh is jupyterhub-singleuser. + cmd: + - jupyterhub-singleuser + # WARNING: Collaborative mode is enabled in the staging hub specifically + # to debug a critical issue leading to a loss of data. 
+ # + # ref: https://github.com/jupyterlab/jupyterlab/issues/12154#issuecomment-1069352840 + # ref: https://discourse.jupyter.org/t/plans-on-bringing-rtc-to-jupyterhub/9813/13 + # ref: https://github.com/jupyterlab/jupyterlab/pull/11599 + # + - --LabApp.collaborative=True + +jupyterhub-ssh: + sftp: + enabled: false diff --git a/debug-pod.yaml b/debug-pod.yaml index b456001eaa..53a6d76a7d 100644 --- a/debug-pod.yaml +++ b/debug-pod.yaml @@ -19,46 +19,46 @@ spec: affinity: nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - - preference: - matchExpressions: - - key: hub.jupyter.org/node-purpose - operator: In - values: - - user - weight: 100 + - preference: + matchExpressions: + - key: hub.jupyter.org/node-purpose + operator: In + values: + - user + weight: 100 containers: - - args: - - jupyterhub-singleuser - - --ip=0.0.0.0 - - --port=8888 - - --SingleUserNotebookApp.default_url=/lab - image: 286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env:c6d9558 - name: notebook - ports: - - containerPort: 8888 - name: notebook-port - protocol: TCP - resources: - requests: - cpu: 225m - memory: "939524096" + - args: + - jupyterhub-singleuser + - --ip=0.0.0.0 + - --port=8888 + - --SingleUserNotebookApp.default_url=/lab + image: 286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env:c6d9558 + name: notebook + ports: + - containerPort: 8888 + name: notebook-port + protocol: TCP + resources: + requests: + cpu: 225m + memory: "939524096" initContainers: - - command: - - sh - - -c - - id && chown 1000:1000 /home/jovyan /home/jovyan/shared && ls -lhd /home/jovyan & sleep infinity - image: busybox - imagePullPolicy: Always - name: volume-mount-ownership-fix - securityContext: - runAsUser: 0 - volumeMounts: - - mountPath: /home/jovyan - name: home - subPath: fperez - - mountPath: /home/jovyan/shared - name: home - subPath: _shared + - command: + - sh + - -c + - id && chown 1000:1000 /home/jovyan /home/jovyan/shared && ls -lhd /home/jovyan & sleep infinity + image: busybox + imagePullPolicy: Always + name: volume-mount-ownership-fix + securityContext: + runAsUser: 0 + volumeMounts: + - mountPath: /home/jovyan + name: home + subPath: fperez + - mountPath: /home/jovyan/shared + name: home + subPath: _shared nodeSelector: 2i2c.org/node-cpu: "4" priority: 0 @@ -71,23 +71,23 @@ spec: serviceAccountName: s3-full-access terminationGracePeriodSeconds: 30 tolerations: - - effect: NoSchedule - key: hub.jupyter.org/dedicated - operator: Equal - value: user - - effect: NoSchedule - key: hub.jupyter.org_dedicated - operator: Equal - value: user - - effect: NoExecute - key: node.kubernetes.io/not-ready - operator: Exists - tolerationSeconds: 300 - - effect: NoExecute - key: node.kubernetes.io/unreachable - operator: Exists - tolerationSeconds: 300 + - effect: NoSchedule + key: hub.jupyter.org/dedicated + operator: Equal + value: user + - effect: NoSchedule + key: hub.jupyter.org_dedicated + operator: Equal + value: user + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 300 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 300 volumes: - - name: home - persistentVolumeClaim: - claimName: home-nfs \ No newline at end of file + - name: home + persistentVolumeClaim: + claimName: home-nfs diff --git a/deployer/cluster.py b/deployer/cluster.py index 91de692529..2a451fd2d6 100644 --- a/deployer/cluster.py +++ b/deployer/cluster.py @@ -31,7 +31,7 @@ def auth(self): yield from self.auth_azure() elif 
self.spec["provider"] == "kubeconfig": yield from self.auth_kubeconfig() - elif self.spec['provider'] == 'none': + elif self.spec["provider"] == "none": yield else: raise ValueError(f'Provider {self.spec["provider"]} not supported') diff --git a/deployer/cluster.schema.yaml b/deployer/cluster.schema.yaml index 19b2f58de2..f14a7f3691 100644 --- a/deployer/cluster.schema.yaml +++ b/deployer/cluster.schema.yaml @@ -27,7 +27,7 @@ properties: Cloud provider this cluster is running on. Used to perform authentication against the cluster. Currently supports gcp, aws, azure, and raw kubeconfig files. - enum: + enum: - none - gcp - kubeconfig diff --git a/eksctl/cloudformation-extras.yaml b/eksctl/cloudformation-extras.yaml index c80f541be8..2bf9fa60ce 100644 --- a/eksctl/cloudformation-extras.yaml +++ b/eksctl/cloudformation-extras.yaml @@ -78,7 +78,6 @@ Parameters: Type: String Default: ci-eks - # The resources we want to be created as part of this cloudformation stack Resources: # ref: https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-iam-user.html @@ -115,8 +114,8 @@ Resources: Effect: Allow Principal: AWS: !Join - - '' - - - 'arn:aws:iam::' + - "" + - - "arn:aws:iam::" - !Ref AWS::AccountId - :user/ - !Ref IamUser @@ -135,13 +134,13 @@ Resources: Action: - eks:DescribeCluster Resource: !Join - - '' - - - 'arn:aws:eks:' - - !Ref AWS::Region - - ':' - - !Ref AWS::AccountId - - ':cluster/' - - !Ref EksClusterName + - "" + - - "arn:aws:eks:" + - !Ref AWS::Region + - ":" + - !Ref AWS::AccountId + - ":cluster/" + - !Ref EksClusterName AssumeRolePolicyDocument: Version: 2012-10-17 Statement: @@ -149,8 +148,8 @@ Resources: Effect: Allow Principal: AWS: !Join - - '' - - - 'arn:aws:iam::' + - "" + - - "arn:aws:iam::" - !Ref AWS::AccountId - :user/ - !Ref IamUser @@ -170,22 +169,91 @@ Resources: Properties: FileSystemId: !GetAtt EfsFileSystem.FileSystemId SecurityGroups: - - {"Fn::ImportValue": {"Fn::Sub": "eksctl-${EksClusterName}-cluster::SharedNodeSecurityGroup"}} - SubnetId: { "Fn::Select": [0, { "Fn::Split": [",", {"Fn::ImportValue": {"Fn::Sub": "eksctl-${EksClusterName}-cluster::SubnetsPublic"}}]}] } + - { + "Fn::ImportValue": + { + "Fn::Sub": "eksctl-${EksClusterName}-cluster::SharedNodeSecurityGroup", + }, + } + SubnetId: + { + "Fn::Select": + [ + 0, + { + "Fn::Split": + [ + ",", + { + "Fn::ImportValue": + { + "Fn::Sub": "eksctl-${EksClusterName}-cluster::SubnetsPublic", + }, + }, + ], + }, + ], + } EfsMountTarget1: Type: AWS::EFS::MountTarget Properties: FileSystemId: !GetAtt EfsFileSystem.FileSystemId SecurityGroups: - - {"Fn::ImportValue": {"Fn::Sub" : "eksctl-${EksClusterName}-cluster::SharedNodeSecurityGroup"}} - SubnetId: { "Fn::Select": [1, { "Fn::Split": [",", {"Fn::ImportValue": {"Fn::Sub": "eksctl-${EksClusterName}-cluster::SubnetsPublic"}}]}] } + - { + "Fn::ImportValue": + { + "Fn::Sub": "eksctl-${EksClusterName}-cluster::SharedNodeSecurityGroup", + }, + } + SubnetId: + { + "Fn::Select": + [ + 1, + { + "Fn::Split": + [ + ",", + { + "Fn::ImportValue": + { + "Fn::Sub": "eksctl-${EksClusterName}-cluster::SubnetsPublic", + }, + }, + ], + }, + ], + } EfsMountTarget2: Type: AWS::EFS::MountTarget Properties: FileSystemId: !GetAtt EfsFileSystem.FileSystemId SecurityGroups: - - {"Fn::ImportValue": {"Fn::Sub" : "eksctl-${EksClusterName}-cluster::SharedNodeSecurityGroup"}} - SubnetId: { "Fn::Select": [2, { "Fn::Split": [",", {"Fn::ImportValue": {"Fn::Sub": "eksctl-${EksClusterName}-cluster::SubnetsPublic"}}]}] } + - { + "Fn::ImportValue": + { + "Fn::Sub": 
"eksctl-${EksClusterName}-cluster::SharedNodeSecurityGroup", + }, + } + SubnetId: + { + "Fn::Select": + [ + 2, + { + "Fn::Split": + [ + ",", + { + "Fn::ImportValue": + { + "Fn::Sub": "eksctl-${EksClusterName}-cluster::SubnetsPublic", + }, + }, + ], + }, + ], + } # ref: https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-s3-bucket.html # @@ -225,18 +293,18 @@ Resources: Effect: Allow Principal: AWS: !Join - - '' - - - 'arn:aws:iam::' + - "" + - - "arn:aws:iam::" - !Ref AWS::AccountId - :root - Action: 'kms:*' - Resource: '*' + Action: "kms:*" + Resource: "*" - Sid: Enable User Permissions Effect: Allow Principal: AWS: !Join - - '' - - - 'arn:aws:iam::' + - "" + - - "arn:aws:iam::" - !Ref AWS::AccountId - :user/ - !Ref IamUser @@ -247,8 +315,7 @@ Resources: - "kms:ReEncrypt*" - "kms:GenerateDataKey" - "kms:GenerateDataKeyWithoutPlaintext" - Resource: '*' - + Resource: "*" # The relevant information from the created resources. Outputs: @@ -264,7 +331,7 @@ Outputs: The Role with permission to push to our image registry. EcrRepository: Value: !Join - - '' + - "" - - !Ref AWS::AccountId - .dkr.ecr. - !Ref AWS::Region diff --git a/eksctl/eksctl-cluster-config.yaml b/eksctl/eksctl-cluster-config.yaml index b97ddee8be..2806d04454 100644 --- a/eksctl/eksctl-cluster-config.yaml +++ b/eksctl/eksctl-cluster-config.yaml @@ -26,8 +26,6 @@ # ref: https://github.com/2i2c-org/pangeo-hubs/blob/8e552bc198d8339efe8c003cb847849255e8f8ed/aws/eksctl-config.yaml # - - apiVersion: eksctl.io/v1alpha5 kind: ClusterConfig metadata: @@ -48,8 +46,6 @@ metadata: # availabilityZones: [us-west-2d, us-west-2b, us-west-2a] - - # This section will create additional k8s ServiceAccount's that are coupled with # AWS Role's. By declaring pods to use them, you can grant these pods the # associated permissions. For this deployment, we create a k8s ServiceAccount @@ -57,7 +53,7 @@ availabilityZones: [us-west-2d, us-west-2b, us-west-2a] # pods will make use of. # iam: - withOIDC: true # https://eksctl.io/usage/security/#withoidc + withOIDC: true # https://eksctl.io/usage/security/#withoidc # serviceAccounts like nodeGroups etc can be managed directly with eksctl, for # more information, see: https://eksctl.io/usage/iamserviceaccounts/ # @@ -79,8 +75,6 @@ iam: attachPolicyARNs: - arn:aws:iam::aws:policy/AmazonS3FullAccess - - # Choose the type of node group? # - nodeGroups cannot be updated but must be recreated on changes: # https://eksctl.io/usage/managing-nodegroups/#nodegroup-immutability @@ -111,8 +105,8 @@ iam: # nodeGroups: - name: core-a - availabilityZones: [us-west-2d] # aws ec2 describe-availability-zones --region - instanceType: m5.large # 28 pods, 2 cpu, 8 GB + availabilityZones: [us-west-2d] # aws ec2 describe-availability-zones --region + instanceType: m5.large # 28 pods, 2 cpu, 8 GB minSize: 0 maxSize: 2 desiredCapacity: 1 @@ -325,8 +319,6 @@ nodeGroups: autoScaler: true efs: true - - # Worker node pools using cheaper spot instances that are temporary. # # References: @@ -346,7 +338,8 @@ nodeGroups: # and was just part of YAML 1.1 but not 1.0 or 1.2. 
# - name: worker-a-4 - availabilityZones: &worker-availabilityZones [us-west-2d, us-west-2b, us-west-2a] + availabilityZones: + &worker-availabilityZones [us-west-2d, us-west-2b, us-west-2a] minSize: &worker-minSize 0 maxSize: &worker-maxSize 8 desiredCapacity: &worker-desiredCapacity 0 @@ -367,8 +360,8 @@ nodeGroups: # Spot instance specific configuration instancesDistribution: instanceTypes: - - m5a.xlarge # 57 pods, 4 cpu, 16 GB (AMD, 10 GBits network, 100% cost) - - m5.xlarge # 57 pods, 4 cpu, 16 GB (Intel, 10 GBits network, ~112% cost) + - m5a.xlarge # 57 pods, 4 cpu, 16 GB (AMD, 10 GBits network, 100% cost) + - m5.xlarge # 57 pods, 4 cpu, 16 GB (Intel, 10 GBits network, ~112% cost) # - m5n.xlarge # 57 pods, 4 cpu, 16 GB (Intel, 25 GBits network, ~139% cost) onDemandBaseCapacity: &worker-onDemandBaseCapacity 0 onDemandPercentageAboveBaseCapacity: &worker-onDemandPercentageAboveBaseCapacity 0 @@ -392,8 +385,8 @@ nodeGroups: iam: *worker-iam instancesDistribution: instanceTypes: - - m5a.4xlarge # 233 pods, 16 cpu, 64 GB (AMD, 10 GBits network, 100% cost) - - m5.4xlarge # 233 pods, 16 cpu, 64 GB (Intel, 10 GBits network, ~112% cost) + - m5a.4xlarge # 233 pods, 16 cpu, 64 GB (AMD, 10 GBits network, 100% cost) + - m5.4xlarge # 233 pods, 16 cpu, 64 GB (Intel, 10 GBits network, ~112% cost) # - m5n.4xlarge # 233 pods, 16 cpu, 64 GB (Intel, 25 GBits network, ~139% cost) onDemandBaseCapacity: *worker-onDemandBaseCapacity onDemandPercentageAboveBaseCapacity: *worker-onDemandPercentageAboveBaseCapacity @@ -417,8 +410,8 @@ nodeGroups: iam: *worker-iam instancesDistribution: instanceTypes: - - m5a.16xlarge # 736 pods, 64 cpu, 256 GB (AMD, 12 GBits network, 100% cost) - - m5.16xlarge # 736 pods, 64 cpu, 256 GB (Intel, 20 GBits network, ~112% cost) + - m5a.16xlarge # 736 pods, 64 cpu, 256 GB (AMD, 12 GBits network, 100% cost) + - m5.16xlarge # 736 pods, 64 cpu, 256 GB (Intel, 20 GBits network, ~112% cost) # - m5n.16xlarge # 736 pods, 64 cpu, 256 GB (Intel, 75 GBits network, ~139% cost) onDemandBaseCapacity: *worker-onDemandBaseCapacity onDemandPercentageAboveBaseCapacity: *worker-onDemandPercentageAboveBaseCapacity diff --git a/shared/deployer/jmte.cluster.yaml b/shared/deployer/jmte.cluster.yaml deleted file mode 100644 index af7b541699..0000000000 --- a/shared/deployer/jmte.cluster.yaml +++ /dev/null @@ -1,383 +0,0 @@ -name: jmte -provider: none -# kubeconfig: -# file: secrets/jmte.yaml -hubs: - - name: prod - domain: hub.jupytearth.org - template: daskhub - auth0: - connection: github - config: &config - - basehub: - # Cloudformation: The EFS filesystem was created by cloudformation. - # - nfs: - enabled: true - shareCreator: - enabled: true - pv: - serverIP: fs-01707b06.efs.us-west-2.amazonaws.com - # mountOptions from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html - mountOptions: - - rsize=1048576 - - wsize=1048576 - - timeo=600 - - soft # We pick soft over hard, so NFS lockups don't lead to hung processes - - retrans=2 - - noresvport - # baseShareName is required to be just "/" so that we can create - # various sub folders in the filesystem that our PV to access the - # NFS server can reference successfully as it isn't supported to - # access a not yet existing folder. This creation is automated by - # the nfs-share-creator resource part of the basehub Helm chart. 
- baseShareName: / - - - - jupyterhub: - custom: - homepage: - templateVars: - org: - name: Jupyter meets the Earth - logo_url: https://pangeo-data.github.io/jupyter-earth/_static/jupyter-earth.png - url: https://jupytearth.org - designed_by: - name: 2i2c - url: https://2i2c.org - operated_by: - name: 2i2c - url: https://2i2c.org - funded_by: - name: Jupyter meets the Earth - url: https://jupytearth.org - - singleuser: - # extraFiles ref: https://zero-to-jupyterhub.readthedocs.io/en/latest/resources/reference.html#singleuser-extrafiles - extraFiles: - # github-app-user-auth requires: - # - Installed python package - # - GITHUB_APP_CLIENT_ID environment set - # - This configuration - # - # NOTE: an associated GitHub App has been created by Erik Sundell - # aka. @consideRatio and can be configured by him at: - # https://github.com/settings/apps/hub-jupytearth-org-github-integ - # - gitconfig: - mountPath: /etc/gitconfig - stringData: | - [credential] - helper = store --file=/tmp/github-app-git-credentials - jupyter_notebook_config.json: - mountPath: /etc/jupyter/jupyter_notebook_config.json - data: - # Allow jupyterlab option to show hidden files in browser - # https://github.com/berkeley-dsep-infra/datahub/issues/3160 - ContentsManager: - allow_hidden: true - - # Eksctl: The service account was created by eksctl. - # - serviceAccountName: &user-sa s3-full-access - - # cmd: Note the default in z2jh is jupyterhub-singleuser. - cmd: - - jupyterhub-singleuser - # FIXME: Collaborative mode is disabled due to critical issues - # reported in - # https://discourse.jupyter.org/t/plans-on-bringing-rtc-to-jupyterhub/9813/13 - # seem to remain according to Tasha Snow. - # - # These issues may be resolved by - # https://github.com/jupyterlab/jupyterlab/pull/11599. - # - # - --LabApp.collaborative=True - - # Increased as we have experienced a too slow image pull at least - # once. Our pods can take ~6-7 minutes to start on a new node it - # seems, so this gives us some margin. - startTimeout: 900 - - extraEnv: - # SCRATCH_BUCKET / PANGEO_SCRATCH are environment variables that - # help users write notebooks and such referencing this environment - # variable in a way that will work between users. - # - # $(ENV_VAR) will by evaluated by k8s automatically - # - # Cloudformation: The s3 bucket was created by cloudformation. - # - SCRATCH_BUCKET: s3://jmte-scratch/$(JUPYTERHUB_USER) - PANGEO_SCRATCH: s3://jmte-scratch/$(JUPYTERHUB_USER) - # GITHUB_APP_CLIENT_ID, see notes in singleuser.extraFiles about - # this environment variable. Two entries are created as I think - # the shorter may be deprecated soon. - GITHUB_APP_CLIENT_ID: Iv1.a073b1649637af12 - GITHUB_APP_USER_AUTH_CLIENT_ID: Iv1.a073b1649637af12 - initContainers: - # Need to explicitly fix ownership here, since EFS doesn't do anonuid - - name: volume-mount-ownership-fix - image: busybox - command: ["sh", "-c", "id && chown 1000:1000 /home/jovyan /home/jovyan/shared && ls -lhd /home/jovyan"] - securityContext: - runAsUser: 0 - volumeMounts: - - name: home - mountPath: /home/jovyan - subPath: "{username}" - - name: home - mountPath: /home/jovyan/shared - subPath: _shared - - image: - name: pangeo/pangeo-notebook - tag: "2021.05.15" # https://hub.docker.com/r/pangeo/pangeo-notebook/tags - - profileList: - - display_name: "16th of Medium: 0.25-4 CPU, 1-16 GB" - default: True - description: "A shared machine, the recommended option until you experience a limitation." 
- kubespawner_override: - cpu_guarantee: 0.225 - mem_guarantee: 0.875G - mem_limit: null - node_selector: { 2i2c.org/node-cpu: "4" } - extra_resource_limits: {} - - display_name: "4th of Medium: 1-4 CPU, 4-16 GB" - description: "A shared machine." - kubespawner_override: - cpu_guarantee: 0.875 - mem_guarantee: 3.5G - mem_limit: null - node_selector: { 2i2c.org/node-cpu: "4" } - extra_resource_limits: {} - - display_name: "Medium: 4 CPU, 16 GB" - description: "A dedicated machine for you." - kubespawner_override: - cpu_guarantee: 3.5 - mem_guarantee: 14G - mem_limit: null - node_selector: { 2i2c.org/node-cpu: "4" } - extra_resource_limits: {} - - display_name: "Large: 16 CPU, 64 GB" - description: "A dedicated machine for you." - kubespawner_override: - mem_guarantee: 56G - mem_limit: null - node_selector: { 2i2c.org/node-cpu: "16" } - extra_resource_limits: {} - - display_name: "Massive: 64 CPU, 256 GB" - description: "A dedicated machine for you." - kubespawner_override: - mem_guarantee: 224G - mem_limit: null - node_selector: { 2i2c.org/node-cpu: "64" } - extra_resource_limits: {} - - display_name: "Massive high-memory: 64 CPU, 976 GB" - description: "A dedicated machine for you." - kubespawner_override: - mem_guarantee: 900G - mem_limit: null - node_selector: { 2i2c.org/node-highmem-cpu: "64" } - extra_resource_limits: {} - - display_name: "Medium GPU: 4 CPU, 16 GB, 1 T4 Tensor Core GPU" - description: "A dedicated machine for you with one GPU attached." - kubespawner_override: - cpu_guarantee: 3.5 - mem_guarantee: 14G - mem_limit: null - node_selector: { 2i2c.org/node-cpu: "4", 2i2c.org/node-gpu: "1" } - extra_resource_limits: - nvidia.com/gpu: "1" - - display_name: "Large GPU: 16 CPU, 64 GB, 1 T4 Tensor Core GPU" - description: "A dedicated machine for you with one GPU attached." - kubespawner_override: - mem_guarantee: 56G - mem_limit: null - node_selector: { 2i2c.org/node-cpu: "16", 2i2c.org/node-gpu: "1" } - extra_resource_limits: - nvidia.com/gpu: "1" - - proxy: - # proxy notes: - # - # - Revert basehubs overrides as we don't install ingress-nginx and - # cert-manager yet, and therefore should use - # service.type=LoadBalancer instead of service.type=ClusterIP. - # Along with this, we also make use of the autohttps system that - # requires us to configure an letsencrypt email. - # - https: - enabled: true - type: letsencrypt - letsencrypt: - contactEmail: erik@sundellopensource.se - - service: - # Revert an unwanted basehub default - type: LoadBalancer - annotations: - service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600" - - # jupyterhub-ssh/sftp integration part 1/3: - # - # We must accept traffic to the k8s Service (proxy-public) receiving traffic - # from the internet. Port 22 is typically used for both SSH and SFTP, but we - # can't use the same port for both so we use 2222 for SFTP in this example. - # - extraPorts: - - name: ssh - port: 22 - targetPort: ssh - - name: sftp - port: 2222 - targetPort: sftp - traefik: - # jupyterhub-ssh/sftp integration part 2/3: - # - # We must accept traffic arriving to the autohttps pod (traefik) from the - # proxy-public service. Expose a port and update the NetworkPolicy - # to tolerate incoming (ingress) traffic on the exposed port. 
- # - extraPorts: - - name: ssh - containerPort: 8022 - - name: sftp - containerPort: 2222 - networkPolicy: - allowedIngressPorts: [http, https, ssh, sftp] - # jupyterhub-ssh/sftp integration part 3/3: - # - # We must let traefik know it should listen for traffic (traefik entrypoint) - # and route it (traefik router) onwards to the jupyterhub-ssh k8s Service - # (traefik service). - # - extraStaticConfig: - entryPoints: - ssh-entrypoint: - address: :8022 - sftp-entrypoint: - address: :2222 - extraDynamicConfig: - tcp: - services: - ssh-service: - loadBalancer: - servers: - - address: jupyterhub-ssh:22 - sftp-service: - loadBalancer: - servers: - - address: jupyterhub-sftp:22 - routers: - ssh-router: - entrypoints: [ssh-entrypoint] - rule: HostSNI(`*`) - service: ssh-service - sftp-router: - entrypoints: [sftp-entrypoint] - rule: HostSNI(`*`) - service: sftp-service - - - - hub: - config: - Authenticator: - allowed_users: &users - # This is just listing a few of the users/admins, a lot of - # users has been added manually, see: - # https://github.com/pangeo-data/jupyter-earth/issues/53 - - abbyazari # Abby Azari - - andersy005 # Anderson Banihirwe - - consideratio # Erik Sundell - - choldgraf # Chris Holdgraf - - elliesch # Ellie Abrahams - - EMscience # Edom Moges - - espg # Shane Grigsby - - facusapienza21 # Facundo Sapienza - - fperez # Fernando Pérez - - kmpaul # Kevin Paul - - lrennels # Lisa Rennels - - mrsiegfried # Matthew Siegfried - - tsnow03 # Tasha Snow - - whyjz # Whyjay Zheng - - yuvipanda # Yuvi Panda - - jonathan-taylor # Jonathan Taylor - admin_users: *users - allowNamedServers: true - networkPolicy: - # FIXME: Required for dask gateway 0.9.0. It is fixed but a Helm - # chart of newer version is not yet released. - enabled: false - - - - dask-gateway: - # dask-gateway notes: - # - # - Explicitly unset daskhub's nodeSelectors for all pods except the - # worker pods. The tolerations applied in the basehub config to all - # non-worker pods in dask-gateway will provide a preferred affinity - # towards suitable nodes without needing to have a label on them. Then - # we use the node label "k8s.dask.org/node-purpose: worker" - # specifically for enforce workers to schedule on such nodes. - # - traefik: - nodeSelector: null - controller: - nodeSelector: null - gateway: - nodeSelector: null - backend: - scheduler: - # IMPORTANT: We have experienced that the scheduler can fail with - # 1GB memory limit. This was observed "stream closed" - # from the python client working against the - # Dask-Gateway created DaskCluster. - # - # CommClosedError: in : Stream is closed - # - cores: - request: 1 - limit: 64 - memory: - request: 2G - limit: 500G - extraPodConfig: - nodeSelector: - hub.jupyter.org/node-purpose: user - k8s.dask.org/node-purpose: null - serviceAccountName: *user-sa - worker: - extraPodConfig: - nodeSelector: - k8s.dask.org/node-purpose: worker - serviceAccountName: *user-sa - - extraConfig: - idle: | - # timeout after 30 minutes of inactivity - c.KubeClusterConfig.idle_timeout = 1800 - limits: | - # per Dask cluster limits. 
- c.ClusterConfig.cluster_max_cores = 256 - c.ClusterConfig.cluster_max_memory = "1028G" - - # jupyterhub-ssh values.yaml reference: - # https://github.com/yuvipanda/jupyterhub-ssh/blob/main/helm-chart/jupyterhub-ssh/values.yaml - # - jupyterhub-ssh: - hubUrl: http://proxy-http:8000 - - ssh: - enabled: true - - sftp: - enabled: true - pvc: - enabled: true - name: home-nfs From ce1652272e001224da87bb2dfb83fde9a6006c24 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Thu, 21 Apr 2022 20:18:58 +0200 Subject: [PATCH 21/43] jmte: add recent changes --- config/clusters/jmte/cluster.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/config/clusters/jmte/cluster.yaml b/config/clusters/jmte/cluster.yaml index 2b916d0f5b..1489ccd8cd 100644 --- a/config/clusters/jmte/cluster.yaml +++ b/config/clusters/jmte/cluster.yaml @@ -10,10 +10,11 @@ hubs: - common.values.yaml - staging.values.yaml - name: prod + display_name: "Jupyter Meets the Earth" domain: hub.jupytearth.org - helm_chart: daskhub auth0: connection: github + helm_chart: daskhub helm_chart_values_files: - common.values.yaml - prod.values.yaml From ec6046bd9f694a3058261d4ad5e666102cae4d9d Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 29 Apr 2022 14:00:27 +0200 Subject: [PATCH 22/43] jmte: k8s 1.19 -> 1.22 upgrade and notes --- eksctl/eksctl-cluster-config.yaml | 66 +++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 8 deletions(-) diff --git a/eksctl/eksctl-cluster-config.yaml b/eksctl/eksctl-cluster-config.yaml index 2806d04454..24e788b9fc 100644 --- a/eksctl/eksctl-cluster-config.yaml +++ b/eksctl/eksctl-cluster-config.yaml @@ -16,6 +16,7 @@ # # eksctl delete nodegroup --config-file=eksctl-cluster-config.yaml --include "user-a-*,worker-a-*" --approve # eksctl create nodegroup --config-file=eksctl-cluster-config.yaml --include "user-a-*,worker-a-*" --install-nvidia-plugin=false +# eksctl delete nodegroup --config-file=eksctl-cluster-config.yaml --include "user-gpu-a-*" --approve # eksctl create nodegroup --config-file=eksctl-cluster-config.yaml --include "user-gpu-a-*" --install-nvidia-plugin=false # eksctl delete nodegroup --cluster jmte --name core-a --approve # eksctl create nodegroup --cluster jmte --name core-a --install-nvidia-plugin=false @@ -35,7 +36,60 @@ metadata: # dataset. # region: us-west-2 - version: "1.19" + # version: + # The k8s control plane version, to upgrade this, see + # https://eksctl.io/usage/cluster-upgrade/. + # + # For reference, this is the steps I took when upgrading from k8s 1.19 to k8s + # 1.22, April 29th 2022. + # + # 1. Updated the version field in this config from 1.19 to 1.20 + # + # - It is not allowed to upgrade the control plane more than one minor at the time + # + # 2. Upgraded the control plane (takes ~10 minutes) + # + # eksctl upgrade cluster --config-file eksctl-cluster-config.yaml --approve + # + # 2. Deleted all non-core nodegroups + # + # eksctl delete nodegroup --config-file=eksctl-cluster-config.yaml --include "user-*,worker-*" --approve + # + # 3. Updated the version field in this config from 1.20 to 1.22 + # + # - It is allowed to have a nodegroup +-2 minors away from the control plan version + # + # 4. Created a new core nodepool (core-b) + # + # eksctl create nodegroup --config-file=eksctl-cluster-config.yaml --include "core-b" --install-nvidia-plugin=false + # + # 5. Deleted the old core nodepool (core-a) + # + # eksctl delete nodegroup --config-file=eksctl-cluster-config.yaml --include "core-a" --approve + # + # 6. 
Upgraded add-ons (takes ~3*5s) + # + # eksctl utils update-kube-proxy --cluster=jmte --approve + # eksctl utils update-aws-node --cluster=jmte --approve + # eksctl utils update-coredns --cluster=jmte --approve + # + # 7. Update the version field in this config from 1.22 to 1.21 + # + # 8. Upgraded the control plane, as in step 2. + # + # 9. Upgraded add-ons, as in step 6. + # + # A. Update the version field in this config from 1.21 to 1.22 + # + # B. Upgraded the control plane, as in step 2. + # + # C. Upgraded add-ons, as in step 6. + # + # D. Recreated all nodegroups + # + # eksctl create nodegroup --config-file=eksctl-cluster-config.yaml --include "*" --install-nvidia-plugin=false + # + version: "1.22" tags: 2i2c.org/project: jmte @@ -104,7 +158,7 @@ iam: # you have run into a quota issue. Following that, you make a request to AWS using provided link: https://aws.amazon.com/contact-us/ec2-request # nodeGroups: - - name: core-a + - name: core-b availabilityZones: [us-west-2d] # aws ec2 describe-availability-zones --region instanceType: m5.large # 28 pods, 2 cpu, 8 GB minSize: 0 @@ -248,12 +302,8 @@ nodeGroups: # operator: Equal # value: user # - # Sadly, something is making this change reset. I don't know why, but I - # suspect it happens whenever I do something with eksctl - perhaps whenever I - # do something with the nodegroup realted to GPU nodes. I think it resets - # whenever a GPU based nodegroup is created unless - # --install-nvidia-plugin=false it passed to the `eksctl create nodegroup` - # command. + # WARNING: If you create any nodegroup without --install-nvidia-plugin=false, + # the daemonset will reset and this change will be lost. # # It seems I may need to specify additional tags also, with associated value # for the GPU of choice: From c7619edf3d19b6a77b14863ee367b4543ef63ae7 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Wed, 8 Jun 2022 19:28:50 +0200 Subject: [PATCH 23/43] jmte: adjust dask-gateway config options --- config/clusters/jmte/common.values.yaml | 71 ++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 2 deletions(-) diff --git a/config/clusters/jmte/common.values.yaml b/config/clusters/jmte/common.values.yaml index 1aa0faf34f..505b18e6c5 100644 --- a/config/clusters/jmte/common.values.yaml +++ b/config/clusters/jmte/common.values.yaml @@ -292,14 +292,81 @@ dask-gateway: # serviceAccountName is adjusted by staging/prod values # serviceAccountName: *user-sa + # Note that we are overriding options provided in 2i2c's helm chart that has + # default values for these config entries. + # extraConfig: + # This configuration represents options that can be presented to users + # that want to create a Dask cluster using dask-gateway. For more + # details, see https://gateway.dask.org/cluster-options.html + # + # The goal is to provide a simple configuration that allow the user some + # flexibility while also fitting well well on AWS nodes that are all + # having 1:4 ratio between CPU and GB of memory. By providing the + # username label, we help administrators to track user pods. 
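      # For reference, a user server typically consumes these options roughly
      # like this (a sketch only; the option names mirror the Select, String,
      # and Mapping fields defined in option_handler below, and the gateway
      # address is pre-configured by the daskhub chart):
      #
      #   from dask_gateway import Gateway
      #
      #   gateway = Gateway()
      #   options = gateway.cluster_options()       # fetches the options below
      #   options.worker_specification = "4CPU, 16GB"
      #   cluster = gateway.new_cluster(options)    # creates the scheduler pod
      #   cluster.scale(4)                          # request 4 worker pods
      #   client = cluster.get_client()             # distributed.Client
      #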
+ option_handler: | + from dask_gateway_server.options import Options, Select, String, Mapping + def cluster_options(user): + def option_handler(options): + if ":" not in options.image: + raise ValueError("When specifying an image you must also provide a tag") + + extra_labels = {} + extra_annotations = { + "prometheus.io/scrape": "true", + "prometheus.io/port": "8787", + } + chosen_worker_cpu = int(options.worker_specification.split("CPU")[0]) + chosen_worker_memory = 4 * chosen_worker_cpu + + # We multiply the requests by a fraction to ensure that the + # worker fit well within a node that need some resources + # reserved for system pods. + return { + # A default image is suggested via DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable + "image": options.image, + "scheduler_extra_pod_labels": extra_labels, + "scheduler_extra_pod_annotations": extra_annotations, + "worker_extra_pod_labels": extra_labels, + "worker_extra_pod_annotations": extra_annotations, + "worker_cores": 0.85 * chosen_worker_cpu, + "worker_cores_limit": chosen_worker_cpu, + "worker_memory": "%fG" % (0.85 * chosen_worker_memory), + "worker_memory_limit": "%fG" % chosen_worker_memory, + "environment": options.environment, + } + return Options( + Select( + "worker_specification", + [ + "1CPU, 4GB", + "2CPU, 8GB", + "4CPU, 16GB", + "8CPU, 32GB", + "16CPU, 64GB", + "32CPU, 128GB", + "64CPU, 256GB", + ], + default="1CPU, 4GB", + label="Worker specification", + ), + # The default image is set via DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable + String("image", label="Image"), + Mapping("environment", {}, label="Environment variables"), + handler=option_handler, + ) + c.Backend.cluster_options = cluster_options idle: | # timeout after 30 minutes of inactivity c.KubeClusterConfig.idle_timeout = 1800 limits: | # per Dask cluster limits. - c.ClusterConfig.cluster_max_cores = 256 - c.ClusterConfig.cluster_max_memory = "1028G" + # + # Limits removed for JMTE as I think they could hamper Shane Griggsby's + # work with powerful dask clusters. + # + # c.ClusterConfig.cluster_max_cores = 256 + # c.ClusterConfig.cluster_max_memory = "1028G" # jupyterhub-ssh values.yaml reference: # https://github.com/yuvipanda/jupyterhub-ssh/blob/main/helm-chart/jupyterhub-ssh/values.yaml From 93fd326dc05c627548b294eefeed123996e569b8 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Thu, 16 Jun 2022 09:42:33 +0200 Subject: [PATCH 24/43] jmte: update to gh-scoped-creds --- config/clusters/jmte/common.values.yaml | 30 +++++++++---------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/config/clusters/jmte/common.values.yaml b/config/clusters/jmte/common.values.yaml index 505b18e6c5..c2abb33a43 100644 --- a/config/clusters/jmte/common.values.yaml +++ b/config/clusters/jmte/common.values.yaml @@ -51,20 +51,6 @@ basehub: singleuser: # extraFiles ref: https://zero-to-jupyterhub.readthedocs.io/en/latest/resources/reference.html#singleuser-extrafiles extraFiles: - # github-app-user-auth requires: - # - Installed python package - # - GITHUB_APP_CLIENT_ID environment set - # - This configuration - # - # NOTE: an associated GitHub App has been created by Erik Sundell - # aka. 
@consideRatio and can be configured by him at: - # https://github.com/settings/apps/hub-jupytearth-org-github-integ - # - gitconfig: - mountPath: /etc/gitconfig - stringData: | - [credential] - helper = store --file=/tmp/github-app-git-credentials jupyter_notebook_config.json: mountPath: /etc/jupyter/jupyter_notebook_config.json data: @@ -84,11 +70,17 @@ basehub: startTimeout: 900 extraEnv: - # GITHUB_APP_CLIENT_ID, see notes in singleuser.extraFiles about - # this environment variable. Two entries are created as I think - # the shorter may be deprecated soon. - GITHUB_APP_CLIENT_ID: Iv1.a073b1649637af12 - GITHUB_APP_USER_AUTH_CLIENT_ID: Iv1.a073b1649637af12 + # github-app-user-auth requires: + # - Installed python package + # - GH_SCOPED_CREDS_APP_URL env var set + # - GITHUB_APP_CLIENT_ID env var set + # + # NOTE: an associated GitHub App has been created by Erik Sundell aka. + # @consideRatio and can be configured by him at: + # https://github.com/settings/apps/hub-jupytearth-org-github-integ + # + GH_SCOPED_CREDS_APP_URL: https://github.com/apps/hub-jupytearth-org-github-integ + GH_SCOPED_CREDS_CLIENT_ID: Iv1.a073b1649637af12 image: # NOTE: We use the jupyterhub-configurator so this image/tag is not From 10122167cb746f0be0f662ecc4fc24396d060bbc Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Thu, 16 Jun 2022 10:51:01 +0200 Subject: [PATCH 25/43] jmte: configure 1 small node placeholder pod --- config/clusters/jmte/prod.values.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/config/clusters/jmte/prod.values.yaml b/config/clusters/jmte/prod.values.yaml index 9ff43eca6a..66a2fce789 100644 --- a/config/clusters/jmte/prod.values.yaml +++ b/config/clusters/jmte/prod.values.yaml @@ -3,6 +3,15 @@ basehub: enabled: true jupyterhub: + scheduling: + userPlaceholder: + enabled: true + replicas: 1 + resources: + requests: + cpu: 2.5 + memory: 14G + singleuser: # Eksctl: The service account was created by eksctl. # From 5b74730869afa0075425641b842d2dc7a3987582 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Thu, 16 Jun 2022 11:01:15 +0200 Subject: [PATCH 26/43] jmte: Re-enable the continuous image puller --- config/clusters/jmte/prod.values.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/config/clusters/jmte/prod.values.yaml b/config/clusters/jmte/prod.values.yaml index 66a2fce789..30fc22bace 100644 --- a/config/clusters/jmte/prod.values.yaml +++ b/config/clusters/jmte/prod.values.yaml @@ -3,6 +3,13 @@ basehub: enabled: true jupyterhub: + # Reverts changes in basehub configuration to the z2jh defaults and ensures + # 1 pod is used as a placeholder pod, sized as the smallest node in the JMTE + # cluster. + # + prePuller: + continuous: + enabled: true scheduling: userPlaceholder: enabled: true From 2c93d9e571e420427a6741ad3cf16e039e4696c4 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Sun, 3 Jul 2022 16:52:40 +0200 Subject: [PATCH 27/43] jmte: configure nodeSelector for userPlaceholder pod --- config/clusters/jmte/prod.values.yaml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/config/clusters/jmte/prod.values.yaml b/config/clusters/jmte/prod.values.yaml index 30fc22bace..baebefde14 100644 --- a/config/clusters/jmte/prod.values.yaml +++ b/config/clusters/jmte/prod.values.yaml @@ -5,7 +5,8 @@ basehub: jupyterhub: # Reverts changes in basehub configuration to the z2jh defaults and ensures # 1 pod is used as a placeholder pod, sized as the smallest node in the JMTE - # cluster. + # cluster. 
We also update singleuser.nodeSelector to ensure we default to
+  # have a placeholder for the smallest nodes only.
   #
   prePuller:
     continuous:
       enabled: true
@@ -20,6 +21,13 @@ basehub:
         memory: 14G
 
     singleuser:
+      # This default value will be relevant for the userPlaceholder
+      # configuration, but irrelevant for the defaults we override in our
+      # profileList configuration.
+      #
+      nodeSelector:
+        2i2c.org/node-cpu: "4"
+
       # Eksctl: The service account was created by eksctl.
       #
       serviceAccountName: &user-sa s3-full-access

From 877419ce7c2a6661d06fa5f5f46101f679329945 Mon Sep 17 00:00:00 2001
From: Erik Sundell
Date: Sun, 3 Jul 2022 17:17:34 +0200
Subject: [PATCH 28/43] jmte: add env for the RDS db setup for jmte

---
 config/clusters/jmte/prod.values.yaml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/config/clusters/jmte/prod.values.yaml b/config/clusters/jmte/prod.values.yaml
index baebefde14..d2dae9ad49 100644
--- a/config/clusters/jmte/prod.values.yaml
+++ b/config/clusters/jmte/prod.values.yaml
@@ -43,6 +43,17 @@ basehub:
           #
           SCRATCH_BUCKET: s3://jmte-scratch/$(JUPYTERHUB_USER)
           PANGEO_SCRATCH: s3://jmte-scratch/$(JUPYTERHUB_USER)
+          # An Amazon RDS postgresql 14 database server has been set up on a
+          # machine with 4 cores and 32 GB memory. See
+          # https://us-west-2.console.aws.amazon.com/rds/home?region=us-west-2#modify-instance:id=jmte-db
+          #
+          # I created a postgresql user and database for use by some users like this:
+          #
+          #   CREATE USER proj WITH ENCRYPTED PASSWORD '***';
+          #   CREATE DATABASE proj;
+          #   GRANT ALL PRIVILEGES ON DATABASE proj TO proj;
+          #
+          JMTE_DB_HOST: jmte-db.cqf1ngjal8bq.us-west-2.rds.amazonaws.com
 
           initContainers:
             # Need to explicitly fix ownership here, since EFS doesn't do anonuid

From febbd095b6cdbd2ffd8df54903cc3a794c370593 Mon Sep 17 00:00:00 2001
From: Erik Sundell
Date: Sat, 16 Jul 2022 15:45:21 +0200
Subject: [PATCH 29/43] jmte: ensure CUDA drivers propagate to containers

---
 config/clusters/jmte/common.values.yaml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/config/clusters/jmte/common.values.yaml b/config/clusters/jmte/common.values.yaml
index c2abb33a43..baa7d097ea 100644
--- a/config/clusters/jmte/common.values.yaml
+++ b/config/clusters/jmte/common.values.yaml
@@ -81,6 +81,15 @@ basehub:
         #
         GH_SCOPED_CREDS_APP_URL: https://github.com/apps/hub-jupytearth-org-github-integ
         GH_SCOPED_CREDS_CLIENT_ID: Iv1.a073b1649637af12
+        # NVIDIA_DRIVER_CAPABILITIES is added based on
+        # https://github.com/2i2c-org/infrastructure/pull/1314 and
+        # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/user-guide.html#driver-capabilities
+        # that indicate this is needed.
+        #
+        # It was added when `nvidia-smi` didn't report a CUDA driver version,
+        # and no /usr/local/cuda folders were found in the container filesystem.
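+        # A hedged way to double check the effect from inside a user server
+        # (exact output depends on the node's NVIDIA driver version):
+        #
+        #   nvidia-smi --query-gpu=name,driver_version --format=csv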
+ # + NVIDIA_DRIVER_CAPABILITIES: compute,utility image: # NOTE: We use the jupyterhub-configurator so this image/tag is not From 18f0c825dc0166b4ebb116a55a2752a1f3f7bbab Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Sat, 16 Jul 2022 15:45:39 +0200 Subject: [PATCH 30/43] jmte: add debugging pod manifest for node fs --- debug-pod-node-fs.yml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 debug-pod-node-fs.yml diff --git a/debug-pod-node-fs.yml b/debug-pod-node-fs.yml new file mode 100644 index 0000000000..15d69ce045 --- /dev/null +++ b/debug-pod-node-fs.yml @@ -0,0 +1,32 @@ +apiVersion: v1 +kind: Pod +metadata: + name: node-fs-inspection +spec: + containers: + - name: node-fs-inspection + image: ubuntu:22.04 + command: ["sh", "-c", "sleep infinity"] + resources: + requests: + cpu: 225m + memory: "939524096" + volumeMounts: + - name: node-root-fs + mountPath: /node-root-fs + terminationGracePeriodSeconds: 1 + nodeSelector: + 2i2c.org/node-gpu: "1" + tolerations: + - effect: NoSchedule + key: hub.jupyter.org/dedicated + operator: Equal + value: user + - effect: NoSchedule + key: hub.jupyter.org_dedicated + operator: Equal + value: user + volumes: + - name: node-root-fs + hostPath: + path: / From b7323f647769cbafa1fbc6500cd7664a74d6308f Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 18 Jul 2022 22:08:12 +0200 Subject: [PATCH 31/43] jmte: add k8s memory based emptyDir volume mount to /dev/shm --- config/clusters/jmte/common.values.yaml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/config/clusters/jmte/common.values.yaml b/config/clusters/jmte/common.values.yaml index baa7d097ea..d91be0c245 100644 --- a/config/clusters/jmte/common.values.yaml +++ b/config/clusters/jmte/common.values.yaml @@ -58,6 +58,30 @@ basehub: # https://github.com/berkeley-dsep-infra/datahub/issues/3160 ContentsManager: allow_hidden: true + # /dev/shm is mounted as a filesystem path, where writing to it means to + # write to memory. + # + # How to: https://stackoverflow.com/questions/46085748/define-size-for-dev-shm-on-container-engine/46434614#46434614 + # Request for this by Ellie: https://fperezgroup.slack.com/archives/C020XCEFPEH/p1658168872788389 + # + storage: + extraVolumes: + - name: dev-shm + emptyDir: + medium: Memory + extraVolumeMounts: + - name: dev-shm + mountPath: /dev/shm + # FIXME: we override the list extraVolumeMounts which is also set in + # the the basehub chart, due to that, we need to add this here + # as well. An option is to add hub.extraConfig entries that + # append the kubespawner configuration to include these extra + # volume mounts. + # + - name: home + mountPath: /home/jovyan/shared + subPath: _shared + readOnly: true # Eksctl: The service account was created by eksctl. # From 3e04accffe686bc40eb56ba6e0e78d2d436dd268 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 1 Aug 2022 23:34:40 +0200 Subject: [PATCH 32/43] jmte: increase spawn timeout as eks slowly pulls images --- config/clusters/jmte/common.values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/clusters/jmte/common.values.yaml b/config/clusters/jmte/common.values.yaml index d91be0c245..efbc4b5443 100644 --- a/config/clusters/jmte/common.values.yaml +++ b/config/clusters/jmte/common.values.yaml @@ -91,7 +91,7 @@ basehub: # Increased as we have experienced a too slow image pull at least # once. Our pods can take ~6-7 minutes to start on a new node it # seems, so this gives us some margin. 
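      # A hedged way to check how long pulls actually take on a fresh node,
      # assuming kubectl access to the prod namespace:
      #
      #   kubectl -n prod get events --field-selector reason=Pulled \
      #     --sort-by=.lastTimestamp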
- startTimeout: 900 + startTimeout: 1200 extraEnv: # github-app-user-auth requires: From 072d093274348a2b73f020b3b73a2c0aafe89954 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 5 Sep 2022 17:30:18 +0200 Subject: [PATCH 33/43] jmte: add another availability zone for highmem instances --- eksctl/eksctl-cluster-config.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/eksctl/eksctl-cluster-config.yaml b/eksctl/eksctl-cluster-config.yaml index 24e788b9fc..98e43cd831 100644 --- a/eksctl/eksctl-cluster-config.yaml +++ b/eksctl/eksctl-cluster-config.yaml @@ -317,12 +317,19 @@ nodeGroups: # Note that we opted for us-west-2b here because g4dn machines were not # available in us-west-2d. # + # aws ec2 describe-instance-type-offerings \ + # --region us-west-2 \ + # --filter Name=instance-type,Values=g4dn.xlarge \ + # --location-type=availability-zone + # # 57 pods, 4 cpu, 16 GB (Intel, 25 GBits network), 1 T4 Tensor Core GPU - name: user-gpu-a-4 - availabilityZones: &user-gpu-availabilityZones [us-west-2b] + availabilityZones: &user-gpu-availabilityZones [us-west-2a, us-west-2b] instanceType: g4dn.xlarge minSize: *user-minSize - maxSize: *user-maxSize + # maxSize increased to accommodate request by Facu that a workshop is to + # support 8 simultaneous users with GPU servers. + maxSize: 10 desiredCapacity: *user-desiredCapacity volumeSize: *user-volumeSize labels: From 2843ee5f12007aef3655c4d36ce27f32fb4bdc7d Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 5 Sep 2022 18:32:17 +0200 Subject: [PATCH 34/43] jmte: add shared-public folder --- config/clusters/jmte/common.values.yaml | 3 +++ config/clusters/jmte/prod.values.yaml | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/config/clusters/jmte/common.values.yaml b/config/clusters/jmte/common.values.yaml index efbc4b5443..e0584a74ff 100644 --- a/config/clusters/jmte/common.values.yaml +++ b/config/clusters/jmte/common.values.yaml @@ -82,6 +82,9 @@ basehub: mountPath: /home/jovyan/shared subPath: _shared readOnly: true + - name: home + mountPath: /home/jovyan/shared-public + subPath: _shared_public # Eksctl: The service account was created by eksctl. 
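      # A hedged sketch of how such a service account is typically created with
      # eksctl (the policy ARN below is an assumption, not necessarily what was
      # used for s3-full-access):
      #
      #   eksctl create iamserviceaccount --cluster=jmte --namespace=prod \
      #     --name=s3-full-access \
      #     --attach-policy-arn=arn:aws:iam::aws:policy/AmazonS3FullAccess \
      #     --approve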
# diff --git a/config/clusters/jmte/prod.values.yaml b/config/clusters/jmte/prod.values.yaml index d2dae9ad49..d09ffe8c51 100644 --- a/config/clusters/jmte/prod.values.yaml +++ b/config/clusters/jmte/prod.values.yaml @@ -63,7 +63,7 @@ basehub: [ "sh", "-c", - "id && chown 1000:1000 /home/jovyan /home/jovyan/shared && ls -lhd /home/jovyan", + "id && chown 1000:1000 /home/jovyan /home/jovyan/shared /home/jovyan/shared-public && ls -lhd /home/jovyan", ] securityContext: runAsUser: 0 @@ -74,6 +74,9 @@ basehub: - name: home mountPath: /home/jovyan/shared subPath: _shared + - name: home + mountPath: /home/jovyan/shared-public + subPath: _shared_public proxy: traefik: From 923475f507b54be28b90022ddc0b01e29ef805aa Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Thu, 22 Sep 2022 22:49:23 +0200 Subject: [PATCH 35/43] Slim down the debug-pod --- debug-pod.yaml | 54 +++++++++++--------------------------------------- 1 file changed, 12 insertions(+), 42 deletions(-) diff --git a/debug-pod.yaml b/debug-pod.yaml index 53a6d76a7d..6fe9c5cc54 100644 --- a/debug-pod.yaml +++ b/debug-pod.yaml @@ -1,19 +1,7 @@ apiVersion: v1 kind: Pod metadata: - annotations: - hub.jupyter.org/username: fperez - labels: - app: jupyterhub - chart: jupyterhub-1.1.1 - component: singleuser-server - heritage: jupyterhub - hub.jupyter.org/network-access-hub: "true" - hub.jupyter.org/network-access-proxy-http: "true" - hub.jupyter.org/servername: "" - hub.jupyter.org/username: fperez - release: prod - name: jupyter-fperez-debugging + name: jupyter-debugging namespace: prod spec: affinity: @@ -27,49 +15,31 @@ spec: - user weight: 100 containers: - - args: - - jupyterhub-singleuser - - --ip=0.0.0.0 - - --port=8888 - - --SingleUserNotebookApp.default_url=/lab - image: 286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env:c6d9558 - name: notebook - ports: - - containerPort: 8888 - name: notebook-port - protocol: TCP - resources: - requests: - cpu: 225m - memory: "939524096" - initContainers: - - command: - - sh - - -c - - id && chown 1000:1000 /home/jovyan /home/jovyan/shared && ls -lhd /home/jovyan & sleep infinity + - name: busybox image: busybox - imagePullPolicy: Always - name: volume-mount-ownership-fix + command: ["sh", "-c", "sleep infinity"] securityContext: runAsUser: 0 volumeMounts: + - mountPath: /nfs + name: home - mountPath: /home/jovyan name: home subPath: fperez - mountPath: /home/jovyan/shared name: home subPath: _shared + - mountPath: /home/jovyan/shared-public + name: home + subPath: _shared_public + resources: + requests: + cpu: 225m + memory: "939524096" nodeSelector: 2i2c.org/node-cpu: "4" - priority: 0 - priorityClassName: prod-default-priority - restartPolicy: OnFailure schedulerName: prod-user-scheduler - securityContext: - fsGroup: 100 - serviceAccount: s3-full-access serviceAccountName: s3-full-access - terminationGracePeriodSeconds: 30 tolerations: - effect: NoSchedule key: hub.jupyter.org/dedicated From 9d1f700d5aac12a3980ab67dec9b9110278e8fb0 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Thu, 22 Sep 2022 22:53:25 +0200 Subject: [PATCH 36/43] jmte: add test of latest image --- config/clusters/jmte/common.values.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/config/clusters/jmte/common.values.yaml b/config/clusters/jmte/common.values.yaml index e0584a74ff..710069a639 100644 --- a/config/clusters/jmte/common.values.yaml +++ b/config/clusters/jmte/common.values.yaml @@ -192,6 +192,16 @@ basehub: node_selector: { 2i2c.org/node-cpu: "16", 2i2c.org/node-gpu: "1" } 
extra_resource_limits: nvidia.com/gpu: "1" + - display_name: "16th of Medium: 0.25-4 CPU, 1-16 GB - Test of latest image" + description: "Helps us test an image before we make it the default" + kubespawner_override: + image: 286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env:latest + image_pull_policy: Always + cpu_guarantee: 0.225 + mem_guarantee: 0.875G + mem_limit: null + node_selector: { 2i2c.org/node-cpu: "4" } + extra_resource_limits: {} proxy: # proxy notes: From 4291da1c06afc01302219f76373b06f4b0a0667a Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 12 Dec 2022 10:47:30 +0100 Subject: [PATCH 37/43] jmte: add 256GB memory GPU node --- config/clusters/jmte/common.values.yaml | 8 +++++++ eksctl/eksctl-cluster-config.yaml | 28 +++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/config/clusters/jmte/common.values.yaml b/config/clusters/jmte/common.values.yaml index 710069a639..fd2f539848 100644 --- a/config/clusters/jmte/common.values.yaml +++ b/config/clusters/jmte/common.values.yaml @@ -192,6 +192,14 @@ basehub: node_selector: { 2i2c.org/node-cpu: "16", 2i2c.org/node-gpu: "1" } extra_resource_limits: nvidia.com/gpu: "1" + - display_name: "Massive GPU: 64 CPU, 256 GB, 1 T4 Tensor Core GPU" + description: "A dedicated machine for you with one GPU attached." + kubespawner_override: + mem_guarantee: 224G + mem_limit: null + node_selector: { 2i2c.org/node-cpu: "64", 2i2c.org/node-gpu: "1" } + extra_resource_limits: + nvidia.com/gpu: "1" - display_name: "16th of Medium: 0.25-4 CPU, 1-16 GB - Test of latest image" description: "Helps us test an image before we make it the default" kubespawner_override: diff --git a/eksctl/eksctl-cluster-config.yaml b/eksctl/eksctl-cluster-config.yaml index 98e43cd831..a9d6678b1f 100644 --- a/eksctl/eksctl-cluster-config.yaml +++ b/eksctl/eksctl-cluster-config.yaml @@ -322,7 +322,7 @@ nodeGroups: # --filter Name=instance-type,Values=g4dn.xlarge \ # --location-type=availability-zone # - # 57 pods, 4 cpu, 16 GB (Intel, 25 GBits network), 1 T4 Tensor Core GPU + # 28 pods, 4 cpu, 16 GB (Intel, 25 GBits network), 1 T4 Tensor Core GPU - name: user-gpu-a-4 availabilityZones: &user-gpu-availabilityZones [us-west-2a, us-west-2b] instanceType: g4dn.xlarge @@ -352,7 +352,7 @@ nodeGroups: autoScaler: true efs: true - # 233 pods, 16 cpu, 64 GB (Intel, 25 GBits network), 1 T4 Tensor Core GPU + # 28 pods, 16 cpu, 64 GB (Intel, 25 GBits network), 1 T4 Tensor Core GPU - name: user-gpu-a-16 availabilityZones: *user-gpu-availabilityZones instanceType: g4dn.4xlarge @@ -376,6 +376,30 @@ nodeGroups: autoScaler: true efs: true + # 57 pods, 64 cpu, 256 GB (Intel, 50 GBits network), 1 T4 Tensor Core GPU + - name: user-gpu-a-64 + availabilityZones: *user-gpu-availabilityZones + instanceType: g4dn.16xlarge + minSize: *user-minSize + maxSize: *user-maxSize + desiredCapacity: *user-desiredCapacity + volumeSize: *user-volumeSize + labels: + hub.jupyter.org/node-purpose: user + 2i2c.org/node-cpu: "64" + 2i2c.org/node-gpu: "1" + taints: + hub.jupyter.org_dedicated: user:NoSchedule + tags: + k8s.io/cluster-autoscaler/node-template/label/hub.jupyter.org/node-purpose: user + k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-cpu: "64" + k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-gpu: "1" + k8s.io/cluster-autoscaler/node-template/taint/hub.jupyter.org_dedicated: user:NoSchedule + iam: &user-iam + withAddonPolicies: + autoScaler: true + efs: true + # Worker node pools using cheaper spot instances that are temporary. 
# # References: From 8927e310bc3312758c1e00ab98a34507994a725f Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Sun, 18 Dec 2022 15:30:32 +0100 Subject: [PATCH 38/43] jmte: update k8s from 1.22 to 1.24 --- eksctl/eksctl-cluster-config.yaml | 131 ++++++++++++++++++++++++++---- 1 file changed, 116 insertions(+), 15 deletions(-) diff --git a/eksctl/eksctl-cluster-config.yaml b/eksctl/eksctl-cluster-config.yaml index a9d6678b1f..f66e5d1547 100644 --- a/eksctl/eksctl-cluster-config.yaml +++ b/eksctl/eksctl-cluster-config.yaml @@ -2,6 +2,10 @@ # by the cluster. # ref: https://eksctl.io/usage/schema/ # +# Get cluster credentials: +# +# eksctl utils write-kubeconfig --cluster=jmte +# # Cluster operations: # ref: https://eksctl.io/usage/cluster-upgrade/ # @@ -51,45 +55,138 @@ metadata: # # eksctl upgrade cluster --config-file eksctl-cluster-config.yaml --approve # - # 2. Deleted all non-core nodegroups + # 3. Deleted all non-core nodegroups # # eksctl delete nodegroup --config-file=eksctl-cluster-config.yaml --include "user-*,worker-*" --approve # - # 3. Updated the version field in this config from 1.20 to 1.22 + # 4. Updated the version field in this config from 1.20 to 1.22 # # - It is allowed to have a nodegroup +-2 minors away from the control plan version # - # 4. Created a new core nodepool (core-b) + # 5. Created a new core nodepool (core-b) # # eksctl create nodegroup --config-file=eksctl-cluster-config.yaml --include "core-b" --install-nvidia-plugin=false # - # 5. Deleted the old core nodepool (core-a) + # 6. Deleted the old core nodepool (core-a) # - # eksctl delete nodegroup --config-file=eksctl-cluster-config.yaml --include "core-a" --approve + # eksctl delete nodegroup --config-file=eksctl-cluster-config.yaml --include "core-b" --approve # - # 6. Upgraded add-ons (takes ~3*5s) + # 7. Upgraded add-ons (takes ~3*5s) # # eksctl utils update-kube-proxy --cluster=jmte --approve # eksctl utils update-aws-node --cluster=jmte --approve # eksctl utils update-coredns --cluster=jmte --approve # - # 7. Update the version field in this config from 1.22 to 1.21 + # 8. Update the version field in this config from 1.22 to 1.21 # - # 8. Upgraded the control plane, as in step 2. + # 9. Upgraded the control plane, as in step 2. # - # 9. Upgraded add-ons, as in step 6. + # A. Upgraded add-ons, as in step 7. # - # A. Update the version field in this config from 1.21 to 1.22 + # B. Update the version field in this config from 1.21 to 1.22 # - # B. Upgraded the control plane, as in step 2. + # C. Upgraded the control plane, as in step 2. # - # C. Upgraded add-ons, as in step 6. + # D. Upgraded add-ons, as in step 7. # - # D. Recreated all nodegroups + # E. Recreated all nodegroups # # eksctl create nodegroup --config-file=eksctl-cluster-config.yaml --include "*" --install-nvidia-plugin=false # - version: "1.22" + # For reference, this is the steps I took when upgrading from k8s 1.22 to k8s + # 1.24, Dec 18th 2022. + # + # 1. Performed step 1-7 from above to, but migrated control plane from 1.22 to + # 1.23 and node groups from 1.22 to 1.24. + # + # 2. When performing step 7: + # + # - the aws-node daemonset's pods failed to start because of a too + # restrictive container securityContext not running as root. + # - the kube-proxy deamonset's pods failed to pull the image, it was not + # found. 
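+  #
+  #      (As an aside, the rollout state of these daemonsets can be inspected
+  #      with commands like
+  #      "kubectl -n kube-system rollout status daemonset aws-node" and
+  #      "kubectl -n kube-system get pods -l k8s-app=kube-proxy".)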
+ # + # I patched the aws-node thing now, but went ahead with the upgrade to k8s + # 1.24 in the control plane, hoping another `eksctl utils update-aws-node` + # and `eksctl utils update-kube-proxy` would resolve the issues. + # + # Later I concluded the following: + # + # - aws-node issue: https://github.com/weaveworks/eksctl/issues/6048. + # Resolved by removing `runAsNonRoot: true` and + # `allowPrivilegeEscalation: false`. + # - kube-proxy issue: it went away when upgrading the plugin in 1.24 + # - the cluster-autoscaler failed to start initially, but made it in the + # end when other pods got running. + # + # 3. I upgraded the control plan to 1.24 (step 2 above) and re-upgraded add-ons + # (step 7 above). + # + # 4. I recreated all node groups as in step E above. + # + # 5. My hub pod entered a pending state because + # + # - 1 node(s) had no available volume zone + # - I think this is the issue: + # https://docs.aws.amazon.com/eks/latest/userguide/ebs-csi.html, I + # upgraded from v1.22 to v1.23+ without manually activating the plugin + # mentioned there. + # - Looking at + # https://docs.aws.amazon.com/eks/latest/userguide/managing-ebs-csi.html + # and running the command below, I conclude it was not active in my + # cluster. + # + # 6. (what I should have done) Getting ebs-csi-driver setup: + # + # What I think should have been done is to: + # + # 1. Ensure a service account was setup via this config: + # https://eksctl.io/usage/schema/#iam-serviceAccounts-wellKnownPolicies-ebsCSIController + # 2. Ensure that the addon was setup via this config: + # https://eksctl.io/usage/schema/#addons-wellKnownPolicies-ebsCSIController + # 3. Ensure that the node pools using ebs storage (core) was configured to use this: + # https://eksctl.io/usage/schema/#nodeGroups-iam-withAddonPolicies-ebs + # + # 6. (what I actually did) Getting ebs-csi-driver setup: + # + # I read the following instructions: https://docs.aws.amazon.com/eks/latest/userguide/managing-ebs-csi.html#adding-ebs-csi-eks-add-on + # + # I did pre-requisites to setup permissions via: https://docs.aws.amazon.com/eks/latest/userguide/csi-iam-role.html + # + # UPDATE: I think this pre-requites step could be done via this config instead: + # https://eksctl.io/usage/schema/#iam-serviceAccounts-wellKnownPolicies-ebsCSIController + # + # eksctl get addon --name aws-ebs-csi-driver --cluster=jmte + # + # eksctl create iamserviceaccount \ + # --name=ebs-csi-controller-sa \ + # --namespace=kube-system \ + # --cluster=jmte \ + # --attach-policy-arn=arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy \ + # --approve \ + # --role-only \ + # --role-name=AmazonEKS_EBS_CSI_DriverRole + # + # I verified I didn't have a EBS driver installed already: + # + # eksctl get addon --name=aws-ebs-csi-driver --cluster=jmte + # + # I added the ebs driver addon: + # + # UPDATE: I think this main step could be done via this config instead: + # https://eksctl.io/usage/schema/#addons-wellKnownPolicies-ebsCSIController + # + # eksctl create addon --name=aws-ebs-csi-driver --cluster=jmte --service-account-role-arn=arn:aws:iam::286354552638:role/AmazonEKS_EBS_CSI_DriverRole --force + # + # The hub pod that mounted a PVC with ebs storage and got "1 node(s) had no + # available volume zone" was suddenly scheduled successfully! + # + # I think maybe we could manage to setup eksctl clusters to directly have + # this plugin via this config. For now, this was done with manual patches + # though. 
+ # + + version: "1.24" tags: 2i2c.org/project: jmte @@ -158,7 +255,7 @@ iam: # you have run into a quota issue. Following that, you make a request to AWS using provided link: https://aws.amazon.com/contact-us/ec2-request # nodeGroups: - - name: core-b + - name: core-a availabilityZones: [us-west-2d] # aws ec2 describe-availability-zones --region instanceType: m5.large # 28 pods, 2 cpu, 8 GB minSize: 0 @@ -172,6 +269,10 @@ nodeGroups: iam: withAddonPolicies: autoScaler: true + # ebs: I'm not sure if this was needed because I added it before adding + # the ebs csi driver which was absolutely needed. Maybe this and + # the driver was needed. + ebs: true efs: true # 57 pods, 4 cpu, 16 GB (Intel, 10 GBits network) From 1217e0a914e46c95b1692c3ceaf23cd64ad25373 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Sun, 18 Dec 2022 15:56:51 +0100 Subject: [PATCH 39/43] basehub: add nfs.homeSpaceReporter.enabled --- helm-charts/basehub/templates/home-space-reporter.yaml | 4 +++- helm-charts/basehub/values.schema.yaml | 9 +++++++++ helm-charts/basehub/values.yaml | 2 ++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/helm-charts/basehub/templates/home-space-reporter.yaml b/helm-charts/basehub/templates/home-space-reporter.yaml index fcac3f453c..f7db2fad70 100644 --- a/helm-charts/basehub/templates/home-space-reporter.yaml +++ b/helm-charts/basehub/templates/home-space-reporter.yaml @@ -1,6 +1,7 @@ # Deploy a prometheus node_exporter with the same home directory # we have for our hub mounted so we can monitor free space usage. -{{- if or .Values.nfs.enabled .Values.azureFile.enabled }} +{{- if .Values.nfs.homeSpaceReporter.enabled -}} +{{- if or .Values.nfs.enabled .Values.azureFile.enabled -}} apiVersion: apps/v1 kind: Deployment metadata: @@ -59,3 +60,4 @@ spec: claimName: home-nfs {{- end }} {{- end }} +{{- end }} diff --git a/helm-charts/basehub/values.schema.yaml b/helm-charts/basehub/values.schema.yaml index 71e708bc09..bf8b01322d 100644 --- a/helm-charts/basehub/values.schema.yaml +++ b/helm-charts/basehub/values.schema.yaml @@ -168,6 +168,7 @@ properties: required: - enabled - shareCreator + - homeSpaceReporter - pv properties: enabled: @@ -186,6 +187,14 @@ properties: items: type: object additionalProperties: true + homeSpaceReporter: + type: object + additionalProperties: false + required: + - enabled + properties: + enabled: + type: boolean pv: type: object additionalProperties: false diff --git a/helm-charts/basehub/values.yaml b/helm-charts/basehub/values.yaml index 71b574e67e..f50fc77e95 100644 --- a/helm-charts/basehub/values.yaml +++ b/helm-charts/basehub/values.yaml @@ -40,6 +40,8 @@ nfs: shareCreator: enabled: true tolerations: [] + homeSpaceReporter: + enabled: true pv: mountOptions: - soft From 5e429dd458dd0e11913875f7ea0b3b700ef8e3fa Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Sun, 18 Dec 2022 15:57:04 +0100 Subject: [PATCH 40/43] jmte: disable nfs.homeSpaceReporter.enabled --- config/clusters/jmte/common.values.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/config/clusters/jmte/common.values.yaml b/config/clusters/jmte/common.values.yaml index fd2f539848..29cd3c8ddc 100644 --- a/config/clusters/jmte/common.values.yaml +++ b/config/clusters/jmte/common.values.yaml @@ -6,6 +6,8 @@ basehub: # enabled: true shareCreator: enabled: true + homeSpaceReporter: + enabled: false pv: serverIP: fs-01707b06.efs.us-west-2.amazonaws.com # mountOptions from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html From 
275acbcc436d8378dfca8f6ecbdcea6aa540a8ef Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Sun, 18 Dec 2022 17:03:27 +0100 Subject: [PATCH 41/43] jmte: remove old dask-gateway workaround --- config/clusters/jmte/common.values.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/config/clusters/jmte/common.values.yaml b/config/clusters/jmte/common.values.yaml index 29cd3c8ddc..3d5a550910 100644 --- a/config/clusters/jmte/common.values.yaml +++ b/config/clusters/jmte/common.values.yaml @@ -291,10 +291,6 @@ basehub: - jonathan-taylor # Jonathan Taylor admin_users: *users allowNamedServers: true - networkPolicy: - # FIXME: Required for dask gateway 0.9.0. It is fixed but a Helm - # chart of newer version is not yet released. - enabled: false dask-gateway: # dask-gateway notes: From 5b4be24f693f9e7202816258d99f60ab5213c278 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Sun, 18 Dec 2022 17:04:24 +0100 Subject: [PATCH 42/43] jmte: add proxy.htts.hosts explicitly as autohttps is used --- config/clusters/jmte/prod.values.yaml | 3 +++ config/clusters/jmte/staging.values.yaml | 5 +++++ eksctl/eksctl-cluster-config.yaml | 11 ++++++++++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/config/clusters/jmte/prod.values.yaml b/config/clusters/jmte/prod.values.yaml index d09ffe8c51..fd26f5c9d7 100644 --- a/config/clusters/jmte/prod.values.yaml +++ b/config/clusters/jmte/prod.values.yaml @@ -79,6 +79,9 @@ basehub: subPath: _shared_public proxy: + https: + hosts: + - hub.jupytearth.org traefik: # jupyterhub-ssh/sftp integration part 3/3: # diff --git a/config/clusters/jmte/staging.values.yaml b/config/clusters/jmte/staging.values.yaml index 570bd8ebe2..0317f77053 100644 --- a/config/clusters/jmte/staging.values.yaml +++ b/config/clusters/jmte/staging.values.yaml @@ -24,6 +24,11 @@ basehub: # - --LabApp.collaborative=True + proxy: + https: + hosts: + - staging.hub.jupytearth.org + jupyterhub-ssh: sftp: enabled: false diff --git a/eksctl/eksctl-cluster-config.yaml b/eksctl/eksctl-cluster-config.yaml index f66e5d1547..82dcd31a11 100644 --- a/eksctl/eksctl-cluster-config.yaml +++ b/eksctl/eksctl-cluster-config.yaml @@ -185,7 +185,16 @@ metadata: # this plugin via this config. For now, this was done with manual patches # though. # - + # 7. I realized the ingress -> service coupling didn't work, so + # https://hub.jupytearth.org got stuck. + # + # Resolution attempt failing: eksctl utils update-legacy-subnet-settings --cluster=jmte + # + # Resolution attempt succeeded: I had also upgraded the deployer and ended + # up without getting proxy.https.hosts set following this: + # https://github.com/2i2c-org/infrastructure/pull/1404/commits/ec6f0aee616cb16d8b8e2e99252bb4110716b5d2#diff-eedaf02b81cd907a3feb5e4389e9825226bf7dc82a0fb582f9ad367c00ba6651L37, + # by adding proxy.https.hosts things started working again. 
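+  #
+  #    (A hedged follow-up check, assuming the z2jh autohttps setup is used:
+  #    "kubectl -n prod logs deploy/autohttps -c traefik" should show the
+  #    certificate being acquired for hub.jupytearth.org.)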
+ # version: "1.24" tags: 2i2c.org/project: jmte From 27a45923b542615569d5fd758a61ad635e44c2fc Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 27 Jan 2023 00:03:34 +0100 Subject: [PATCH 43/43] jmte: fix GPU nodes labels/taints (16/64 CPU variants was failing) --- eksctl/eksctl-cluster-config.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/eksctl/eksctl-cluster-config.yaml b/eksctl/eksctl-cluster-config.yaml index 82dcd31a11..c5e0bbc53f 100644 --- a/eksctl/eksctl-cluster-config.yaml +++ b/eksctl/eksctl-cluster-config.yaml @@ -474,13 +474,17 @@ nodeGroups: hub.jupyter.org/node-purpose: user 2i2c.org/node-cpu: "16" 2i2c.org/node-gpu: "1" + k8s.amazonaws.com/accelerator: "nvidia-tesla-t4" taints: hub.jupyter.org_dedicated: user:NoSchedule + nvidia.com/gpu: NoSchedule tags: + k8s.io/cluster-autoscaler/node-template/label/k8s.amazonaws.com/accelerator: "nvidia-tesla-t4" k8s.io/cluster-autoscaler/node-template/label/hub.jupyter.org/node-purpose: user k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-cpu: "16" k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-gpu: "1" k8s.io/cluster-autoscaler/node-template/taint/hub.jupyter.org_dedicated: user:NoSchedule + k8s.io/cluster-autoscaler/node-template/taint/nvidia.com/gpu: NoSchedule iam: &user-iam withAddonPolicies: autoScaler: true @@ -498,13 +502,17 @@ nodeGroups: hub.jupyter.org/node-purpose: user 2i2c.org/node-cpu: "64" 2i2c.org/node-gpu: "1" + k8s.amazonaws.com/accelerator: "nvidia-tesla-t4" taints: hub.jupyter.org_dedicated: user:NoSchedule + nvidia.com/gpu: NoSchedule tags: + k8s.io/cluster-autoscaler/node-template/label/k8s.amazonaws.com/accelerator: "nvidia-tesla-t4" k8s.io/cluster-autoscaler/node-template/label/hub.jupyter.org/node-purpose: user k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-cpu: "64" k8s.io/cluster-autoscaler/node-template/label/2i2c.org/node-gpu: "1" k8s.io/cluster-autoscaler/node-template/taint/hub.jupyter.org_dedicated: user:NoSchedule + k8s.io/cluster-autoscaler/node-template/taint/nvidia.com/gpu: NoSchedule iam: &user-iam withAddonPolicies: autoScaler: true