Commit
Merge pull request #379 from yuvipanda/aws-kubeconfig
Migrate Farallon Staging hub to this repository.
yuvipanda authored May 7, 2021
2 parents 96616bf + 8464f1b commit 3e22399
Showing 11 changed files with 321 additions and 63 deletions.
196 changes: 196 additions & 0 deletions config/hubs/farallon.cluster.yaml
@@ -0,0 +1,196 @@
name: farallon
provider: kubeconfig
kubeconfig:
file: secrets/farallon.yaml
hubs:
- name: farallon-staging
domain: staging.farallon.2i2c.cloud
template: daskhub
auth0:
connection: github
config:
scratchBucket:
enabled: false
basehub:
nfsPVC:
nfs:
# from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html
mountOptions:
- rsize=1048576
- wsize=1048576
- timeo=600
- soft # We pick soft over hard, so NFS lockups don't lead to hung processes
- retrans=2
- noresvport
serverIP: fs-7b129903.efs.us-east-2.amazonaws.com
baseShareName: /homes/
shareCreator:
tolerations:
- key: node-role.kubernetes.io/master
operator: "Exists"
effect: "NoSchedule"
jupyterhub:
homepage:
templateVars:
org:
name: Farallon Institute
logo_url: https://2i2c.org/media/logo.png
url: http://www.faralloninstitute.org/
designed_by:
name: 2i2c
url: https://2i2c.org
operated_by:
name: 2i2c
url: https://2i2c.org
funded_by:
name: Farallon Institute
url: http://www.faralloninstitute.org/
singleuser:
initContainers:
# Need to explicitly fix ownership here, since EFS doesn't do anonuid
- name: volume-mount-ownership-fix
image: busybox
command: ["sh", "-c", "id && chown 1000:1000 /home/jovyan && ls -lhd /home/jovyan"]
securityContext:
runAsUser: 0
volumeMounts:
- name: home
mountPath: /home/jovyan
subPath: "{username}"
image:
name: 677861182063.dkr.ecr.us-east-2.amazonaws.com/2i2c-hub/user-image
tag: 9cd76f1
profileList:
# The mem-guarantees are here so k8s doesn't schedule other pods
# on these nodes.
- display_name: "Default: m5.xlarge"
description: "~4CPUs & ~15GB RAM"
kubespawner_override:
# Explicitly unset mem_limit, so it overrides the default memory limit we set in
# basehub/values.yaml
mem_limit: null
mem_guarantee: 14G
cpu_guarantee: 3
node_selector:
hub.jupyter.org/pool-name: notebook-m5-xlarge
- display_name: "Default: m5.2xlarge"
description: "~8CPUs & ~30GB RAM"
kubespawner_override:
# Explicitly unset mem_limit, so it overrides the default memory limit we set in
# basehub/values.yaml
mem_limit: null
mem_guarantee: 28G
cpu_guarantee: 7
node_selector:
hub.jupyter.org/pool-name: notebook-m5-2xlarge
scheduling:
userPlaceholder:
enabled: false
replicas: 0
userScheduler:
enabled: false
proxy:
service:
type: LoadBalancer
https:
enabled: true
hosts:
- staging.farallon.2i2c.cloud
chp:
nodeSelector: {}
tolerations:
- key: "node-role.kubernetes.io/master"
effect: "NoSchedule"
traefik:
nodeSelector: {}
tolerations:
- key: "node-role.kubernetes.io/master"
effect: "NoSchedule"
hub:
allowNamedServers: true
networkPolicy:
# FIXME: For dask gateway
enabled: false
readinessProbe:
enabled: false
nodeSelector: {}
tolerations:
- key: "node-role.kubernetes.io/master"
effect: "NoSchedule"
dask-gateway:
traefik:
tolerations:
- key: "node-role.kubernetes.io/master"
effect: "NoSchedule"
controller:
tolerations:
- key: "node-role.kubernetes.io/master"
effect: "NoSchedule"
gateway:
tolerations:
- key: "node-role.kubernetes.io/master"
effect: "NoSchedule"
backend:
scheduler:
extraPodConfig:
nodeSelector:
hub.jupyter.org/pool-name: dask-worker
tolerations:
- key: "k8s.dask.org/dedicated"
operator: "Equal"
value: "worker"
effect: "NoSchedule"
- key: "k8s.dask.org_dedicated"
operator: "Equal"
value: "worker"
effect: "NoSchedule"
worker:
extraPodConfig:
nodeSelector:
hub.jupyter.org/pool-name: dask-worker
tolerations:
- key: "k8s.dask.org/dedicated"
operator: "Equal"
value: "worker"
effect: "NoSchedule"
- key: "k8s.dask.org_dedicated"
operator: "Equal"
value: "worker"
effect: "NoSchedule"

# TODO: figure out a replacement for userLimits.
extraConfig:
optionHandler: |
from dask_gateway_server.options import Options, Integer, Float, String
def cluster_options(user):
def option_handler(options):
if ":" not in options.image:
raise ValueError("When specifying an image you must also provide a tag")
extra_annotations = {
"hub.jupyter.org/username": user.name,
"prometheus.io/scrape": "true",
"prometheus.io/port": "8787",
}
extra_labels = {
"hub.jupyter.org/username": user.name,
}
return {
"worker_cores_limit": options.worker_cores,
"worker_cores": min(options.worker_cores / 2, 1),
"worker_memory": "%fG" % options.worker_memory,
"image": options.image,
"scheduler_extra_pod_annotations": extra_annotations,
"worker_extra_pod_annotations": extra_annotations,
"scheduler_extra_pod_labels": extra_labels,
"worker_extra_pod_labels": extra_labels,
}
return Options(
Integer("worker_cores", 2, min=1, max=16, label="Worker Cores"),
Float("worker_memory", 4, min=1, max=32, label="Worker Memory (GiB)"),
String("image", default="pangeo/pangeo-notebook:latest", label="Image"),
handler=option_handler,
)
c.Backend.cluster_options = cluster_options
idle: |
# timeout after 30 minutes of inactivity
c.KubeClusterConfig.idle_timeout = 1800
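
For reference, the options defined by this handler surface through the standard dask-gateway client API. A minimal sketch of what notebook-side usage could look like against this hub (the image tag is purely illustrative; the handler above rejects images without one):

from dask_gateway import Gateway

# On a daskhub, the gateway address and auth are picked up from
# environment variables set in the user pod.
gateway = Gateway()

options = gateway.cluster_options()  # Worker Cores, Worker Memory (GiB), Image
options.worker_cores = 4             # cores guarantee becomes min(4 / 2, 1) per the handler
options.worker_memory = 8            # rendered as "8.000000G" by the handler
options.image = "pangeo/pangeo-notebook:2021.05.04"  # illustrative tag

cluster = gateway.new_cluster(options)
cluster.scale(4)
client = cluster.get_client()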
16 changes: 15 additions & 1 deletion config/hubs/schema.yaml
@@ -14,9 +14,23 @@ properties:
type: string
description: |
Cloud provider this cluster is running on. Used to perform
authentication against the cluster. Currently supports gcp
and raw kubeconfig files.
enum:
- gcp
- kubeconfig
kubeconfig:
type: object
description: |
Configuration to connect to a cluster purely via a kubeconfig
file.
additionalProperties: false
properties:
file:
type: string
description: |
Path to kubeconfig file (encrypted with sops) to use for
connecting to the cluster
gcp:
type: object
additionalProperties: false
54 changes: 36 additions & 18 deletions deployer/hub.py
@@ -37,12 +37,30 @@ def build_image(self):

@contextmanager
def auth(self):
if self.spec['provider'] == 'gcp':
yield from self.auth_gcp()
elif self.spec['provider'] == 'kubeconfig':
yield from self.auth_kubeconfig()
else:
raise ValueError(f'Provider {self.spec["provider"]} not supported')


def auth_kubeconfig(self):
"""
Context manager for authenticating with just a kubeconfig file.

For the duration of the context manager, we:
1. Decrypt the file specified in kubeconfig.file with sops
2. Set `KUBECONFIG` env var to our decrypted file path, so applications
we call (primarily helm) will use that as config
"""
config = self.spec['kubeconfig']
config_path = config['file']

with decrypt_file(config_path) as decrypted_key_path:
# FIXME: Unset this after our yield
os.environ['KUBECONFIG'] = decrypted_key_path
yield

def auth_gcp(self):
config = self.spec['gcp']
@@ -52,23 +70,23 @@ def auth_gcp(self):
# Else, it'll just have a `zone` key set. Let's respect either.
location = config.get('zone', config.get('region'))
cluster = config['cluster']
with decrypt_file(key_path) as decrypted_key_path:
    subprocess.check_call([
        'gcloud', 'auth',
        'activate-service-account',
        '--key-file', os.path.abspath(decrypted_key_path)
    ])

subprocess.check_call([
    'gcloud', 'container', 'clusters',
    # --zone works with regions too
    f'--zone={location}',
    f'--project={project}',
    'get-credentials', cluster
])

yield


class Hub:
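Both auth paths lean on the decrypt_file context manager imported elsewhere in the deployer. As a rough sketch (assuming the sops CLI is on the PATH; the actual helper in this repository may differ), such a helper could look like:

import subprocess
import tempfile
from contextlib import contextmanager

@contextmanager
def decrypt_file(encrypted_path):
    # Ask sops to decrypt the file, holding the plaintext only in a
    # NamedTemporaryFile that is removed when the context exits.
    decrypted = subprocess.check_output(['sops', '--decrypt', str(encrypted_path)])
    with tempfile.NamedTemporaryFile() as f:
        f.write(decrypted)
        f.flush()
        yield f.name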

This file was deleted.

@@ -37,11 +37,4 @@ spec:
apiVersion: resourcemanager.cnrm.cloud.google.com/v1beta1
kind: Project
external: projects/{{ .Values.jupyterhub.cloudResources.gcp.projectId }}
{{- end }}
5 changes: 1 addition & 4 deletions hub-templates/basehub/templates/nfs-pvc.yaml
@@ -11,10 +11,7 @@ spec:
nfs:
server: {{ .Values.nfsPVC.nfs.serverIP | quote}}
path: "{{ .Values.nfsPVC.nfs.baseShareName }}{{ .Release.Name }}"
mountOptions: {{ .Values.nfsPVC.nfs.mountOptions | toJson }}
---
apiVersion: v1
kind: PersistentVolumeClaim
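For illustration, given the Farallon mountOptions list above, the toJson pipeline would render this line roughly as follows (a sketch of the expected Helm output, not captured from a real deploy):

mountOptions: ["rsize=1048576","wsize=1048576","timeo=600","soft","retrans=2","noresvport"]

Because YAML is a superset of JSON, the flow-style list is valid YAML, which is what makes toJson a convenient way to splice a values list into a template.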
2 changes: 2 additions & 0 deletions hub-templates/basehub/templates/nfs-share-creator.yaml
@@ -22,6 +22,8 @@ spec:
spec:
restartPolicy: Never
terminationGracePeriodSeconds: 0
tolerations: {{ .Values.nfsPVC.shareCreator.tolerations | toJson }}

containers:
- name: dummy
image: busybox
10 changes: 10 additions & 0 deletions hub-templates/basehub/templates/user-sa.yaml
@@ -0,0 +1,10 @@
apiVersion: v1
kind: ServiceAccount
metadata:
annotations:
{{ if .Values.jupyterhub.cloudResources.scratchBucket.enabled}}
{{ if eq .Values.jupyterhub.cloudResources.provider "gcp" }}
iam.gke.io/gcp-service-account: {{ include "cloudResources.gcp.serviceAccountName" .}}@{{ .Values.jupyterhub.cloudResources.gcp.projectId }}.iam.gserviceaccount.com
{{- end }}
{{- end }}
name: user-sa
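
This template attaches the GKE workload-identity annotation only when a scratch bucket is enabled on a gcp cluster. A hedged sketch of values that would trigger the annotation, with an illustrative project id:

jupyterhub:
  cloudResources:
    provider: gcp
    scratchBucket:
      enabled: true
    gcp:
      projectId: example-project-id  # illustrative; not a real project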
4 changes: 4 additions & 0 deletions hub-templates/basehub/values.yaml
@@ -19,6 +19,10 @@ nfsPVC:
shareCreator:
tolerations: []
nfs:
mountOptions:
- soft
- noatime
- vers=4.2
serverIP: nfs-server-01
# MUST HAVE TRAILING SLASH
baseShareName: /export/home-01/homes/