# We define a service account that is attached by default to all Jupyter user pods
# and dask-gateway workers. By default, this has no permissions - although extra
# cloud access permissions may be granted - see docs/topic/features.md.
userServiceAccount:
enabled: true
annotations: {}
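# As a hedged illustration only (not part of the chart defaults): cloud access
# is typically granted by annotating this service account so the cloud's IAM
# system trusts it. The identities below are made-up placeholders:
#
#   userServiceAccount:
#     annotations:
#       # GKE Workload Identity
#       iam.gke.io/gcp-service-account: example-hub-sa@example-project.iam.gserviceaccount.com
#       # EKS IAM Roles for Service Accounts (IRSA)
#       eks.amazonaws.com/role-arn: arn:aws:iam::111111111111:role/example-hub-role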
adminServiceAccount:
enabled: false
annotations: {}
binderhub-service:
enabled: false
ingress:
enabled: false
ingressClassName: nginx
annotations:
nginx.ingress.kubernetes.io/proxy-body-size: 256m
cert-manager.io/cluster-issuer: letsencrypt-prod
nodeSelector:
hub.jupyter.org/node-purpose: core
service:
port: 8090
# The DaemonSet at https://github.com/2i2c-org/binderhub-service/blob/main/binderhub-service/templates/docker-api/daemonset.yaml
# will start a docker-api pod on a user node.
# It starts the [dockerd](https://docs.docker.com/engine/reference/commandline/dockerd/) daemon,
# which is made accessible via a unix socket mounted by the build pods.
# The docker-api pod must run on the same node as the builder pods.
dockerApi:
nodeSelector:
hub.jupyter.org/node-purpose: user
tolerations:
# Tolerate tainted jupyterhub user nodes
- key: hub.jupyter.org_dedicated
value: user
effect: NoSchedule
- key: hub.jupyter.org/dedicated
value: user
effect: NoSchedule
config:
BinderHub:
base_url: /services/binder
use_registry: true
KubernetesBuildExecutor:
node_selector:
# Schedule builder pods to run on user nodes only
hub.jupyter.org/node-purpose: user
custom:
sendLogsOfLaunchEventsTo2i2c: false
extraConfig:
01-send-logs-of-launch-events-to-2i2c: |
if get_chart_config("custom.sendLogsOfLaunchEventsTo2i2c"):
import os
import sys
from traitlets.log import get_logger
# this check would ideally be done via chart config schema validation,
# but it may be too messy to do in practice - maybe not though
if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"):
get_logger().critical("binderhub-service.custom.sendLogsOfLaunchEventsTo2i2c requires binderhub-service.extraCredentials.googleServiceAccountKey to be setup")
sys.exit(1)
from google.cloud.logging import Client
from google.cloud.logging.handlers import CloudLoggingHandler
def _make_eventsink_handler(el):
client = Client()
log_name = "binderhub-event-logs"
get_logger().info(f"Sending logs of launch events to a 2i2c managed GCP project {client.project} under log name {log_name}.")
return [CloudLoggingHandler(client, name=log_name)]
c.EventLog.handlers_maker = _make_eventsink_handler
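# A hedged sketch (real values live in per-cluster secret config, not here) of
# what enabling the snippet above is expected to look like, based on the config
# paths it reads:
#
#   binderhub-service:
#     custom:
#       sendLogsOfLaunchEventsTo2i2c: true
#     extraCredentials:
#       googleServiceAccountKey: "<json key for a GCP service account>"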
ingressBasicAuth:
enabled: false
# Primarily here for validation to 'work',
# as these are set in secret config otherwise. I don't like this,
# as we won't catch these values missing if they aren't set.
username: ""
password: ""
dex:
enabled: false
staticWebsite:
enabled: false
source:
git:
branch: main
githubAuth:
enabled: false
githubApp:
# Primarily here for validation to 'work',
# as these are set in secret config otherwise. I don't like this,
# as we won't catch these values missing if they aren't set.
id: 0
privateKey: ""
dask-gateway:
enabled: false # Enabling dask-gateway will install Dask Gateway as a dependency.
# Further Dask Gateway configuration goes here
# See https://github.com/dask/dask-gateway/blob/main/resources/helm/dask-gateway/values.yaml
gateway:
backend:
scheduler:
extraPodConfig:
serviceAccountName: user-sa
tolerations:
# Let's put schedulers on notebook nodes, since they aren't ephemeral.
# Dask can recover from dead workers, but not from dead schedulers.
- key: "hub.jupyter.org/dedicated"
operator: "Equal"
value: "user"
effect: "NoSchedule"
- key: "hub.jupyter.org_dedicated"
operator: "Equal"
value: "user"
effect: "NoSchedule"
nodeSelector:
k8s.dask.org/node-purpose: scheduler
cores:
request: 0.01
limit: 1
memory:
request: 128M
limit: 1G
worker:
extraContainerConfig:
securityContext:
runAsGroup: 1000
runAsUser: 1000
extraPodConfig:
serviceAccountName: user-sa
securityContext:
fsGroup: 1000
tolerations:
- key: "k8s.dask.org/dedicated"
operator: "Equal"
value: "worker"
effect: "NoSchedule"
- key: "k8s.dask.org_dedicated"
operator: "Equal"
value: "worker"
effect: "NoSchedule"
nodeSelector:
# Dask workers get their own pre-emptible pool
k8s.dask.org/node-purpose: worker
env:
- name: BASEHUB_K8S_DIST
valueFrom:
configMapKeyRef:
name: basehub-cluster-info
key: K8S_DIST
extraConfig:
# This configuration represents options that can be presented to users
# that want to create a Dask cluster using the dask-gateway client.
#
# This configuration is meant to enable the user to request dask worker
# pods that fit well on 2i2c's clusters. Currently the only instance
# types used are n2-highmem-16 or r5.4xlarge.
#
# - Documentation about exposing cluster options to users:
#   https://gateway.dask.org/cluster-options.html
# - Reference for KubeClusterConfig, which is what can be configured:
#   https://gateway.dask.org/api-server.html#kubeclusterconfig.
#
optionHandler: |
import os
import string
from dask_gateway_server.options import Integer, Mapping, Options, Select, String
# Escape a string to be dns-safe in the same way that KubeSpawner does it.
# Reference https://github.com/jupyterhub/kubespawner/blob/616f72c4aee26c3d2127c6af6086ec50d6cda383/kubespawner/spawner.py#L1828-L1835
# Adapted from https://github.com/minrk/escapism to avoid installing the package
# in the dask-gateway api pod which would have been problematic.
def escape_string_label_safe(to_escape):
safe_chars = set(string.ascii_lowercase + string.digits)
escape_char = "-"
chars = []
for c in to_escape:
if c in safe_chars:
chars.append(c)
else:
# escape one character
buf = []
# UTF-8 uses 1 to 4 bytes per character, depending on the Unicode symbol
# so we need to transform each byte to its hex value
for byte in c.encode("utf8"):
buf.append(escape_char)
# %X is the hex value of the byte
buf.append('%X' % byte)
escaped_hex_char = "".join(buf)
chars.append(escaped_hex_char)
return u''.join(chars)
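# Illustration of the escaping above with hypothetical usernames:
#   escape_string_label_safe("jovyan") -> "jovyan"
#   escape_string_label_safe("User_1") -> "-55ser-5F1"  (each unsafe byte becomes "-" plus its hex value)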
# Decide on available instance types and their resource allocation
# choices to expose based on cloud provider. For each daskhub hub
# managed by 2i2c, there should be these instance types available.
#
cloud_provider = os.environ["BASEHUB_K8S_DIST"] # gke, eks, or aks
instance_types = {
"gke": ["n2-highmem-16"],
"eks": ["r5.4xlarge"],
"aks": ["Standard_E16_v4"],
}
# NOTE: Data mentioned below comes from manual inspection of data
# collected and currently only available at
# https://github.com/2i2c-org/infrastructure/pull/3337.
#
resource_allocations = {
# n2-highmem-16 nodes in our clusters have 15.89 allocatable cores
# and 116.549Gi allocatable memory, and daemonsets are expected to
# not add more than 400m cores and 800Mi (0.781Gi) memory with some
# margin, so we get 15.49 cores and 115.768Gi available for worker
# pods to request.
#
# This is an initial conservative strategy, allowing a slight
# oversubscription of CPU but not any oversubscription of memory.
#
# To work around https://github.com/dask/dask-gateway/issues/765, we
# round worker_cores down from [0.968, 1.936, 3.872, 7.745, 15.49]
# to [0.9, 1.9, 3.8, 7.7, 15.4].
#
"n2-highmem-16": {
"1CPU, 7.2Gi": {"worker_cores": 0.9, "worker_cores_limit": 1, "worker_memory": "7.235G", "worker_memory_limit": "7.235G"},
"2CPU, 14.5Gi": {"worker_cores": 1.9, "worker_cores_limit": 2, "worker_memory": "14.471G", "worker_memory_limit": "14.471G"},
"4CPU, 28.9Gi": {"worker_cores": 3.8, "worker_cores_limit": 4, "worker_memory": "28.942G", "worker_memory_limit": "28.942G"},
"8CPU, 57.9Gi": {"worker_cores": 7.7, "worker_cores_limit": 8, "worker_memory": "57.884G", "worker_memory_limit": "57.884G"},
"16CPU, 115.8Gi": {"worker_cores": 15.4, "worker_cores_limit": 16, "worker_memory": "115.768G", "worker_memory_limit": "115.768G"},
},
# r5.4xlarge nodes in our clusters have 15.89 allocatable cores and
# 121.504Gi allocatable memory, and daemonsets are expected to not
# add more than 400m cores and 800Mi (0.781Gi) memory with some
# margin, so we get 15.49 cores and 120.723Gi available for worker
# pods to request.
#
# This is an initial conservative strategy, allowing a slight
# oversubscription of CPU but not any oversubscription of memory.
#
# To work around https://github.com/dask/dask-gateway/issues/765, we
# round worker_cores down from [0.968, 1.936, 3.872, 7.745, 15.49]
# to [0.9, 1.9, 3.8, 7.7, 15.4].
#
"r5.4xlarge": {
"1CPU, 7.5Gi": {"worker_cores": 0.9, "worker_cores_limit": 1, "worker_memory": "7.545G", "worker_memory_limit": "7.545G"},
"2CPU, 15.1Gi": {"worker_cores": 1.9, "worker_cores_limit": 2, "worker_memory": "15.090G", "worker_memory_limit": "15.090G"},
"4CPU, 30.2Gi": {"worker_cores": 3.8, "worker_cores_limit": 4, "worker_memory": "30.180G", "worker_memory_limit": "30.180G"},
"8CPU, 60.4Gi": {"worker_cores": 7.7, "worker_cores_limit": 8, "worker_memory": "60.361G", "worker_memory_limit": "60.361G"},
"16CPU, 120.7Gi": {"worker_cores": 15.4, "worker_cores_limit": 16, "worker_memory": "120.723G", "worker_memory_limit": "120.723G"},
},
"Standard_E16_v4": {
# Set up to be proportionate, so using all the RAM uses all the CPU too
".25-1 CPU, 2GB RAM": {"worker_cores": 0.25, "worker_cores_limit": 1, "worker_memory": "2G", "worker_memory_limit": "2G"},
},
}
# For now we support only one instance type per cluster; listing it
# as an option is a way to help convey how things work a bit better.
it = instance_types[cloud_provider][0]
ra = resource_allocations[it]
ra_keys = list(ra.keys())
def cluster_options(user):
def option_handler(options):
if ":" not in options.image:
raise ValueError("When specifying an image you must also provide a tag")
extra_labels = {
"hub.jupyter.org/username": escape_string_label_safe(user.name),
}
scheduler_extra_pod_annotations = {
"hub.jupyter.org/username": user.name,
"prometheus.io/scrape": "true",
"prometheus.io/port": "8787",
}
worker_extra_pod_annotations = {
"hub.jupyter.org/username": user.name,
}
picked_ra = ra[options.worker_resource_allocation]
return {
# A default image is suggested via DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable
"image": options.image,
"scheduler_extra_pod_labels": extra_labels,
"scheduler_extra_pod_annotations": scheduler_extra_pod_annotations,
"worker_extra_pod_labels": extra_labels,
"worker_extra_pod_annotations": worker_extra_pod_annotations,
"worker_cores": picked_ra["worker_cores"],
"worker_cores_limit": picked_ra["worker_cores_limit"],
"worker_memory": picked_ra["worker_memory"],
"worker_memory_limit": picked_ra["worker_memory_limit"],
"environment": options.environment,
"idle_timeout": options.idle_timeout_minutes * 60,
}
return Options(
Select(
"instance_type",
[it],
default=it,
label="Instance type running worker containers",
),
Select(
"worker_resource_allocation",
ra_keys,
default=ra_keys[0],
label="Resources per worker container",
),
# The default image is pre-specified by the dask-gateway client
# via the env var DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE set on
# the jupyterhub user pods
String("image", label="Image"),
Mapping("environment", {}, label="Environment variables (YAML)"),
Integer("idle_timeout_minutes", 30, min=0, label="Idle cluster terminated after (minutes)"),
handler=option_handler,
)
c.Backend.cluster_options = cluster_options
# timeout after 30 minutes of inactivity by default, keep this in sync
# with the default value of the user-exposed idle_timeout_minutes option
# configured above
c.KubeClusterConfig.idle_timeout = 1800
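# For context, a hub user consumes the options exposed above roughly like
# this (a hedged sketch of dask-gateway client usage; the resource allocation
# string is one of the ra_keys generated above):
#
#   from dask_gateway import Gateway
#   gateway = Gateway()
#   options = gateway.cluster_options()
#   options.worker_resource_allocation = "4CPU, 28.9Gi"
#   cluster = gateway.new_cluster(options)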
prefix: "/services/dask-gateway" # Users connect to the Gateway through the JupyterHub service.
auth:
type: jupyterhub # Use JupyterHub to authenticate with Dask Gateway
traefik:
nodeSelector:
k8s.dask.org/node-purpose: core
service:
type: ClusterIP # Access Dask Gateway through JupyterHub. To access the Gateway from outside JupyterHub, this must be changed to a `LoadBalancer`.
nfs:
enabled: false
dirsizeReporter:
enabled: true
volumeReporter:
enabled: true
shareCreator:
enabled: true
tolerations: []
pv:
enabled: false
mountOptions:
- soft
- noatime
- vers=4.2
# Use NFS provided by an in-cluster server with the nfs-external-provisioner chart
inClusterNFS:
enabled: false
size: 100Gi
# A placeholder, since it should be possible to provide global values (which
# can be referenced from the same location in any chart), even though they
# aren't necessarily provided or used.
global: {}
jupyterhub:
cull:
# Don't allow any user pods to run for longer than 7 days by default
maxAge: 604800 # 7 days in seconds
custom:
auth:
anonymizeUsername: false
singleuser:
extraPVCs: []
singleuserAdmin:
extraEnv: {}
extraVolumeMounts:
# IMPORTANT: What is added to this list is copied to other locations
# that want to add an element to this list. This is done
# because when Helm config files are merged, lists get
# replaced rather than appended. So, if this is to be
# updated, we should update all those copies as well. An easy
# way to find such copies is to search for "singleuserAdmin:"
# in this repo.
#
- name: home
mountPath: /home/jovyan/shared-readwrite
subPath: _shared
- name: home
mountPath: /home/rstudio/shared-readwrite
subPath: _shared
2i2c:
# Should 2i2c engineering staff user IDs be injected into the admin_users
# configuration of the JupyterHub authenticator by our custom
# jupyterhub_config.py snippet, as declared in hub.extraConfig?
# (See the commented example after the staff ID lists below.)
add_staff_user_ids_of_type: ""
staff_github_ids:
- agoose77
- AIDEA775
- choldgraf
- colliand
- consideRatio
- damianavila
- GeorgianaElena
- Gman0909
- haroldcampbell
- jmunroe
- jnywong
- sgibson91
- yuvipanda
staff_google_ids:
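# A hedged example of how a cluster might opt in to the staff-admin injection
# described above (the type must match one of the staff ID lists):
#
#   2i2c:
#     add_staff_user_ids_to_admin_users: true
#     add_staff_user_ids_of_type: "github"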
homepage:
gitRepoUrl: "https://github.com/2i2c-org/default-hub-homepage"
# TODO: make main the default branch in the repo above
gitRepoBranch: "master"
templateVars:
enabled: true
jupyterhubConfigurator:
enabled: true
ingress:
enabled: true
ingressClassName: nginx
annotations:
nginx.ingress.kubernetes.io/proxy-body-size: 256m
cert-manager.io/cluster-issuer: letsencrypt-prod
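# Per-hub config is expected to add the public hostname(s) here; a hedged
# example with a made-up domain, following z2jh's ingress schema:
#
#   ingress:
#     hosts: [hub.example.org]
#     tls:
#       - secretName: https-auto-tls
#         hosts: [hub.example.org]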
scheduling:
# We declare matchNodePurpose=require to get a nodeAffinity like a
# nodeSelector on all core pods and user pods. Core pods like hub and proxy
# will schedule on nodes with hub.jupyter.org/node-purpose=core and user
# pods on nodes with hub.jupyter.org/node-purpose=user.
#
# Since this setting adds a nodeAffinity, it's okay that we configure
# KubeSpawner's profile_list to override node_selector.
#
corePods:
nodeAffinity:
matchNodePurpose: require
userPods:
nodeAffinity:
matchNodePurpose: require
podPriority:
enabled: true
userPlaceholder:
enabled: true
replicas: 0
userScheduler:
enabled: false
# The replicas default value is 2, but running the user-scheduler HA is
# almost never practically helpful, and has been found to increase cloud
# costs in clusters with many hubs. For additional discussion on reducing
# this to 1, see https://github.com/2i2c-org/infrastructure/issues/3865.
replicas: 1
# FIXME: We should think about these resource requests/limits, see
# https://github.com/2i2c-org/infrastructure/issues/2127.
#
resources:
requests:
cpu: 0.01
memory: 64Mi
limits:
memory: 1G
# prePuller is about pulling one or more images identified via chart
# configuration, including singleuser.image and singleuser.profileList entries
# with a dedicated image, but not profileList entries with images specified
# via profile_options.
prePuller:
# continuous prePuller leads to the creation of a DaemonSet that starts a
# pod on each node to pull images.
#
# It is disabled as it is only relevant for nodes started before user pods
# get scheduled on them; in other cases it could delay startup and isn't
# expected to reduce startup times.
#
continuous:
enabled: false
# hook prePuller leads to the creation of a temporary DaemonSet and a pod
# awaiting pulling to complete before `helm upgrade` starts its main work.
#
# It is disabled as it adds notable complexity for a smaller benefit when
# correctly adopted. The added complexity includes:
#
# - risk of misconfiguration causing image pulls that aren't actually needed
# - risk of broken expectations and additional cognitive load
# - risk of causing significantly longer `helm upgrade` commands slowing
# down our CI system
# - a ClusterRoleBinding resource is needed for the image-awaiter Pod
#   involved, a resource that requires the highest k8s cluster permissions
#   and otherwise possibly isn't needed to deploy basehub
#
hook:
enabled: false
proxy:
service:
type: ClusterIP
chp:
# FIXME: We should think about these resource requests/limits, see
# https://github.com/2i2c-org/infrastructure/issues/2127.
#
resources:
requests:
cpu: 0.01
memory: 64Mi
limits:
memory: 1Gi
traefik:
# FIXME: We should think about these resource requests/limits, see
# https://github.com/2i2c-org/infrastructure/issues/2127.
#
# Note: if autohttps pods aren't used anywhere by our basehub
# deployments, we should simply remove this traefik configuration.
#
resources:
requests:
memory: 64Mi
limits:
memory: 1Gi
singleuser:
# basehub creates a k8s ServiceAccount for the hub's users that isn't granted
# permissions to the k8s api-server or other resources by default. Cloud
# infra permissions can be granted to all users by declaring annotations on
# this k8s ServiceAccount via basehub config userServiceAccount.annotations.
serviceAccountName: user-sa
# Need to explicitly fix ownership here, as otherwise these directories will be owned
# by root on most NFS filesystems - neither EFS nor Google Filestore support anonuid
#
# This has to be done _once_ for each directory we mount _from_ the NFS
# server. We do it every time since we can't know for sure that it has
# already been done.
#
# Note that we don't have to chown both the shared and shared-readwrite
# folder since they are both mounting the same folder on the NFS server.
#
# For details about this, see notes at:
# - https://github.com/2i2c-org/infrastructure/issues/2953#issuecomment-1672025545
# - https://github.com/2i2c-org/infrastructure/issues/2946#issuecomment-1671691248
#
initContainers:
- name: volume-mount-ownership-fix
image: busybox:1.36.1
command:
- sh
- -c
- id && chown 1000:1000 /home/jovyan /home/jovyan/shared && ls -lhd /home/jovyan
securityContext:
runAsUser: 0
volumeMounts:
- name: home
mountPath: /home/jovyan
subPath: "{username}"
# Mounted without readonly attribute here,
# so we can chown it appropriately
- name: home
mountPath: /home/jovyan/shared
subPath: _shared
cmd:
# Mitigate a vulnerability in jupyter-server-proxy version <4.1.1, see
# https://github.com/jupyterhub/jupyter-server-proxy/security/advisories/GHSA-w3vc-fx9p-wp4v
# for more details.
- /mnt/ghsa-w3vc-fx9p-wp4v/check-patch-run
- jupyterhub-singleuser
extraEnv:
# The notebook server writes secure files here that don't need to survive
# a restart. Writing 'secure' files on some file systems (like
# Azure Files with SMB) seems buggy, so we just put the runtime dir on
# /tmp. This is ok in our case, since no two users are on the same
# container.
JUPYTER_RUNTIME_DIR: /tmp/.jupyter-runtime
# By default, /bin/sh is used as shell for terminals, not /bin/bash
# Most people do not expect this, so let's match expectation
SHELL: /bin/bash
extraFiles:
ghsa-w3vc-fx9p-wp4v-check-patch-run:
mountPath: /mnt/ghsa-w3vc-fx9p-wp4v/check-patch-run
mode: 0755
stringData: |
#!/usr/bin/env python3
"""
This script is designed to check for and conditionally patch GHSA-w3vc-fx9p-wp4v
in user servers started by a JupyterHub. The script will execute any command
passed via arguments if provided, allowing it to wrap a user server startup call
to `jupyterhub-singleuser` for example.
Script adjustments:
- UPGRADE_IF_VULNERABLE
- ERROR_IF_VULNERABLE
Script patching assumptions:
- script is run before the jupyter server starts
- pip is available
- pip has sufficient filesystem permissions to upgrade jupyter-server-proxy
Read more at https://github.com/jupyterhub/jupyter-server-proxy/security/advisories/GHSA-w3vc-fx9p-wp4v.
"""
import os
import subprocess
import sys
# adjust these to meet vulnerability mitigation needs
UPGRADE_IF_VULNERABLE = True
ERROR_IF_VULNERABLE = False
def check_vuln():
"""
Checks for the vulnerability by looking to see if __version__ is available
as it coincides with the patched versions (3.2.3 and 4.1.1).
"""
try:
import jupyter_server_proxy
return False if hasattr(jupyter_server_proxy, "__version__") else True
except:
return False
def get_version_specifier():
"""
Returns a pip version specifier for use with `--no-deps` meant to do as
little as possible besides patching the vulnerability and remaining
functional.
"""
old = ["jupyter-server-proxy>=3.2.3,<4"]
new = ["jupyter-server-proxy>=4.1.1,<5", "simpervisor>=1,<2"]
try:
if sys.version_info < (3, 8):
return old
from importlib.metadata import version
jsp_version = version("jupyter-server-proxy")
if int(jsp_version.split(".")[0]) < 4:
return old
except:
pass
return new
def patch_vuln():
"""
Attempts to patch the vulnerability by upgrading jupyter-server-proxy using
pip.
"""
# attempt upgrade via pip, takes ~4 seconds
proc = subprocess.run(
[sys.executable, "-m", "pip", "--version"],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
pip_available = proc.returncode == 0
if pip_available:
proc = subprocess.run(
[sys.executable, "-m", "pip", "install", "--no-deps"]
+ get_version_specifier()
)
if proc.returncode == 0:
return True
return False
def main():
if check_vuln():
warning_or_error = (
"ERROR" if ERROR_IF_VULNERABLE and not UPGRADE_IF_VULNERABLE else "WARNING"
)
print(
f"{warning_or_error}: jupyter-server-proxy __is vulnerable__ to GHSA-w3vc-fx9p-wp4v, see "
"https://github.com/jupyterhub/jupyter-server-proxy/security/advisories/GHSA-w3vc-fx9p-wp4v.",
flush=True,
)
if warning_or_error == "ERROR":
sys.exit(1)
if UPGRADE_IF_VULNERABLE:
print(
"INFO: Attempting to upgrade jupyter-server-proxy using pip...",
flush=True,
)
if patch_vuln():
print(
"INFO: Attempt to upgrade jupyter-server-proxy succeeded!",
flush=True,
)
else:
warning_or_error = "ERROR" if ERROR_IF_VULNERABLE else "WARNING"
print(
f"{warning_or_error}: Attempt to upgrade jupyter-server-proxy failed!",
flush=True,
)
if warning_or_error == "ERROR":
sys.exit(1)
if len(sys.argv) >= 2:
print("INFO: Executing provided command", flush=True)
os.execvp(sys.argv[1], sys.argv[1:])
else:
print("INFO: No command to execute provided", flush=True)
main()
ipython_kernel_config.json:
mountPath: /usr/local/etc/ipython/ipython_kernel_config.json
data:
# This keeps a history of all executed code under $HOME, which is almost always on
# NFS. This file is kept as a sqlite file, and sqlite and NFS do not go together very
# well! Disable this to save ourselves from debugging random NFS oddities that are caused
# by this unholy sqlite + NFS mixture.
HistoryManager:
enabled: false
# jupyter_server and notebook are different jupyter servers providing
# similar configuration options. Since we have user images that may
# provide either, we provide the same configuration for both via
# jupyter_server_config.json and jupyter_notebook_config.json.
#
# A hub can force a choice with singleuser.extraEnv via:
#
# JUPYTERHUB_SINGLEUSER_APP: "notebook.notebookapp.NotebookApp"
# JUPYTERHUB_SINGLEUSER_APP: "jupyter_server.serverapp.ServerApp"
#
jupyter_server_config.json:
mountPath: /usr/local/etc/jupyter/jupyter_server_config.json
# If a user leaves a notebook with a running kernel,
# the effective idle timeout will typically be the cull idle timeout
# of the server plus the cull idle timeout of the kernel,
# as culling the kernel will register activity,
# resetting the no_activity timer for the server as a whole
data:
# Allow JupyterLab to show the 'View -> Show Hidden Files' option
# in the menu. Defaults are not changed.
# https://github.com/jupyterlab/jupyterlab/issues/11304#issuecomment-945466766
ContentsManager:
allow_hidden: true
# MappingKernelManager configuration reference:
# https://jupyter-server.readthedocs.io/en/latest/api/jupyter_server.services.kernels.html#jupyter_server.services.kernels.kernelmanager.MappingKernelManager
#
MappingKernelManager: &server_config_mapping_kernel_manager
cull_idle_timeout: 3600
cull_interval: 300
cull_connected: true
# ServerApp configuration reference:
# https://jupyter-server.readthedocs.io/en/latest/api/jupyter_server.html#jupyter_server.serverapp.ServerApp
#
ServerApp: &server_config_server_app
extra_template_paths:
- /usr/local/share/jupyter/custom_template
# Move the sqlite file used by https://github.com/jupyter-server/jupyter_server_fileid
# off the default path, which is under ~/.local/share/jupyter.
# That is NFS, and sqlite + NFS don't go well together. In addition,
# it uses sqlite's WAL mode, which is completely unsupported on NFS.
# Upstream discussion is in https://github.com/jupyter-server/jupyter_server_fileid/issues/60.
BaseFileIdManager: &server_config_base_file_id_manager
db_path: /tmp/file_id_manager.db
jupyter_notebook_config.json:
mountPath: /usr/local/etc/jupyter/jupyter_notebook_config.json
data:
MappingKernelManager: *server_config_mapping_kernel_manager
NotebookApp: *server_config_server_app
BaseFileIdManager: *server_config_base_file_id_manager
startTimeout: 600 # 10 mins, node startup + image pulling sometimes takes more than the default 5min
defaultUrl: /tree
image:
name: quay.io/jupyter/scipy-notebook
tag: "2024-03-18"
storage:
type: static
static:
pvcName: home-nfs
subPath: "{username}"
extraVolumes:
- name: dev-shm
emptyDir:
medium: Memory
extraVolumeMounts:
- name: home
mountPath: /home/jovyan/shared
subPath: _shared
readOnly: true
- name: dev-shm
mountPath: /dev/shm
# For all pods, mount home both in /home/jovyan (done via singleuser.storage.static)
# and in /home/rstudio. This allows rocker images (which use the
# rstudio user and put home in /home/rstudio) to be first class citizens
# along with jupyter based images, regardless of how they are specified
# (via the configurator, with unlisted_choice, or as a profile). For non-rocker
# images, this is just invisible in the UI and there is no performance overhead
# for these extra bind mounts. An additional positive here is that in case *students*
# end up accidentally hardcoding paths in their notebooks, things will continue to work
# regardless of whether they are on RStudio or JupyterLab (described to us as a serious
# problem by Openscapes).
- name: home
mountPath: /home/rstudio
subPath: "{username}"
- name: home
mountPath: /home/rstudio/shared
subPath: _shared
readOnly: true
memory:
guarantee: 256M
limit: 1G
cpu:
# If no CPU limit is set, it is possible for a single user or group of users to
# starve everyone else of CPU time on a node, even causing new user pods to completely
# fail as the notebook server process gets no CPU to complete auth handshake with
# the server, and even trivial cells like `print("hello world")` may not run.
# Unlike memory guarantees, CPU guarantees are actually enforced by the Linux Kernel
# (see https://medium.com/@betz.mark/understanding-resource-limits-in-kubernetes-cpu-time-9eff74d3161b)
# By giving each user a 5% CPU guarantee (represented by 0.05), we ensure that:
# 1. Simple cells will always execute
# 2. Notebook server processes will always start - so users won't have server spawn failure
# 3. We don't accidentally set just a high limit for a particular hub and not set a
# guarantee, at which point kubernetes treats the limit as the guarantee! This causes
# far more nodes to be scaled up than needed, making everything super slow (like in
# https://github.com/2i2c-org/infrastructure/issues/790)
# 4. Most of our workloads are still memory bound, and we want scaling to happen only
# when a node is full on its memory guarantees. But a 0.05 guarantee means a n1-highmem-8
# node can fit 160 user pods, and since kubernetes already caps us at 100 pods a node,
# this guarantee doesn't actually change our scheduling.
guarantee: 0.05
networkPolicy:
enabled: true
# Egress to internet is allowed by default via z2jh's egressAllowRules,
# but we need to add a few custom rules for the cluster internal
# networking.
egress:
# Allow code in hubs to talk to the ingress provider, so user pods can
# reach the hub via its public URL
- to:
- namespaceSelector:
matchLabels:
name: support
podSelector:
matchLabels:
app.kubernetes.io/name: ingress-nginx
# If a hub is using autohttps instead of ingress-nginx, allow traffic
# to the autohttps pod as well
- to:
- podSelector:
matchLabels:
app: jupyterhub
component: autohttps
# Allow traffic to the proxy pod from user pods
# This is particularly important for daskhubs that utilise the proxy
# in order to create clusters (schedulers and workers)
- to:
- podSelector:
matchLabels:
app: jupyterhub
component: proxy
# Allow traffic to the traefik pod from user pods. Needed for daskhubs.
- to:
- podSelector:
matchLabels:
app.kubernetes.io/component: traefik
# Allow HTTPS and HTTP traffic explicitly to the whole world
# This is a no-op if `singleuser.networkPolicy.egressAllowRules.nonPrivateIPs`
# is true (the default). When it is set to false, this rule allows outbound access
# to these specified ports to the broad internet (but not internal networks)
- ports:
- port: 80
protocol: TCP
- port: 443
protocol: TCP
- port: 443
protocol: UDP # Hello, HTTP/3
to:
- ipBlock:
cidr: 0.0.0.0/0
except:
- 10.0.0.0/8
- 172.16.0.0/12
- 192.168.0.0/16
hub:
config:
JupyterHub:
# Allow unauthenticated prometheus requests
# Otherwise our prometheus server can't get hub metrics
authenticate_prometheus: false
KubeSpawner:
# Make sure working directory is where we mount the home folder
working_dir: /home/jovyan
# Increase timeout for Jupyter server to become 'ready', until
# https://github.com/2i2c-org/infrastructure/issues/2047 is fixed
http_timeout: 120
Authenticator:
# Don't allow the test username to log in to the hub.
# The test service will still be able to create this hub username
# and start their server.
# Ref: https://github.com/2i2c-org/meta/issues/321
blocked_users:
- deployment-service-check
extraFiles:
configurator-schema-default:
mountPath: /usr/local/etc/jupyterhub-configurator/00-default.schema.json
data:
type: object
name: config
properties:
KubeSpawner.image:
type: string
title: User docker image
description: Determines languages, libraries and interfaces available
help: Leave this blank to use the default
Spawner.default_url:
type: string
title: Default User Interface
enum:
- "/tree"
- "/lab"
- "/rstudio"
default: "/tree"
enumMetadata:
interfaces:
- value: "/tree"
title: Classic Notebook
description: >-
The original single-document interface for creating
Jupyter Notebooks.
- value: "/lab"
title: JupyterLab
description: A powerful next-generation notebook interface
- value: "/rstudio"
title: RStudio
description: An IDE for R, created by the RStudio company
extraEnv:
BASEHUB_K8S_DIST:
valueFrom:
configMapKeyRef:
name: basehub-cluster-info
key: K8S_DIST
initContainers:
- name: templates-clone
image: alpine/git:2.40.1
command:
- /bin/sh
args:
- -c
# Remove the existing repo first if it exists, as otherwise we will
# error out when the pod restarts. /srv/extra-templates-dir is an
# emptyDir volume, so it is *not* cleaned up when the pod's containers restart -
# only when the pod itself is deleted and cleaned up.
# We also mount the emptyDir in `/srv/extra-templates-dir` but
# clone into a *subdirectory*, as the mount itself is owned by
# root, and git freaks out when that is the case. By putting
# the repo in a subdirectory, we avoid the permission problems.
- |
rm -rf /srv/extra-templates-dir/repo;
git clone ${GIT_REPO_URL} /srv/extra-templates-dir/repo
env:
- name: GIT_REPO_URL
valueFrom:
configMapKeyRef:
name: hub-custom-templates-config
key: GIT_REPO_URL
securityContext:
runAsUser: 1000
runAsGroup: 1000
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
volumeMounts:
- name: custom-templates
mountPath: /srv/extra-templates-dir
extraContainers:
- name: templates-sync
image: alpine/git:2.40.1
workingDir: /srv/extra-templates-dir/repo
command:
- /bin/sh
args:
- -c
- |
handle_sigterm() {
echo "SIGTERM received, terminating...";
exit;
}
trap handle_sigterm SIGTERM;
echo "Starting template sync...";
echo "";
echo "Info about local git repo to be synced:";
(
# set -x causes commands run to be printed, helping log readers
# understand what the generated output is about. set -x is
# configured within a subshell to just print info about the
# specific chosen commands and avoid printing info about running
# "echo", "sleep", "set +x", or similar commands.
set -x;
git remote -v;
ls -lhd /srv/extra-templates-dir/repo;
)