# kubernetes-ray.yml.j2
cluster_name: {{cluster_name_on_cloud}}
# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: {{num_nodes - 1}}
upscaling_speed: {{num_nodes - 1}}
idle_timeout_minutes: 60
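# For illustration: with num_nodes set to 3, the two lines above would render
# as `max_workers: 2` and `upscaling_speed: 2` (hypothetical values; the real
# numbers come from the task's resource request).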
# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to request them to be
# created by your cluster administrator.
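# A quick way to check whether your current context has enough permissions is
# `kubectl auth can-i create serviceaccounts -n <namespace>` (illustrative
# check only; repeat for the other resource kinds listed below as needed).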
provider:
  type: external
  module: sky.provision.kubernetes
  region: kubernetes
  namespace: {{k8s_namespace}}
  # The kubecontext used to connect to the Kubernetes cluster.
  {% if k8s_context is not none %}
  context: {{k8s_context}}
  {% endif %}
  # This should be one of KubernetesPortMode
  port_mode: {{k8s_port_mode}}
  # The networking mode used to ssh to pods. One of KubernetesNetworkingMode.
  networking_mode: {{k8s_networking_mode}}
  # We use internal IPs since we set up a port-forward between the kubernetes
  # cluster and the local machine, or directly use NodePort to reach the
  # head node.
  use_internal_ips: true
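  # For example, the head pod's SSH port can be reached locally with something
  # like `kubectl port-forward svc/{{cluster_name_on_cloud}}-head-ssh 10022:22`
  # (illustrative; 10022 is an arbitrary local port, and SkyPilot normally sets
  # up this forwarding automatically).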
  timeout: {{timeout}}
  ssh_jump_image: {{k8s_ssh_jump_image}}
  # Namespace used to host SkyPilot system components, such as fuse device
  # manager.
  skypilot_system_namespace: {{k8s_skypilot_system_namespace}}
  # Boolean flag to indicate if the cluster requires FUSE mounting.
  # Used to set up the necessary permissions and sidecars.
  fuse_device_required: {{k8s_fuse_device_required}}
  # ServiceAccount created by the autoscaler for the head node pod that it
  # runs in. If this field isn't provided, the head pod config below must
  # contain a user-created service account with the proper permissions.
  autoscaler_service_account:
    apiVersion: v1
    kind: ServiceAccount
    metadata:
      labels:
        parent: skypilot
      name: skypilot-service-account
  # Role created by the autoscaler for the head node pod that it runs in.
  # If this field isn't provided, the role referenced in
  # autoscaler_role_binding must exist and have at least these permissions.
  autoscaler_role:
    kind: Role
    apiVersion: rbac.authorization.k8s.io/v1
    metadata:
      labels:
        parent: skypilot
      name: skypilot-service-account-role
    # TODO(romilb): This is a very permissive role - gives all access in the
    # namespace. We should restrict this. For reference, this is required
    # for autodown and creating more SkyPilot clusters from within the pod.
    rules:
    - apiGroups: ["*"]
      resources: ["*"]
      verbs: ["*"]
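    # Illustrative only: a narrower starting point could scope the rules to the
    # resource kinds the pod actually touches, e.g.
    #   - apiGroups: [""]
    #     resources: ["pods", "pods/exec", "services", "secrets"]
    #     verbs: ["get", "list", "watch", "create", "delete"]
    # (hypothetical rule set; the exact permissions needed depend on the
    # features used, such as autodown).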
  # RoleBinding created by the autoscaler for the head node pod that it runs
  # in. If this field isn't provided, the head pod config below must contain
  # a user-created service account with the proper permissions.
  autoscaler_role_binding:
    apiVersion: rbac.authorization.k8s.io/v1
    kind: RoleBinding
    metadata:
      labels:
        parent: skypilot
      name: skypilot-service-account-role-binding
    subjects:
    - kind: ServiceAccount
      name: skypilot-service-account
    roleRef:
      kind: Role
      name: skypilot-service-account-role
      apiGroup: rbac.authorization.k8s.io
  # Role for the skypilot-system namespace to create FUSE device manager and
  # any other system components.
  autoscaler_skypilot_system_role:
    kind: Role
    apiVersion: rbac.authorization.k8s.io/v1
    metadata:
      namespace: {{k8s_skypilot_system_namespace}}
      labels:
        parent: skypilot
      name: skypilot-system-service-account-role
    rules:
    - apiGroups: ["*"]
      resources: ["*"]
      verbs: ["*"]
  # RoleBinding for skypilot-system namespace role
  autoscaler_skypilot_system_role_binding:
    apiVersion: rbac.authorization.k8s.io/v1
    kind: RoleBinding
    metadata:
      namespace: {{k8s_skypilot_system_namespace}}
      labels:
        parent: skypilot
      name: skypilot-system-service-account-role-binding
    subjects:
    - kind: ServiceAccount
      name: skypilot-service-account
    roleRef:
      kind: Role
      name: skypilot-system-service-account-role
      apiGroup: rbac.authorization.k8s.io
  # Role to access ingress services for fetching IP
  autoscaler_ingress_role:
    kind: Role
    apiVersion: rbac.authorization.k8s.io/v1
    metadata:
      namespace: ingress-nginx
      name: skypilot-service-account-ingress-role
      labels:
        parent: skypilot
    rules:
    - apiGroups: [ "" ]
      resources: [ "services" ]
      verbs: [ "list", "get", "watch" ]
    - apiGroups: [ "rbac.authorization.k8s.io" ]
      resources: [ "roles", "rolebindings" ]
      verbs: [ "get", "list", "watch", "patch" ]
  # RoleBinding to access ingress services for fetching IP
  autoscaler_ingress_role_binding:
    apiVersion: rbac.authorization.k8s.io/v1
    kind: RoleBinding
    metadata:
      namespace: ingress-nginx
      name: skypilot-service-account-ingress-role-binding
      labels:
        parent: skypilot
    subjects:
    - kind: ServiceAccount
      name: skypilot-service-account
    roleRef:
      kind: Role
      name: skypilot-service-account-ingress-role
      apiGroup: rbac.authorization.k8s.io
  # In addition to a role binding, we also need a cluster role binding to give
  # SkyPilot access to cluster-wide resources such as nodes to get
  # node resources.
  autoscaler_cluster_role:
    kind: ClusterRole
    apiVersion: rbac.authorization.k8s.io/v1
    metadata:
      labels:
        parent: skypilot
      name: skypilot-service-account-cluster-role
    rules:
    - apiGroups: [ "" ]
      resources: [ "nodes" ] # Required for getting node resources.
      verbs: [ "get", "list", "watch" ]
    - apiGroups: [ "" ]
      resources: [ "namespaces" ] # Required for creating the skypilot-system namespace, which hosts the fuse device manager.
      verbs: [ "get", "list", "watch", "create" ]
    - apiGroups: [ "rbac.authorization.k8s.io" ]
      resources: [ "clusterroles", "clusterrolebindings" ] # Required for launching more SkyPilot clusters from within the pod.
      verbs: [ "get", "list", "watch", "create", "update", "patch", "delete" ]
    - apiGroups: [ "node.k8s.io" ]
      resources: [ "runtimeclasses" ] # Required for autodetecting the runtime class of the nodes.
      verbs: [ "get", "list", "watch" ]
    - apiGroups: [ "networking.k8s.io" ] # Required for exposing services.
      resources: [ "ingressclasses" ]
      verbs: [ "get", "list", "watch" ]
  # Bind cluster role to the service account
  autoscaler_cluster_role_binding:
    apiVersion: rbac.authorization.k8s.io/v1
    kind: ClusterRoleBinding
    metadata:
      labels:
        parent: skypilot
      name: skypilot-service-account-cluster-role-binding
    subjects:
    - kind: ServiceAccount
      name: skypilot-service-account
    roleRef:
      kind: ClusterRole
      name: skypilot-service-account-cluster-role
      apiGroup: rbac.authorization.k8s.io
  services:
  # Service to expose the head node pod's SSH port.
  - apiVersion: v1
    kind: Service
    metadata:
      labels:
        parent: skypilot
        skypilot-cluster: {{cluster_name_on_cloud}}
        skypilot-user: {{ user }}
      name: {{cluster_name_on_cloud}}-head-ssh
    spec:
      selector:
        component: {{cluster_name_on_cloud}}-head
      ports:
      - protocol: TCP
        port: 22
        targetPort: 22
  # Service that maps to the head node of the Ray cluster.
  - apiVersion: v1
    kind: Service
    metadata:
      labels:
        parent: skypilot
        skypilot-cluster: {{cluster_name_on_cloud}}
        skypilot-user: {{ user }}
      # NOTE: If you're running multiple Ray clusters with services
      # on one Kubernetes cluster, they must have unique service
      # names.
      name: {{cluster_name_on_cloud}}-head
    spec:
      # This selector must match the head node pod's selector below.
      selector:
        component: {{cluster_name_on_cloud}}-head
      ports:
      - name: client
        protocol: TCP
        port: 10001
        targetPort: 10001
      - name: dashboard
        protocol: TCP
        port: 8265
        targetPort: 8265
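  # For example, once the cluster is up, the Ray dashboard can be reached with
  # `kubectl port-forward svc/{{cluster_name_on_cloud}}-head 8265:8265` and then
  # opening http://localhost:8265 (illustrative command; port 8265 matches the
  # dashboard port exposed above).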
# Specify the pod type for the ray head node (as configured below).
head_node_type: ray_head_default
# Specify the allowed pod types for this ray cluster and the resources they provide.
available_node_types:
  ray_head_default:
    node_config:
      apiVersion: v1
      kind: Pod
      metadata:
        # name will be filled in by the provisioner.
        # The head node name will be {{cluster_name_on_cloud}}-head, which will match the head node service selector above if a head node
        # service is required.
        labels:
          parent: skypilot
          # component will be set for the head node pod to be the same as the head node service selector above if a head node
          # service is required.
          skypilot-cluster: {{cluster_name_on_cloud}}
          # Identifies the SSH jump pod used by this pod. Used in lifecycle management of the ssh jump pod.
          skypilot-ssh-jump: {{k8s_ssh_jump_name}}
          skypilot-user: {{ user }}
          # Custom tags for the pods
          {%- for label_key, label_value in labels.items() %}
          {{ label_key }}: {{ label_value|tojson }}
          {%- endfor %}
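          # For illustration: a custom label such as {"mylabel": "myvalue"} in
          # `labels` would render here as `mylabel: "myvalue"` (hypothetical
          # key/value; `tojson` just adds the quoting).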
        {% if k8s_fuse_device_required %}
        annotations:
          # Required for FUSE mounting to access /dev/fuse
          container.apparmor.security.beta.kubernetes.io/ray-node: unconfined
        {% endif %}
      spec:
        # serviceAccountName: skypilot-service-account
        serviceAccountName: {{k8s_service_account_name}}
        automountServiceAccountToken: {{k8s_automount_sa_token}}
        restartPolicy: Never
        # Add a node selector if GPUs or spot nodes are requested:
        {% if (k8s_acc_label_key is not none and k8s_acc_label_value is not none) or (k8s_spot_label_key is not none) %}
        nodeSelector:
          {% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %}
          {{k8s_acc_label_key}}: {{k8s_acc_label_value}}
          {% endif %}
          {% if k8s_spot_label_key is not none %}
          {{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}}
          {% endif %}
        {% endif %}
        {% if k8s_spot_label_key is not none %}
        tolerations:
        - key: {{k8s_spot_label_key}}
          operator: Equal
          value: {{k8s_spot_label_value|tojson}}
          effect: NoSchedule
        {% endif %}
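        # For illustration, with a hypothetical accelerator label
        # `skypilot.co/accelerator: v100`, the block above would render as
        #   nodeSelector:
        #     skypilot.co/accelerator: v100
        # (label key/value are examples only; the real ones are detected from
        # the cluster's nodes).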
        # The dshm volume below allocates shared memory for Ray to use for its
        # plasma object store. If you do not provide this, Ray will fall back to
        # /tmp, which causes slowdowns if it is not a shared memory volume.
        volumes:
        - name: secret-volume
          secret:
            secretName: {{k8s_ssh_key_secret_name}}
        - name: dshm
          emptyDir:
            medium: Memory
        - name: dev-fuse # Required for fuse mounting
          hostPath:
            path: /dev/fuse
        containers:
        - name: ray-node
          imagePullPolicy: IfNotPresent
          image: {{image_id}}
          # Do not change this command - it keeps the pod alive until it is
          # explicitly killed.
          command: ["/bin/bash", "-c", "--"]
          args:
          - |
            # Tails a file and checks every 5 seconds for open file handles
            # with write access; stops tailing once none exist.
            monitor_file() {
              tail -f $file &
              TAIL_PID=$!
              while kill -0 $TAIL_PID 2> /dev/null; do
                # Only two PIDs should be accessing the file:
                # the log appender and the log tailer.
                if [ $(lsof -w $file | wc -l) -lt 3 ]; then
                  kill $TAIL_PID
                  break
                fi
                # Sleep for 5 seconds before checking again. Do not make this
                # too short, as it will consume CPU; too long will cause the
                # file to be closed too late, keeping the pod alive.
                sleep 5
              done
            }
            log_tail() {
              FILE_PATTERN="~/sky_logs/*/tasks/*.log"
              while ! ls $(eval echo $FILE_PATTERN) 1> /dev/null 2>&1; do
                sleep 1
              done
              # Keep track of already monitored files
              already_monitored=""
              # Infinite loop to continuously check for new files
              while true; do
                for file in $(eval echo $FILE_PATTERN); do
                  if echo $already_monitored | grep -q $file; then
                    # File is already being monitored
                    continue
                  fi
                  # Monitor the new file
                  monitor_file $file &
                  already_monitored="${already_monitored} ${file}"
                done
                sleep 0.1
              done
            }
            trap : TERM INT; log_tail || sleep infinity & wait
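          # The last script line above installs no-op handlers for TERM/INT
          # (`trap : TERM INT`), starts `log_tail` in the background (falling
          # back to `sleep infinity` if it exits with a non-zero status), and
          # `wait`s on that background job so the container keeps running.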
          ports:
          - containerPort: 22 # Used for SSH
          - containerPort: {{ray_port}} # Redis port
          - containerPort: 10001 # Used by Ray Client
          - containerPort: {{ray_dashboard_port}} # Used by Ray Dashboard
          volumeMounts:
          - name: secret-volume
            readOnly: true
            mountPath: "/etc/secret-volume"
          # This volume allocates shared memory for Ray to use for its plasma
          # object store. If you do not provide this, Ray will fall back to
          # /tmp, which causes slowdowns if it is not a shared memory volume.
          - mountPath: /dev/shm
            name: dshm
          {% if k8s_fuse_device_required %}
          securityContext:
            capabilities:
              add:
              - "SYS_ADMIN"
          {% endif %}
          resources:
            requests:
              cpu: {{cpus}}
              memory: {{memory}}G
              nvidia.com/gpu: {{accelerator_count}}
              {% if k8s_fuse_device_required %}
              # Kubernetes resource exposed by the fuse device manager
              # https://gitlab.com/arm-research/smarter/smarter-device-manager
              smarter-devices/fuse: "1"
              {% endif %}
            limits:
              nvidia.com/gpu: {{accelerator_count}} # Limits need to be defined for GPU requests
              {% if k8s_fuse_device_required %}
              smarter-devices/fuse: "1"
              {% endif %}
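            # For illustration, a hypothetical request of 4 CPUs, 16 GB of
            # memory and 1 GPU would render the requests above as `cpu: 4`,
            # `memory: 16G` and `nvidia.com/gpu: 1` (example numbers only).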
setup_commands:
  # Disable `unattended-upgrades` to prevent apt-get from hanging. It should run at the beginning, before other processes start, to avoid being blocked. (This is a temporary fix.)
  # Create the ~/.ssh/config file in case it does not exist in the image.
  # Line 'sudo bash ..': set the ulimit as suggested by the Ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
  # Line 'sudo grep ..': set the number of threads per process to unlimited, to avoid `ray job submit` getting stuck when the number of running Ray jobs increases.
  # Line 'mkdir -p ..': disable host key checking for SSH.
  # Line 'python3 -c ..': patch the buggy ray files and enable the `-o allow_other` option for `goofys`.
  - sudo DEBIAN_FRONTEND=noninteractive apt install lsof gcc patch pciutils rsync fuse curl -y;
    mkdir -p ~/.ssh; touch ~/.ssh/config;
    {%- for initial_setup_command in initial_setup_commands %}
    {{ initial_setup_command }}
    {%- endfor %}
    {{ conda_installation_commands }}
    {{ ray_skypilot_installation_commands }}
    sudo touch ~/.sudo_as_admin_successful;
    sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
    [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
# Format: `REMOTE_PATH : LOCAL_PATH`
file_mounts: {
  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
{%- for remote_path, local_path in credentials.items() %}
  "{{remote_path}}": "{{local_path}}",
{%- endfor %}
}
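# For example, a rendered file_mounts entry might look like
# "~/.sky/sky_ray.yml": "/home/user/.sky/generated/sky_ray.yml" (hypothetical
# paths; the real values are filled in by SkyPilot at launch time).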
auth:
  ssh_user: sky
  ssh_private_key: {{ssh_private_key}}
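# For illustration: with the settings above, the provisioner logs in over SSH
# roughly as `ssh -i <ssh_private_key> sky@<head pod address>` (schematic
# command only; the actual connection goes through the SSH service and
# port-forward machinery configured above).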