aw-raycluster.yaml
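# This AppWrapper wraps a small RayCluster. A minimal usage sketch, assuming the CodeFlare/MCAD
# AppWrapper controller and the KubeRay operator are already installed in the cluster:
#   kubectl apply -f aw-raycluster.yaml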
apiVersion: workload.codeflare.dev/v1beta1
kind: AppWrapper
metadata:
  name: raycluster-complete
  namespace: default
spec:
  resources:
    GenericItems:
    - replicas: 1
      custompodresources:
      # Each item in the custompodresources stanza should list the resources consumed by the pods of the target item.
      # In this example, the two items correspond to 1 Ray head pod and 1 Ray worker pod.
      - replicas: 1
        limits:
          cpu: 1
          memory: 2G
          nvidia.com/gpu: 0
        requests:
          cpu: 1
          memory: 2G
          nvidia.com/gpu: 0
      # The replica count of this item should match the number of worker pods.
      - replicas: 1
        limits:
          cpu: 2
          memory: 2G
          nvidia.com/gpu: 0
        requests:
          cpu: 2
          memory: 2G
          nvidia.com/gpu: 0
      generictemplate:
        # The resource requests and limits in this config are too small for production!
        # For examples with more realistic resource configuration, see
        # ray-cluster.complete.large.yaml and
        # ray-cluster.autoscaler.large.yaml.
        apiVersion: ray.io/v1alpha1
        kind: RayCluster
        metadata:
          labels:
            controller-tools.k8s.io: "1.0"
          # A unique identifier for the head node and workers of this cluster.
          name: raycluster-complete
        spec:
          rayVersion: '2.5.0'
          # Ray head pod configuration
          headGroupSpec:
            # Kubernetes Service type. This is an optional field; the default value is ClusterIP.
            # Refer to https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types.
            serviceType: ClusterIP
            # The `rayStartParams` are used to configure the `ray start` command.
            # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
            # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
            rayStartParams:
              dashboard-host: '0.0.0.0'
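              # Illustrative examples of other `rayStartParams` options (commented out; see the links above for the full list):
              # num-cpus: '1'
              # block: 'true'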
            # Pod template
            template:
              metadata:
                # Custom labels. NOTE: To avoid conflicts with the KubeRay operator, do not define custom labels that start with `raycluster`.
                # Refer to https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
                labels: {}
              spec:
                containers:
                - name: ray-head
                  image: quay.io/project-codeflare/ray:2.5.0-py38-cu116
                  ports:
                  - containerPort: 6379
                    name: gcs
                  - containerPort: 8265
                    name: dashboard
                  - containerPort: 10001
                    name: client
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
                  volumeMounts:
                  - mountPath: /tmp/ray
                    name: ray-logs
                  # The resource requests and limits in this config are too small for production!
                  # For an example with more realistic resource configuration, see
                  # ray-cluster.autoscaler.large.yaml.
                  # It is better to use a few large Ray pods than many small ones.
                  # For production, it is ideal to size each Ray pod to take up the
                  # entire Kubernetes node on which it is scheduled.
                  resources:
                    limits:
                      cpu: "1"
                      memory: "2G"
                    requests:
                      # For production use cases, we recommend specifying integer CPU requests and limits.
                      # We also recommend setting requests equal to limits for both CPU and memory.
                      cpu: "1"
                      memory: "2G"
                volumes:
                - name: ray-logs
                  emptyDir: {}
          workerGroupSpecs:
          # The number of pod replicas in this worker group.
          - replicas: 1
            minReplicas: 1
            maxReplicas: 1
            # Logical group name; here it is called small-group, but it can also be a functional name.
            groupName: small-group
            # If worker pods need to be added, increment the replicas.
            # If worker pods need to be removed, decrement the replicas and populate the workersToDelete list.
            # The operator will remove pods from the list until the desired number of replicas is satisfied.
            # If the difference between the current replica count and the desired replicas is greater than the
            # number of entries in workersToDelete, random worker pods will be deleted.
            #scaleStrategy:
            #  workersToDelete:
            #  - raycluster-complete-worker-small-group-bdtwh
            #  - raycluster-complete-worker-small-group-hv457
            #  - raycluster-complete-worker-small-group-k8tj7
            # The `rayStartParams` are used to configure the `ray start` command.
            # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
            # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
            rayStartParams: {}
            # Pod template
            template:
              spec:
                containers:
                - name: ray-worker
                  image: quay.io/project-codeflare/ray:2.5.0-py38-cu116
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
                  # volumeMounts are optional.
                  # Refer to https://kubernetes.io/docs/concepts/storage/volumes/
                  volumeMounts:
                  - mountPath: /tmp/ray
                    name: ray-logs
                  # The resource requests and limits in this config are too small for production!
                  # For an example with more realistic resource configuration, see
                  # ray-cluster.autoscaler.large.yaml.
                  # It is better to use a few large Ray pods than many small ones.
                  # For production, it is ideal to size each Ray pod to take up the
                  # entire Kubernetes node on which it is scheduled.
                  resources:
                    limits:
                      cpu: "2"
                      memory: "2G"
                    requests:
                      # For production use cases, we recommend specifying integer CPU requests and limits.
                      # We also recommend setting requests equal to limits for both CPU and memory.
                      cpu: "2"
                      # For production use cases, we recommend allocating at least 8 GB of memory for each Ray container.
                      memory: "2G"
                # Volumes are optional.
                # Refer to https://kubernetes.io/docs/concepts/storage/volumes/
                volumes:
                - name: ray-logs
                  emptyDir: {}
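# After applying, the AppWrapper and the wrapped RayCluster can be inspected through their custom resources, for example:
#   kubectl -n default get appwrappers
#   kubectl -n default get rayclusters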