#!/bin/bash
# Launch a Ray cluster on LSF and run a user workload on it.
# Example submission:
#   bsub -n 2 -R "span[ptile=1] rusage[mem=4GB]" -o std.out%J -e std.out%J bash -i ray_launch_cluster.sh
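# Options (the example values below are illustrative, not defaults):
#   -c  workload to run once the cluster is up, e.g. "python sample_code_for_ray.py"
#   -n  conda environment with ray installed, e.g. "ray_env"
#   -m  object store memory in bytes passed to each node, e.g. 8000000000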
# Pick a random free port, biased toward the higher port range (20000-59999)
function getfreeport()
{
    CHECK="in-use"
    while [[ ! -z $CHECK ]]; do
        port=$(( ( RANDOM % 40000 ) + 20000 ))
        # netstat shows nothing for the candidate port when it is free
        CHECK=$(netstat -a | grep $port)
    done
    echo $port
}
while getopts ":c:n:m:" option; do
    case "${option}" in
        c) user_command=${OPTARG}
           ;;
        n) conda_env=${OPTARG}
           ;;
        m) object_store_mem=${OPTARG}
           ;;
        *) echo "Usage: $0 [-c user_command] [-n conda_env] [-m object_store_memory_bytes]"
           ;;
    esac
done
# Either launch the script with "bash -i" so an already-activated conda env is inherited,
# or pass an env name with -n and activate it below.
if [ -z "$conda_env" ]
then
    echo "No conda env provided, assuming ray is already available in the current environment"
else
    eval "$(conda shell.bash hook)"
    conda activate "$conda_env"
fi
# LSB_DJOB_HOSTFILE lists one host name per allocated slot; uniq collapses
# repeats so each host appears once
hosts=()
for host in $(cat "$LSB_DJOB_HOSTFILE" | uniq)
do
    echo "Adding host: $host"
    hosts+=("$host")
done
echo "The host list is: ${hosts[*]}"
port=$(getfreeport)
echo "Head node will use port: $port"
export port
dashboard_port=$(getfreeport)
echo "Dashboard will use port: $dashboard_port"
echo "Num cpus per host is:" $LSB_MCPU_HOSTS
IFS=' ' read -r -a array <<< "$LSB_MCPU_HOSTS"
declare -A associative
i=0
len=${#array[@]}
while [ $i -lt $len ]
do
key=${array[$i]}
value=${array[$i+1]}
associative[$key]+=$value
i=$((i=i+2))
done
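# Count GPUs from CUDA_VISIBLE_DEVICES (a comma-separated device list);
# the same count is passed to the head node and every worker.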
num_gpu=0
if [ -z "$CUDA_VISIBLE_DEVICES" ]
then
    num_gpu=0
else
    num_gpu=$(echo $CUDA_VISIBLE_DEVICES | tr , ' ' | wc -w)
fi
# Assumption: there is a single head node and the remaining hosts
# connect to it as workers
head_node=${hosts[0]}
export head_node
echo "Starting ray head node on: $head_node"
if [ -z "$object_store_mem" ]
then
    echo "Using the default object store memory of 4GB; make sure each host has more than 4GB of memory"
    object_store_mem=4000000000
else
    echo "The object store memory in bytes is: $object_store_mem"
fi
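# blaunch runs a command on a named host inside this LSF job's allocation;
# the head node is started on the first host in the list.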
num_cpu_for_head=${associative[$head_node]}
command_launch="blaunch -z $head_node ray start --head --port $port --dashboard-port $dashboard_port --num-cpus $num_cpu_for_head --num-gpus $num_gpu --object-store-memory $object_store_mem"
$command_launch &
sleep 20
# Poll until the head node reports the cluster as up
command_check_up="ray status --address $head_node:$port"
while ! $command_check_up
do
    sleep 3
done
workers=("${hosts[@]:1}")
echo "Adding the workers to head node: ${workers[*]}"
# Start ray on each worker node and connect it to the head node
for host in "${workers[@]}"
do
    echo "Starting worker on: $host connecting to head node: $head_node"
    sleep 10
    num_cpu=${associative[$host]}
    command_for_worker="blaunch -z $host ray start --address $head_node:$port --num-cpus $num_cpu --num-gpus $num_gpu --object-store-memory $object_store_mem"
    $command_for_worker &
    sleep 10
    # Poll until the worker has joined the cluster
    command_check_up_worker="blaunch -z $host ray status --address $head_node:$port"
    while ! $command_check_up_worker
    do
        sleep 3
    done
done
# Run the user workload passed with -c, e.g. "python sample_code_for_ray.py"
echo "Running user workload: $user_command"
$user_command
status=$?
if [ "$status" -ne 0 ]; then
    echo "Failure: $status"
    exit $status
else
    echo "Done"
    echo "Shutting down the job"
    bkill $LSB_JOBID
fi