Add comment on #GPUs. Use user-specific temporary directory.
takaomoriyama committed Feb 6, 2023
1 parent fcdf7c0 commit 44c9d38
Showing 1 changed file with 22 additions and 7 deletions.
29 changes: 22 additions & 7 deletions ray_launch_cluster.sh
100755 → 100644
@@ -1,7 +1,11 @@
#!/bin/bash
#Run ray on LSF.
#eg:bsub -n 2 -R "span[ptile=1] rusage[mem=4GB]" -o std.out%J -e std.out%J bash -i ray_launch_cluster.sh

#Examples
# bsub -n 2 -R "span[ptile=1] rusage[mem=4GB]" -gpu "num=2" -o std.out%J -e std.out%J ray_launch_cluster.sh -n conda_env -c "workload args..."
# Run a workload on two nodes. Each node has a single core and 2 GPUs. Nodes are placed on separate hosts.
# bsub -n 4 -R "affinity[core(7,same=socket)]" -gpu "num=2/task" -o std.out%J -e std.out%J ray_launch_cluster.sh -n conda_env -c "workload args..."
# Run a workload on 4 nodes. Each node has 7 cores and 2 GPUs.
# "/task" for GPU option is necessary because multiple nodes may run on a same host. Otherwise 2 GPUs on a host will be shared by all nodes (tasks) on the host.
echo "LSB_MCPU_HOSTS=$LSB_MCPU_HOSTS"
echo "---- LSB_AFFINITY_HOSTFILE=$LSB_AFFINITY_HOSTFILE"
cat $LSB_AFFINITY_HOSTFILE
@@ -10,6 +14,11 @@ echo "---- LSB_DJOB_HOSTFILE=$LSB_DJOB_HOSTFILE"
cat $LSB_DJOB_HOSTFILE
echo "---- End of LSB_DJOB_HOSTFILE"

# Use a user-specific temporary folder in multi-tenant environments
export RAY_TMPDIR="/tmp/ray-$USER"
echo "RAY_TMPDIR=$RAY_TMPDIR"
mkdir -p "$RAY_TMPDIR"
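Since /tmp is shared between tenants, it may also be worth locking the directory down; a minimal sketch, assuming the installed Ray version honors RAY_TMPDIR (the chmod and the explicit --temp-dir fallback are suggestions, not part of this commit):

# Hypothetical hardening (not in this commit): keep other tenants out of the dir.
chmod 700 "$RAY_TMPDIR"
# If a Ray version ignores RAY_TMPDIR, the same path can be passed explicitly:
#   ray start --head --temp-dir "$RAY_TMPDIR" ...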

# Bias port selection toward the higher range
function getfreeport()
{
@@ -67,9 +76,15 @@ export port
dashboard_port=$(getfreeport)
echo "Dashboard will use port: $dashboard_port"

# Compute the number of cores allocated to each host
# Format of each line in file $LSB_AFFINITY_HOSTFILE:
# host_name cpu_id_list NUMA_node_id_list memory_policy
# cpu_id_list is a comma-separated list of numbers.
# host_name core_id_list NUMA_node_id_list memory_policy
# core_id_list is a comma-separated list of core IDs, e.g.
# host1 1,2,3,4,5,6,7
# host2 0,2,3,4,6,7,8
# host2 19,21,22,23,24,26,27
# host2 28,29,37,41,48,49,50
# First, count the number of cores on each line (slot), then sum the counts for the same host.
declare -A associative
while read -a line
do
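The loop body is collapsed at this point in the view. A minimal sketch of the counting logic the comments above describe, using illustrative variable names rather than the script's actual ones:

# Hypothetical sketch of the collapsed body, assuming the hostfile format above.
declare -A cores_per_host
while read -r host core_list rest
do
    [ -z "$host" ] && continue
    # Split the comma-separated core IDs and count them for this slot.
    IFS=',' read -ra core_ids <<< "$core_list"
    # Accumulate the per-slot count into the per-host total.
    cores_per_host[$host]=$(( ${cores_per_host[$host]:-0} + ${#core_ids[@]} ))
done < "$LSB_AFFINITY_HOSTFILE"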
@@ -97,11 +112,11 @@ then
echo "using default object store mem of 4GB make sure your cluster has mem greater than 4GB"
object_store_mem=4000000000
else
echo "The object store memory in bytes is: " "$object_store_mem"
echo "The object store memory in bytes is: $object_store_mem"
fi
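As an alternative to the fixed 4 GB fallback, the default could be derived from the memory actually available on the host; a sketch, purely as a suggestion and not part of the script:

# Hypothetical alternative (not in the script): size the store from available RAM.
mem_avail_kb=$(awk '/MemAvailable/ {print $2}' /proc/meminfo)
# Use 30% of available memory, converted from kB to bytes.
object_store_mem=$(( mem_avail_kb * 1024 * 30 / 100 ))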

num_cpu_for_head=${associative[$head_node]}

# The number of GPUs available on each host is detected by the "ray start" command
command_launch="blaunch -z ${hosts[0]} ray start --head --port $port --dashboard-port $dashboard_port --num-cpus $num_cpu_for_head --object-store-memory $object_store_mem"

$command_launch &
@@ -121,7 +136,7 @@ done

workers=("${hosts[@]:1}")

echo "adding the workers to head node: " "${workers[*]}"
echo "adding the workers to head node: ${workers[*]}"
#run ray on worker nodes and connect to head
for host in "${workers[@]}"
do
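The body of this loop is collapsed in the view. A sketch of what joining each worker to the head could look like, assuming ray start's --address flag and the per-host core counts computed earlier (illustrative, not the repository's exact code):

    # Hypothetical sketch of the collapsed body: start a worker and point it at the head.
    num_cpu=${associative[$host]}
    blaunch -z "$host" ray start --address "$head_node:$port" \
        --num-cpus "$num_cpu" --object-store-memory "$object_store_mem" &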
