Add comment on #GPUs. Use user-specific temporary directory.
takaomoriyama committed Feb 6, 2023
1 parent fcdf7c0 commit 44c9d38
Showing 1 changed file with 22 additions and 7 deletions.
29 changes: 22 additions & 7 deletions ray_launch_cluster.sh
100755 → 100644
@@ -1,7 +1,11 @@
#!/bin/bash
#Run ray on LSF.
#eg:bsub -n 2 -R "span[ptile=1] rusage[mem=4GB]" -o std.out%J -e std.out%J bash -i ray_launch_cluster.sh

#Examples
# bsub -n 2 -R "span[ptile=1] rusage[mem=4GB]" -gpu "num=2" -o std.out%J -e std.out%J ray_launch_cluster.sh -n conda_env -c "workload args..."
# Run a workload on two nodes. Each node has a single core and 2 GPUs. Nodes are placed on separate hosts.
# bsub -n 4 -R "affinity[core(7,same=socket)]" -gpu "num=2/task" -o std.out%J -e std.out%J ray_launch_cluster.sh -n conda_env -c "workload args..."
# Run a workload on 4 nodes. Each node has 7 cores and 2 GPUs.
# "/task" for GPU option is necessary because multiple nodes may run on a same host. Otherwise 2 GPUs on a host will be shared by all nodes (tasks) on the host.
echo "LSB_MCPU_HOSTS=$LSB_MCPU_HOSTS"
echo "---- LSB_AFFINITY_HOSTFILE=$LSB_AFFINITY_HOSTFILE"
cat $LSB_AFFINITY_HOSTFILE
@@ -10,6 +14,11 @@ echo "---- LSB_DJOB_HOSTFILE=$LSB_DJOB_HOSTFILE"
cat $LSB_DJOB_HOSTFILE
echo "---- End of LSB_DJOB_HOSTFILE"

# Use a user-specific temporary folder in multi-tenant environments
export RAY_TMPDIR="/tmp/ray-$USER"
echo "RAY_TMPDIR=$RAY_TMPDIR"
mkdir -p "$RAY_TMPDIR"
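Since /tmp is shared between tenants, it may also be worth locking the directory down; a minimal sketch, assuming the installed Ray version honors RAY_TMPDIR (the chmod and the explicit --temp-dir fallback are suggestions, not part of this commit):

# Hypothetical hardening (not in this commit): keep other tenants out of the dir.
chmod 700 "$RAY_TMPDIR"
# If a Ray version ignores RAY_TMPDIR, the same path can be passed explicitly:
#   ray start --head --temp-dir "$RAY_TMPDIR" ...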

# Bias port selection toward the higher range
function getfreeport()
{
@@ -67,9 +76,15 @@ export port
dashboard_port=$(getfreeport)
echo "Dashboard will use port: $dashboard_port"

# Compute the number of cores allocated to each host
# Format of each line in file $LSB_AFFINITY_HOSTFILE:
# host_name cpu_id_list NUMA_node_id_list memory_policy
# cpu_id_list is a comma-separated list of numbers.
# host_name core_id_list NUMA_node_id_list memory_policy
# core_id_list is a comma-separated list of core IDs, e.g.
# host1 1,2,3,4,5,6,7
# host2 0,2,3,4,6,7,8
# host2 19,21,22,23,24,26,27
# host2 28,29,37,41,48,49,50
# First, count the number of cores on each line (slot), then sum the counts for the same host.
declare -A associative
while read -a line
do
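The loop body is collapsed at this point in the view. A minimal sketch of the counting logic the comments above describe, using illustrative variable names rather than the script's actual ones:

# Hypothetical sketch of the collapsed body, assuming the hostfile format above.
declare -A cores_per_host
while read -r host core_list rest
do
    [ -z "$host" ] && continue
    # Split the comma-separated core IDs and count them for this slot.
    IFS=',' read -ra core_ids <<< "$core_list"
    # Accumulate the per-slot count into the per-host total.
    cores_per_host[$host]=$(( ${cores_per_host[$host]:-0} + ${#core_ids[@]} ))
done < "$LSB_AFFINITY_HOSTFILE"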
@@ -97,11 +112,11 @@ then
echo "using default object store mem of 4GB make sure your cluster has mem greater than 4GB"
object_store_mem=4000000000
else
echo "The object store memory in bytes is: " "$object_store_mem"
echo "The object store memory in bytes is: $object_store_mem"
fi
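As an alternative to the fixed 4 GB fallback, the default could be derived from the memory actually available on the host; a sketch, purely as a suggestion and not part of the script:

# Hypothetical alternative (not in the script): size the store from available RAM.
mem_avail_kb=$(awk '/MemAvailable/ {print $2}' /proc/meminfo)
# Use 30% of available memory, converted from kB to bytes.
object_store_mem=$(( mem_avail_kb * 1024 * 30 / 100 ))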

num_cpu_for_head=${associative[$head_node]}

# The number of GPUs available on each host is detected by the "ray start" command
command_launch="blaunch -z ${hosts[0]} ray start --head --port $port --dashboard-port $dashboard_port --num-cpus $num_cpu_for_head --object-store-memory $object_store_mem"

$command_launch &
@@ -121,7 +136,7 @@ done

workers=("${hosts[@]:1}")

echo "adding the workers to head node: " "${workers[*]}"
echo "adding the workers to head node: ${workers[*]}"
#run ray on worker nodes and connect to head
for host in "${workers[@]}"
do
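The body of this loop is collapsed in the view. A sketch of what joining each worker to the head could look like, assuming ray start's --address flag and the per-host core counts computed earlier (illustrative, not the repository's exact code):

    # Hypothetical sketch of the collapsed body: start a worker and point it at the head.
    num_cpu=${associative[$host]}
    blaunch -z "$host" ray start --address "$head_node:$port" \
        --num-cpus "$num_cpu" --object-store-memory "$object_store_mem" &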
