forked from delock/ds_allreduce_bench
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_comm_bench.sh
executable file
·41 lines (31 loc) · 2.05 KB
/
run_comm_bench.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/bin/bash
# Environment setup for the DeepSpeed / oneCCL communication benchmark.
#
# Preloads libiomp5.so and libtcmalloc.so from the conda package cache and
# exports the oneCCL / libfabric settings used by the launch command later in
# this script. Expects CONDA_PREFIX to be set (an activated conda environment).

#export KMP_BLOCKTIME=1
#export KMP_SETTINGS=1

# Both preload paths are built from CONDA_PREFIX; if it is empty they resolve
# under "/" and will almost certainly not exist. Warn, but keep going so the
# script behaves as before.
if [[ -z "${CONDA_PREFIX:-}" ]]; then
  printf 'warning: CONDA_PREFIX is not set; LD_PRELOAD libraries will not be found\n' >&2
fi

# export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libiomp5.so
# ${LD_PRELOAD:+...} adds the ":" separator only when LD_PRELOAD already has a
# value, so an initially empty LD_PRELOAD does not get an empty leading entry.
export LD_PRELOAD="${LD_PRELOAD:+${LD_PRELOAD}:}${CONDA_PREFIX:-}/../../pkgs/intel-openmp-2022.0.1-h06a4308_3633/lib/libiomp5.so"
export LD_PRELOAD="${LD_PRELOAD}:${CONDA_PREFIX:-}/../../pkgs/gperftools-2.10-h09c0d1c_0/lib/libtcmalloc.so"

#export CCL_ALLREDUCE=recursive_doubling
export CCL_PROCESS_LAUNCHER=none
export FI_PROVIDER=tcp
# Transport selection: "ofi" is the value in effect. The "mpi" setting used to
# be exported right before it and was always overridden, so it is kept only as
# a commented-out alternative.
#export CCL_ATL_TRANSPORT=mpi
export CCL_ATL_TRANSPORT=ofi
export CCL_ATL_SHM=1
#export CCL_ITT_LEVEL=1
export CCL_WORKER_COUNT=1

# Earlier CPU run (kept for reference):
#export CCL_WORKER_AFFINITY=10,22,34,46,58,70,82,94
#deepspeed --force_multi --hostfile hostfile.txt --launcher=impi --bind_cores_to_rank --bind_core_list 0-9,12-21,24-33,36-45,48-57,60-69,72-81,84-93 run_generation_with_deepspeed_profile.py --device cpu --dtype bfloat16 --num-iter 10 --num-warmup 4 -m facebook/opt-350m --input-tokens 22 --max-new-tokens 100 --benchmark --token-latency --greedy
# ---- Accelerator environment ------------------------------------------------
#PSM3_DEVICES=self,nic
# IPEX_ROOT points at the intel_extension_for_pytorch package inside the
# python3.9 site-packages of the active conda environment.
export DS_ACCELERATOR=xpu \
       CCL_ZE_IPC_EXCHANGE=sockets \
       IPEX_ROOT="${CONDA_PREFIX}/lib/python3.9/site-packages/intel_extension_for_pytorch"

# ---- Alternative: 8 ranks (disabled) ----------------------------------------
# export CCL_WORKER_AFFINITY=10,22,34,46,58,70,82,94
# deepspeed --bind_cores_to_rank --bind_core_list 0-9,12-21,24-33,36-45,48-57,60-69,72-81,84-93 ds_comm_bench.py

# ---- Active run ---------------------------------------------------------------
# Twelve two-core ranges are handed to --bind_core_list, with twelve matching
# CCL_WORKER_AFFINITY entries (the core right after each range).
export CCL_WORKER_AFFINITY="3,11,19,27,35,43,54,62,70,78,86,94"
deepspeed \
  --bind_cores_to_rank \
  --bind_core_list "1-2,9-10,17-18,25-26,33-34,41-42,52-53,60-61,68-69,76-77,84-85,92-93" \
  ds_comm_bench.py

# ---- Other disabled variants --------------------------------------------------
# Two accelerators:
#export CCL_WORKER_AFFINITY=10,58
#deepspeed --num_accelerator 2 --bind_cores_to_rank --bind_core_list 0-9,12-21,24-33,36-45,48-57,60-69,72-81,84-93 ds_comm_bench.py
# 8 ranks, MPI launcher:
#deepspeed --force_multi --hostfile=hostfile.txt --launcher impi --bind_cores_to_rank --bind_core_list 0-9,12-21,24-33,36-45,48-57,60-69,72-81,84-93 ds_comm_bench.py
#deepspeed --force_multi --hostfile=hostfile.txt --launcher impi --bind_cores_to_rank ds_comm_bench.py
# Native oneCCL benchmark:
#mpirun -ppn 8 ~/oneCCL/build/_install/examples/benchmark/benchmark -i 17920 -y 5120 -d bfloat16