From 2d965cdbc8351dd57efa1a52493d87c680e210b0 Mon Sep 17 00:00:00 2001
From: cranechu0131 <1340390339@qq.com>
Date: Wed, 30 Oct 2024 10:24:25 +0800
Subject: [PATCH 1/7] feat: change oneccl

---
 .../example/GPU/Deepspeed-AutoTP/README.md    |  3 ++-
 .../GPU/Deepspeed-AutoTP/deepspeed_autotp.py  |  8 +++++---
 .../run_llama2_70b_pvc_1550_1_card.sh         | 19 ++++++++++++-------
 .../run_mistral_7b_instruct_flex_2_card.sh    |  3 ++-
 .../run_qwen_14b_arc_2_card.sh                |  3 ++-
 .../run_vicuna_33b_arc_2_card.sh              |  3 ++-
 6 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/README.md b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
index 515a74fee8c..7e54200e375 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/README.md
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
@@ -20,7 +20,8 @@ conda activate llm
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 pip install transformers==4.37.0
-pip install oneccl_bind_pt==2.1.100 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+wget https://sourceforge.net/projects/oneccl-wks/files/2024.0.0.5.1-release/oneccl_wks_installer_2024.0.0.5.1.sh
+bash oneccl_wks_installer_2024.0.0.5.1.sh
 # configures OneAPI environment variables
 source /opt/intel/oneapi/setvars.sh
 pip install git+https://github.com/microsoft/DeepSpeed.git@ed8aed5
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
index e2ebbaf4f68..ccf37507c09 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -77,7 +77,7 @@ def get_int_from_env(env_keys, default):
                                                  torch_dtype=torch.float16,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-
+	
     model = deepspeed.init_inference(
         model,
         mp_size=world_size,
@@ -104,7 +104,8 @@ def get_int_from_env(env_keys, default):
     deepspeed.comm.comm.cdb = None
     from deepspeed.comm.comm import init_distributed
     init_distributed()
-
+    from ipex_llm.utils import BenchmarkWrapper
+    model = BenchmarkWrapper(model)
     print(model)
 
     # Load tokenizer
@@ -135,7 +136,8 @@ def get_int_from_env(env_keys, default):
             actual_output_len = output.shape[1] - input_ids.shape[1]
             output_str = tokenizer.decode(output[0], skip_special_tokens=True)
             avg_time = (end - st) / actual_output_len * 1000
-            print(f'Inference time of generating {actual_output_len} tokens: {end-st} s, average token latency is {avg_time} ms/token.')
+            print(f'Inference time of generating {actual_output_len} tokens: {end-st} s,first token cost {model.first_cost} s, rest tokens average cost {model.rest_cost_mean} s')
+
             print('-'*20, 'Prompt', '-'*20)
             print(prompt)
             print('-'*20, 'Output', '-'*20)
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
index 37e53545545..df6f985b995 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
@@ -14,8 +14,9 @@
 # limitations under the License.
 #
 
-export ZE_AFFINITY_MASK="0,1" # specify the used GPU
-NUM_GPUS=2 # number of used GPU
+## Validated BKC for Qwen1.5-14B-Chat on 2 ARC with
+## Ubuntu 22.04.4, kernel 6.5.0-27-generic, level-zero 1.14.0, NEO(compute runtime) 24.09.28717.12
+
 export MASTER_ADDR=127.0.0.1
 export FI_PROVIDER=tcp
 export CCL_ATL_TRANSPORT=ofi
@@ -24,11 +25,15 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force
+source /opt/intel/1ccl-wks/setvars.sh
 
-export OMP_NUM_THREADS=$((56/$NUM_GPUS))
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
-export TORCH_LLM_ALLREDUCE=1
+NUM_GPUS=2 # number of used GPU
+export USE_XETLA=OFF
+if grep -q "Core" /proc/cpuinfo; then
+    export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
+fi
+export TORCH_LLM_ALLREDUCE=0 # Different from PVC
 export BIGDL_IMPORT_IPEX=0
 mpirun -np $NUM_GPUS --prepend-rank \
-  python deepspeed_autotp.py --repo-id-or-model-path 'meta-llama/Llama-2-70b-chat-hf' --low-bit 'sym_int4'
+  python deepspeed_autotp.py --repo-id-or-model-path '/mnt/disk1/models/Llama-2-13b-chat-hf/' --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
index 177ed5d0625..f0a0e2042f8 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
@@ -22,7 +22,8 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force
+source /opt/intel/1ccl-wks/setvars.sh
 
 NUM_GPUS=2 # number of used GPU
 export USE_XETLA=OFF
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
index 6686d3ee295..3dca705d110 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
@@ -25,7 +25,8 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force
+source /opt/intel/1ccl-wks/setvars.sh
 
 NUM_GPUS=2 # number of used GPU
 export USE_XETLA=OFF
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
index 7cf50a5e283..21ab80a8cd4 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
@@ -22,7 +22,8 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force
+source /opt/intel/1ccl-wks/setvars.sh
 
 NUM_GPUS=2 # number of used GPU
 export USE_XETLA=OFF

From d11d88c1f8d7f2d223c0104dc9fb1349fd586972 Mon Sep 17 00:00:00 2001
From: cranechu0131 <1340390339@qq.com>
Date: Wed, 30 Oct 2024 10:26:33 +0800
Subject: [PATCH 2/7] fix: restore llama-70b

---
 .../run_llama2_70b_pvc_1550_1_card.sh         | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
index df6f985b995..37e53545545 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
@@ -14,9 +14,8 @@
 # limitations under the License.
 #
 
-## Validated BKC for Qwen1.5-14B-Chat on 2 ARC with
-## Ubuntu 22.04.4, kernel 6.5.0-27-generic, level-zero 1.14.0, NEO(compute runtime) 24.09.28717.12
-
+export ZE_AFFINITY_MASK="0,1" # specify the used GPU
+NUM_GPUS=2 # number of used GPU
 export MASTER_ADDR=127.0.0.1
 export FI_PROVIDER=tcp
 export CCL_ATL_TRANSPORT=ofi
@@ -25,15 +24,11 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-# source $basekit_root/ccl/latest/env/vars.sh --force
-source /opt/intel/1ccl-wks/setvars.sh
+source $basekit_root/ccl/latest/env/vars.sh --force
 
-NUM_GPUS=2 # number of used GPU
-export USE_XETLA=OFF
-if grep -q "Core" /proc/cpuinfo; then
-    export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
-fi
-export TORCH_LLM_ALLREDUCE=0 # Different from PVC
+export OMP_NUM_THREADS=$((56/$NUM_GPUS))
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
+export TORCH_LLM_ALLREDUCE=1
 export BIGDL_IMPORT_IPEX=0
 mpirun -np $NUM_GPUS --prepend-rank \
-  python deepspeed_autotp.py --repo-id-or-model-path '/mnt/disk1/models/Llama-2-13b-chat-hf/' --low-bit 'sym_int4'
+  python deepspeed_autotp.py --repo-id-or-model-path 'meta-llama/Llama-2-70b-chat-hf' --low-bit 'sym_int4'

From 0b2029aed15a349fe94890a07bd033b3dfb347af Mon Sep 17 00:00:00 2001
From: cranechu0131 <1340390339@qq.com>
Date: Wed, 30 Oct 2024 10:28:29 +0800
Subject: [PATCH 3/7] fix: remove tab

---
 python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
index ccf37507c09..fa4d1b3b0e7 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -77,7 +77,7 @@ def get_int_from_env(env_keys, default):
                                                  torch_dtype=torch.float16,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-	
+
     model = deepspeed.init_inference(
         model,
         mp_size=world_size,

From 46d9d51d94430a9272131786cfc3d7f168082c57 Mon Sep 17 00:00:00 2001
From: cranechu0131 <1340390339@qq.com>
Date: Wed, 30 Oct 2024 10:29:04 +0800
Subject: [PATCH 4/7] fix: remove extra blank

---
 python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
index fa4d1b3b0e7..8746cb02717 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -137,7 +137,6 @@ def get_int_from_env(env_keys, default):
             output_str = tokenizer.decode(output[0], skip_special_tokens=True)
             avg_time = (end - st) / actual_output_len * 1000
             print(f'Inference time of generating {actual_output_len} tokens: {end-st} s,first token cost {model.first_cost} s, rest tokens average cost {model.rest_cost_mean} s')
-
             print('-'*20, 'Prompt', '-'*20)
             print(prompt)
             print('-'*20, 'Output', '-'*20)

From fe2e1a9dd2ba1a0e9701e43fd8162a3f307be95a Mon Sep 17 00:00:00 2001
From: cranechu0131 <1340390339@qq.com>
Date: Wed, 30 Oct 2024 10:30:39 +0800
Subject: [PATCH 5/7] small fix

---
 .../GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
index 37e53545545..2eb2f0180d7 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
@@ -24,7 +24,8 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force
+source /opt/intel/1ccl-wks/setvars.sh
 export OMP_NUM_THREADS=$((56/$NUM_GPUS))
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=1

From 5ff4e23e12b9531885a48c561fb73485059ce455 Mon Sep 17 00:00:00 2001
From: cranechu0131 <1340390339@qq.com>
Date: Wed, 30 Oct 2024 10:32:55 +0800
Subject: [PATCH 6/7] add comments

---
 .../GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh      | 2 +-
 .../GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh | 2 +-
 .../llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh | 2 +-
 .../example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
index 2eb2f0180d7..5c3d8cacf4b 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
@@ -24,7 +24,7 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-# source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force  # deprecated: oneccl_bind_pt is replaced by the bundled oneCCL (sourced below) for better performance
 source /opt/intel/1ccl-wks/setvars.sh
 export OMP_NUM_THREADS=$((56/$NUM_GPUS))
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
index f0a0e2042f8..833b7f348fc 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
@@ -22,7 +22,7 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-# source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force  # deprecated: oneccl_bind_pt is replaced by the bundled oneCCL (sourced below) for better performance
 source /opt/intel/1ccl-wks/setvars.sh
 
 NUM_GPUS=2 # number of used GPU
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
index 3dca705d110..2535622a4db 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
@@ -25,7 +25,7 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-# source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force  # deprecated: oneccl_bind_pt is replaced by the bundled oneCCL (sourced below) for better performance
 source /opt/intel/1ccl-wks/setvars.sh
 
 NUM_GPUS=2 # number of used GPU
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
index 21ab80a8cd4..293032ab991 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
@@ -22,7 +22,7 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-# source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force  # deprecated: oneccl_bind_pt is replaced by the bundled oneCCL (sourced below) for better performance
 source /opt/intel/1ccl-wks/setvars.sh
 
 NUM_GPUS=2 # number of used GPU

From 303d5c89914e5b8590adb9dc301f033cd11310f0 Mon Sep 17 00:00:00 2001
From: cranechu0131 <1340390339@qq.com>
Date: Thu, 31 Oct 2024 09:46:56 +0800
Subject: [PATCH 7/7] fix: add a blank space

---
 python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
index 8746cb02717..8df91b12905 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -136,7 +136,7 @@ def get_int_from_env(env_keys, default):
             actual_output_len = output.shape[1] - input_ids.shape[1]
             output_str = tokenizer.decode(output[0], skip_special_tokens=True)
             avg_time = (end - st) / actual_output_len * 1000
-            print(f'Inference time of generating {actual_output_len} tokens: {end-st} s,first token cost {model.first_cost} s, rest tokens average cost {model.rest_cost_mean} s')
+            print(f'Inference time of generating {actual_output_len} tokens: {end-st} s, first token cost {model.first_cost} s, rest tokens average cost {model.rest_cost_mean} s')
             print('-'*20, 'Prompt', '-'*20)
             print(prompt)
             print('-'*20, 'Output', '-'*20)
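
Note on the BenchmarkWrapper used in PATCH 1: the patched print statement reads
model.first_cost and model.rest_cost_mean, which ipex_llm.utils.BenchmarkWrapper
records while the wrapped model generates. For readers without the ipex-llm
sources at hand, below is a minimal sketch of such a timing wrapper in Python.
It is an illustration only, not the library's actual implementation: the class
name TimingWrapper, the greedy one-token-per-step loop, and the use of
torch.xpu.synchronize() are all assumptions made for this example; only the two
field names mirror what the patch consumes.

    import time

    import torch


    class TimingWrapper:
        # Sketch of a BenchmarkWrapper-style timer: it records the latency of
        # the first generated token separately from the mean latency of the
        # remaining tokens, matching the fields the patched print statement uses.
        def __init__(self, model):
            self.model = model
            self.first_cost = None      # seconds spent on the first new token
            self.rest_cost_mean = None  # mean seconds per each later token

        def __getattr__(self, name):
            # Delegate any other attribute lookup (config, device, ...) to the
            # wrapped model so the wrapper stays a drop-in replacement.
            return getattr(self.model, name)

        @torch.no_grad()
        def generate(self, input_ids, max_new_tokens=32, **kwargs):
            # Greedy decoding, one token per forward pass, so that every step
            # can be timed individually.
            timings = []
            output = input_ids
            for _ in range(max_new_tokens):
                start = time.perf_counter()
                logits = self.model(output).logits
                next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
                if hasattr(torch, "xpu") and output.device.type == "xpu":
                    torch.xpu.synchronize()  # finish queued XPU work before reading the clock
                timings.append(time.perf_counter() - start)
                output = torch.cat([output, next_token], dim=-1)
            self.first_cost = timings[0]
            self.rest_cost_mean = sum(timings[1:]) / max(len(timings) - 1, 1)
            return output

Wrapping with model = TimingWrapper(model) and then calling
model.generate(input_ids, max_new_tokens=32) fills in the two fields that the
print statement added in PATCH 1 reports.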