diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/README.md b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
index 515a74fee8c..7e54200e375 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/README.md
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
@@ -20,7 +20,8 @@ conda activate llm
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 pip install transformers==4.37.0
-pip install oneccl_bind_pt==2.1.100 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+wget https://sourceforge.net/projects/oneccl-wks/files/2024.0.0.5.1-release/oneccl_wks_installer_2024.0.0.5.1.sh
+bash oneccl_wks_installer_2024.0.0.5.1.sh
 # configures OneAPI environment variables
 source /opt/intel/oneapi/setvars.sh
 pip install git+https://github.com/microsoft/DeepSpeed.git@ed8aed5
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
index e2ebbaf4f68..8df91b12905 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -104,7 +104,8 @@ def get_int_from_env(env_keys, default):
     deepspeed.comm.comm.cdb = None
     from deepspeed.comm.comm import init_distributed
     init_distributed()
-
+    from ipex_llm.utils import BenchmarkWrapper
+    model = BenchmarkWrapper(model)
     print(model)

     # Load tokenizer
@@ -135,7 +136,7 @@ def get_int_from_env(env_keys, default):
         actual_output_len = output.shape[1] - input_ids.shape[1]
         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
         avg_time = (end - st) / actual_output_len * 1000
-        print(f'Inference time of generating {actual_output_len} tokens: {end-st} s, average token latency is {avg_time} ms/token.')
+        print(f'Inference time of generating {actual_output_len} tokens: {end-st} s, first token cost {model.first_cost} s, rest tokens average cost {model.rest_cost_mean} s')
         print('-'*20, 'Prompt', '-'*20)
         print(prompt)
         print('-'*20, 'Output', '-'*20)
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
index 37e53545545..5c3d8cacf4b 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
@@ -24,7 +24,8 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force  # deprecated: oneccl_bind_pt is replaced by the internal oneCCL below for better performance
+source /opt/intel/1ccl-wks/setvars.sh

 export OMP_NUM_THREADS=$((56/$NUM_GPUS))
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
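Note on the deepspeed_autotp.py change above: BenchmarkWrapper replaces the manual average-latency math with per-token timing. Below is a minimal standalone sketch of the pattern; the model name, prompt, and generation length are illustrative placeholders, not part of this diff, while first_cost and rest_cost_mean are the same attributes the modified print statement reads.

# Sketch only: "gpt2" and the prompt are placeholder assumptions.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ipex_llm.utils import BenchmarkWrapper

model = AutoModelForCausalLM.from_pretrained("gpt2")      # placeholder model
tokenizer = AutoTokenizer.from_pretrained("gpt2")

model = BenchmarkWrapper(model)  # wraps the model and records token timings

inputs = tokenizer("Once upon a time", return_tensors="pt")
with torch.inference_mode():
    output = model.generate(inputs.input_ids, max_new_tokens=32)

# Attributes read by the new print statement in deepspeed_autotp.py
print(f"first token: {model.first_cost} s, rest mean: {model.rest_cost_mean} s")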
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
index 177ed5d0625..833b7f348fc 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
@@ -22,7 +22,8 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force  # deprecated: oneccl_bind_pt is replaced by the internal oneCCL below for better performance
+source /opt/intel/1ccl-wks/setvars.sh

 NUM_GPUS=2 # number of used GPU
 export USE_XETLA=OFF
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
index 6686d3ee295..2535622a4db 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
@@ -25,7 +25,8 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force  # deprecated: oneccl_bind_pt is replaced by the internal oneCCL below for better performance
+source /opt/intel/1ccl-wks/setvars.sh

 NUM_GPUS=2 # number of used GPU
 export USE_XETLA=OFF
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
index 7cf50a5e283..293032ab991 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
@@ -22,7 +22,8 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force  # deprecated: oneccl_bind_pt is replaced by the internal oneCCL below for better performance
+source /opt/intel/1ccl-wks/setvars.sh

 NUM_GPUS=2 # number of used GPU
 export USE_XETLA=OFF
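Taken together, the README hunk and the four run-script hunks converge on one setup: install the standalone oneCCL workspace once, then source it per run instead of the Base Toolkit's bundled oneCCL. A sketch of that sequence, using exactly the paths and installer filename from this diff:

# One-time install: the standalone oneCCL workspace replaces the
# oneccl_bind_pt wheel (per the README hunk above)
wget https://sourceforge.net/projects/oneccl-wks/files/2024.0.0.5.1-release/oneccl_wks_installer_2024.0.0.5.1.sh
bash oneccl_wks_installer_2024.0.0.5.1.sh

# Per run: source the oneAPI Base Toolkit, then the workspace oneCCL
# (instead of $basekit_root/ccl/latest/env/vars.sh, as in the run scripts)
source /opt/intel/oneapi/setvars.sh --force
source /opt/intel/1ccl-wks/setvars.sh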