From 2d965cdbc8351dd57efa1a52493d87c680e210b0 Mon Sep 17 00:00:00 2001
From: cranechu0131 <1340390339@qq.com>
Date: Wed, 30 Oct 2024 10:24:25 +0800
Subject: [PATCH 1/7] feat: change oneccl

---
 .../example/GPU/Deepspeed-AutoTP/README.md    |  3 ++-
 .../GPU/Deepspeed-AutoTP/deepspeed_autotp.py  |  8 +++++---
 .../run_llama2_70b_pvc_1550_1_card.sh         | 19 ++++++++++++-------
 .../run_mistral_7b_instruct_flex_2_card.sh    |  3 ++-
 .../run_qwen_14b_arc_2_card.sh                |  3 ++-
 .../run_vicuna_33b_arc_2_card.sh              |  3 ++-
 6 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/README.md b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
index 515a74fee8c..7e54200e375 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/README.md
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/README.md
@@ -20,7 +20,8 @@ conda activate llm
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 pip install transformers==4.37.0
-pip install oneccl_bind_pt==2.1.100 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+wget https://sourceforge.net/projects/oneccl-wks/files/2024.0.0.5.1-release/oneccl_wks_installer_2024.0.0.5.1.sh
+bash oneccl_wks_installer_2024.0.0.5.1.sh
 # configures OneAPI environment variables
 source /opt/intel/oneapi/setvars.sh
 pip install git+https://github.com/microsoft/DeepSpeed.git@ed8aed5
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
index e2ebbaf4f68..ccf37507c09 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -77,7 +77,7 @@ def get_int_from_env(env_keys, default):
                                                  torch_dtype=torch.float16,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-
+	
     model = deepspeed.init_inference(
         model,
         mp_size=world_size,
@@ -104,7 +104,8 @@ def get_int_from_env(env_keys, default):
     deepspeed.comm.comm.cdb = None
     from deepspeed.comm.comm import init_distributed
     init_distributed()
-
+    from ipex_llm.utils import BenchmarkWrapper
+    model = BenchmarkWrapper(model)
     print(model)
 
     # Load tokenizer
@@ -135,7 +136,8 @@ def get_int_from_env(env_keys, default):
             actual_output_len = output.shape[1] - input_ids.shape[1]
             output_str = tokenizer.decode(output[0], skip_special_tokens=True)
             avg_time = (end - st) / actual_output_len * 1000
-            print(f'Inference time of generating {actual_output_len} tokens: {end-st} s, average token latency is {avg_time} ms/token.')
+            print(f'Inference time of generating {actual_output_len} tokens: {end-st} s,first token cost {model.first_cost} s, rest tokens average cost {model.rest_cost_mean} s')
+
             print('-'*20, 'Prompt', '-'*20)
             print(prompt)
             print('-'*20, 'Output', '-'*20)
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
index 37e53545545..df6f985b995 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
@@ -14,8 +14,9 @@
 # limitations under the License.
 #
 
-export ZE_AFFINITY_MASK="0,1" # specify the used GPU
-NUM_GPUS=2 # number of used GPU
+## Validated BKC for Qwen1.5-14B-Chat on 2 ARC with
+## Ubuntu 22.04.4, kernel 6.5.0-27-generic, level-zero 1.14.0, NEO(compute runtime) 24.09.28717.12
+
 export MASTER_ADDR=127.0.0.1
 export FI_PROVIDER=tcp
 export CCL_ATL_TRANSPORT=ofi
@@ -24,11 +25,15 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force
+source /opt/intel/1ccl-wks/setvars.sh
 
-export OMP_NUM_THREADS=$((56/$NUM_GPUS))
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
-export TORCH_LLM_ALLREDUCE=1
+NUM_GPUS=2 # number of used GPU
+export USE_XETLA=OFF
+if grep -q "Core" /proc/cpuinfo; then
+    export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
+fi
+export TORCH_LLM_ALLREDUCE=0 # Different from PVC
 export BIGDL_IMPORT_IPEX=0
 mpirun -np $NUM_GPUS --prepend-rank \
-  python deepspeed_autotp.py --repo-id-or-model-path 'meta-llama/Llama-2-70b-chat-hf' --low-bit 'sym_int4'
+  python deepspeed_autotp.py --repo-id-or-model-path '/mnt/disk1/models/Llama-2-13b-chat-hf/' --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
index 177ed5d0625..f0a0e2042f8 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
@@ -22,7 +22,8 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force
+source /opt/intel/1ccl-wks/setvars.sh
 
 NUM_GPUS=2 # number of used GPU
 export USE_XETLA=OFF
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
index 6686d3ee295..3dca705d110 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
@@ -25,7 +25,8 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force
+source /opt/intel/1ccl-wks/setvars.sh
 
 NUM_GPUS=2 # number of used GPU
 export USE_XETLA=OFF
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
index 7cf50a5e283..21ab80a8cd4 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
@@ -22,7 +22,8 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force
+source /opt/intel/1ccl-wks/setvars.sh
 
 NUM_GPUS=2 # number of used GPU
 export USE_XETLA=OFF

From d11d88c1f8d7f2d223c0104dc9fb1349fd586972 Mon Sep 17 00:00:00 2001
From: cranechu0131 <1340390339@qq.com>
Date: Wed, 30 Oct 2024 10:26:33 +0800
Subject: [PATCH 2/7] fix: restore llama-70b

---
 .../run_llama2_70b_pvc_1550_1_card.sh         | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
index df6f985b995..37e53545545 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
@@ -14,9 +14,8 @@
 # limitations under the License.
 #
 
-## Validated BKC for Qwen1.5-14B-Chat on 2 ARC with
-## Ubuntu 22.04.4, kernel 6.5.0-27-generic, level-zero 1.14.0, NEO(compute runtime) 24.09.28717.12
-
+export ZE_AFFINITY_MASK="0,1" # specify the used GPU
+NUM_GPUS=2 # number of used GPU
 export MASTER_ADDR=127.0.0.1
 export FI_PROVIDER=tcp
 export CCL_ATL_TRANSPORT=ofi
@@ -25,15 +24,11 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-# source $basekit_root/ccl/latest/env/vars.sh --force
-source /opt/intel/1ccl-wks/setvars.sh
+source $basekit_root/ccl/latest/env/vars.sh --force
 
-NUM_GPUS=2 # number of used GPU
-export USE_XETLA=OFF
-if grep -q "Core" /proc/cpuinfo; then
-    export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
-fi
-export TORCH_LLM_ALLREDUCE=0 # Different from PVC
+export OMP_NUM_THREADS=$((56/$NUM_GPUS))
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
+export TORCH_LLM_ALLREDUCE=1
 export BIGDL_IMPORT_IPEX=0
 mpirun -np $NUM_GPUS --prepend-rank \
-  python deepspeed_autotp.py --repo-id-or-model-path '/mnt/disk1/models/Llama-2-13b-chat-hf/' --low-bit 'sym_int4'
+  python deepspeed_autotp.py --repo-id-or-model-path 'meta-llama/Llama-2-70b-chat-hf' --low-bit 'sym_int4'

From 0b2029aed15a349fe94890a07bd033b3dfb347af Mon Sep 17 00:00:00 2001
From: cranechu0131 <1340390339@qq.com>
Date: Wed, 30 Oct 2024 10:28:29 +0800
Subject: [PATCH 3/7] fix: remove tab

---
 python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
index ccf37507c09..fa4d1b3b0e7 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -77,7 +77,7 @@ def get_int_from_env(env_keys, default):
                                                  torch_dtype=torch.float16,
                                                  trust_remote_code=True,
                                                  use_cache=True)
-	
+
     model = deepspeed.init_inference(
         model,
         mp_size=world_size,

From 46d9d51d94430a9272131786cfc3d7f168082c57 Mon Sep 17 00:00:00 2001
From: cranechu0131 <1340390339@qq.com>
Date: Wed, 30 Oct 2024 10:29:04 +0800
Subject: [PATCH 4/7] fix: remove extra blank

---
 python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
index fa4d1b3b0e7..8746cb02717 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -137,7 +137,6 @@ def get_int_from_env(env_keys, default):
             output_str = tokenizer.decode(output[0], skip_special_tokens=True)
             avg_time = (end - st) / actual_output_len * 1000
             print(f'Inference time of generating {actual_output_len} tokens: {end-st} s,first token cost {model.first_cost} s, rest tokens average cost {model.rest_cost_mean} s')
-
             print('-'*20, 'Prompt', '-'*20)
             print(prompt)
             print('-'*20, 'Output', '-'*20)

From fe2e1a9dd2ba1a0e9701e43fd8162a3f307be95a Mon Sep 17 00:00:00 2001
From: cranechu0131 <1340390339@qq.com>
Date: Wed, 30 Oct 2024 10:30:39 +0800
Subject: [PATCH 5/7] small fix

---
 .../GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
index 37e53545545..2eb2f0180d7 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
@@ -24,7 +24,8 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force
+source /opt/intel/1ccl-wks/setvars.sh
 export OMP_NUM_THREADS=$((56/$NUM_GPUS))
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=1

From 5ff4e23e12b9531885a48c561fb73485059ce455 Mon Sep 17 00:00:00 2001
From: cranechu0131 <1340390339@qq.com>
Date: Wed, 30 Oct 2024 10:32:55 +0800
Subject: [PATCH 6/7] add comments

---
 .../GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh      | 2 +-
 .../GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh | 2 +-
 .../llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh | 2 +-
 .../example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
index 2eb2f0180d7..5c3d8cacf4b 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_llama2_70b_pvc_1550_1_card.sh
@@ -24,7 +24,7 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-# source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force  # deprecated: oneccl_bind_pt is replaced by the bundled oneCCL (sourced below) for better performance
 source /opt/intel/1ccl-wks/setvars.sh
 export OMP_NUM_THREADS=$((56/$NUM_GPUS))
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
index f0a0e2042f8..833b7f348fc 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_mistral_7b_instruct_flex_2_card.sh
@@ -22,7 +22,7 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-# source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force  # deprecated: oneccl_bind_pt is replaced by the bundled oneCCL (sourced below) for better performance
 source /opt/intel/1ccl-wks/setvars.sh
 
 NUM_GPUS=2 # number of used GPU
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
index 3dca705d110..2535622a4db 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_qwen_14b_arc_2_card.sh
@@ -25,7 +25,7 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-# source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force  # deprecated: oneccl_bind_pt is replaced by the bundled oneCCL (sourced below) for better performance
 source /opt/intel/1ccl-wks/setvars.sh
 
 NUM_GPUS=2 # number of used GPU
diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh b/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
index 21ab80a8cd4..293032ab991 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/run_vicuna_33b_arc_2_card.sh
@@ -22,7 +22,7 @@ export CCL_ZE_IPC_EXCHANGE=sockets
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so:${LD_PRELOAD}
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-# source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force  # deprecated: oneccl_bind_pt is replaced by the bundled oneCCL (sourced below) for better performance
 source /opt/intel/1ccl-wks/setvars.sh
 
 NUM_GPUS=2 # number of used GPU

From 303d5c89914e5b8590adb9dc301f033cd11310f0 Mon Sep 17 00:00:00 2001
From: cranechu0131 <1340390339@qq.com>
Date: Thu, 31 Oct 2024 09:46:56 +0800
Subject: [PATCH 7/7] fix: add a blank space

---
 python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
index 8746cb02717..8df91b12905 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP/deepspeed_autotp.py
@@ -136,7 +136,7 @@ def get_int_from_env(env_keys, default):
             actual_output_len = output.shape[1] - input_ids.shape[1]
             output_str = tokenizer.decode(output[0], skip_special_tokens=True)
             avg_time = (end - st) / actual_output_len * 1000
-            print(f'Inference time of generating {actual_output_len} tokens: {end-st} s,first token cost {model.first_cost} s, rest tokens average cost {model.rest_cost_mean} s')
+            print(f'Inference time of generating {actual_output_len} tokens: {end-st} s, first token cost {model.first_cost} s, rest tokens average cost {model.rest_cost_mean} s')
             print('-'*20, 'Prompt', '-'*20)
             print(prompt)
             print('-'*20, 'Output', '-'*20)
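
Note on the BenchmarkWrapper used in PATCH 1: the patched print statement reads
model.first_cost and model.rest_cost_mean, which ipex_llm.utils.BenchmarkWrapper
records while the wrapped model generates. For readers without the ipex-llm
sources at hand, below is a minimal sketch of such a timing wrapper in Python.
It is an illustration only, not the library's actual implementation: the class
name TimingWrapper, the greedy one-token-per-step loop, and the use of
torch.xpu.synchronize() are all assumptions made for this example; only the two
field names mirror what the patch consumes.

    import time

    import torch


    class TimingWrapper:
        # Sketch of a BenchmarkWrapper-style timer: it records the latency of
        # the first generated token separately from the mean latency of the
        # remaining tokens, matching the fields the patched print statement uses.
        def __init__(self, model):
            self.model = model
            self.first_cost = None      # seconds spent on the first new token
            self.rest_cost_mean = None  # mean seconds per each later token

        def __getattr__(self, name):
            # Delegate any other attribute lookup (config, device, ...) to the
            # wrapped model so the wrapper stays a drop-in replacement.
            return getattr(self.model, name)

        @torch.no_grad()
        def generate(self, input_ids, max_new_tokens=32, **kwargs):
            # Greedy decoding, one token per forward pass, so that every step
            # can be timed individually.
            timings = []
            output = input_ids
            for _ in range(max_new_tokens):
                start = time.perf_counter()
                logits = self.model(output).logits
                next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
                if hasattr(torch, "xpu") and output.device.type == "xpu":
                    torch.xpu.synchronize()  # finish queued XPU work before reading the clock
                timings.append(time.perf_counter() - start)
                output = torch.cat([output, next_token], dim=-1)
            self.first_cost = timings[0]
            self.rest_cost_mean = sum(timings[1:]) / max(len(timings) - 1, 1)
            return output

Wrapping with model = TimingWrapper(model) and then calling
model.generate(input_ids, max_new_tokens=32) fills in the two fields that the
print statement added in PATCH 1 reports.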