diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml
index 544170195ff..e3e1993a9c0 100644
--- a/.github/workflows/llm-harness-evaluation.yml
+++ b/.github/workflows/llm-harness-evaluation.yml
@@ -164,12 +164,6 @@ jobs:
         shell: bash
         run: |
           pip install --upgrade datasets==2.14.6 
-          if [ "${{ matrix.model_name }}" = "Mistral-7B-v0.1" ]; then
-            pip install --upgrade transformers==4.36
-          else
-            pip install --upgrade transformers==4.31
-          fi
-      
 
       - name: Run harness
         shell: bash
diff --git a/.github/workflows/llm-ppl-evaluation.yml b/.github/workflows/llm-ppl-evaluation.yml
index 7ad621f91e3..7c2037ff318 100644
--- a/.github/workflows/llm-ppl-evaluation.yml
+++ b/.github/workflows/llm-ppl-evaluation.yml
@@ -144,16 +144,11 @@ jobs:
           echo "MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/" >> "$GITHUB_ENV"
           MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/
           wget -r -nH -nc --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
-          
+
       - name: Upgrade packages
         shell: bash
         run: |
-          pip install --upgrade datasets==2.14.6 
-          if [ "${{ matrix.model_name }}" = "Mistral-7B-v0.1" ]; then
-            pip install --upgrade transformers==4.36
-          else
-            pip install --upgrade transformers==4.31
-          fi
+          pip install --upgrade datasets==2.14.6
 
       - name: Run perplexity
         shell: bash
diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index 07c200ecf14..73098d4dffa 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -87,12 +87,11 @@ jobs:
           source /opt/intel/oneapi/setvars.sh
           bash python/llm/test/run-llm-install-tests.sh
 
-      - name: Test on xpu(transformers==4.31.0)
+      - name: Test on xpu(transformers==4.36.2)
         shell: bash
         run: |
           date_for_test_version=$(date -d yesterday +%Y-%m-%d)
           sed -i "s/date.today()/\"$date_for_test_version\"/g" python/llm/dev/benchmark/all-in-one/run.py
-
           source /opt/intel/oneapi/setvars.sh
           export USE_XETLA=OFF
           export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
@@ -104,20 +103,6 @@ jobs:
           sed -i 's/{today}/{today}_test1/g' run.py
           python run.py
 
-      - name: Test on xpu(transformers==4.34.0)
-        shell: bash
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          export USE_XETLA=OFF
-          export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-          # upgrade transformers for model Mistral-7B-v0.1
-          python -m pip install transformers==4.34.0
-          cp python/llm/test/benchmark/arc-perf-transformers-434.yaml python/llm/dev/benchmark/all-in-one/config.yaml
-          cd python/llm/dev/benchmark/all-in-one
-          # change csv name
-          sed -i 's/test1/test2/g' run.py
-          python run.py
-
       - name: Test on xpu(transformers==4.37.0)
         shell: bash
         run: |
@@ -129,7 +114,7 @@ jobs:
           cp python/llm/test/benchmark/arc-perf-transformers-437.yaml python/llm/dev/benchmark/all-in-one/config.yaml
           cd python/llm/dev/benchmark/all-in-one
           # change csv name
-          sed -i 's/test2/test3/g' run.py
+          sed -i 's/test1/test2/g' run.py
           python run.py
 
       - name: Concat csv and generate html
@@ -151,7 +136,7 @@ jobs:
         run: |
           cd python/llm/dev/benchmark/all-in-one
           python ../../../test/benchmark/check_results.py -c test1 -y ../../../test/benchmark/arc-perf-test.yaml
-          python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-434.yaml
+          python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-437.yaml
           find . -name "*test*.csv" -delete
           if [ ${{ github.event_name }} == "schedule" ] || [ ${{ github.event_name }} == "workflow_dispatch" ]; then
             curl -T ./*.csv ${LLM_FTP_URL}/llm/nightly_perf/gpu/
@@ -279,6 +264,7 @@ jobs:
             exit 1
           fi
 
+
       - name: Test on core ${{ matrix.platform }}
         shell: bash
         run: |
@@ -325,8 +311,8 @@ jobs:
       # - name: Prepare for install ipex-llm from source
       #   shell: bash
       #   run: |
-      #     sed -i 's/"bigdl-core-xe-21==" + VERSION + "/"bigdl-core-xe-21/g' python/llm/setup.py
-      #     sed -i 's/"bigdl-core-xe-21==" + VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
+      #     sed -i 's/"bigdl-core-xe-21==" + CORE_XE_VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
+      #     sed -i 's/"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION/"bigdl-core-xe-esimd-21"/g' python/llm/setup.py
 
       # - name: Install ipex-llm and other related packages (install from source)
       #   shell: cmd
@@ -426,33 +412,10 @@ jobs:
 
           call conda deactivate
 
-      - name: Prepare igpu perf test for Mistral (32-32)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_434.yaml
-
-      - name: Test on igpu for Mistral (32-32)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.34.0
-
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
-
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\32-32_434.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\32-32\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-
-          call conda deactivate
-
       - name: Prepare igpu perf test for Qwen1.5 (32-32)
         shell: bash
         run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_437.yaml
 
       - name: Test on igpu for Qwen1.5 (32-32)
@@ -495,14 +458,14 @@ jobs:
         shell: bash
         run: |
           sed -i 's/32-32/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128.yaml
 
       - name: Test on igpu (1024-128)
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.31.0
+          pip install transformers==4.36.2
 
           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1
@@ -517,33 +480,10 @@ jobs:
 
           call conda deactivate
 
-      - name: Prepare igpu perf test for Mistral (1024-128)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_434.yaml
-
-      - name: Test on igpu for Mistral (1024-128)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.34.0
-
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
-
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\1024-128_434.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\1024-128\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-
-          call conda deactivate
-
       - name: Prepare igpu perf test for Qwen 1.5 (1024-128)
         shell: bash
         run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_437.yaml
 
       - name: Test on igpu for Qwen 1.5 (1024-128)
@@ -585,14 +525,14 @@ jobs:
         shell: bash
         run: |
           sed -i 's/1024-128/2048-256/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256.yaml
 
       - name: Test on igpu (2048-256)
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.31.0
+          pip install transformers==4.36.2
 
           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1
@@ -607,33 +547,10 @@ jobs:
 
           call conda deactivate
 
-      - name: Prepare igpu perf test for Mistral (2048-256)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_434.yaml
-
-      - name: Test on igpu for Mistral (2048-256)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.34.0
-
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
-
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\2048-256_434.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\2048-256\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-
-          call conda deactivate
-
       - name: Prepare igpu perf test for Qwen 1.5 (2048-256)
         shell: bash
         run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_437.yaml
 
       - name: Test on igpu for Qwen 1.5 (2048-256)
@@ -675,14 +592,14 @@ jobs:
         shell: bash
         run: |
           sed -i 's/2048-256/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
 
       - name: Test on igpu (load_low_bit 1024-128)
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.31.0
+          pip install transformers==4.36.2
 
           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1
@@ -697,33 +614,10 @@ jobs:
 
           call conda deactivate
 
-      - name: Prepare igpu perf test for Mistral (load_low_bit 1024-128)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml
-
-      - name: Test on igpu for Mistral (load_low_bit 1024-128)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.34.0
-
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
-
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\1024-128_loadlowbit_434.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\1024-128_loadlowbit\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-
-          call conda deactivate
-
       - name: Prepare igpu perf test for Qwen 1.5 (load_low_bit 1024-128)
         shell: bash
         run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_437.yaml
 
       - name: Test on igpu for Qwen 1.5 (load_low_bit 1024-128)
@@ -763,14 +657,14 @@ jobs:
       - name: Prepare igpu perf test (int4+fp16 1024-128)
         shell: bash
         run: |
-          sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
 
       - name: Test on igpu (int4+fp16 1024-128)
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.31.0
+          pip install transformers==4.36.2
 
           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1
@@ -785,33 +679,10 @@ jobs:
 
           call conda deactivate
 
-      - name: Prepare igpu perf test for Mistral (int4+fp16 1024-128)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml
-
-      - name: Test on igpu for Mistral (int4+fp16 1024-128)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.34.0
-
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
-
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_434.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-
-          call conda deactivate
-
       - name: Prepare igpu perf test for Qwen 1.5 (int4+fp16 1024-128)
         shell: bash
         run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_437.yaml
 
       - name: Test on igpu for Qwen 1.5 (int4+fp16 1024-128)
diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index 6d3dd610006..d565be1347b 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -99,7 +99,7 @@ jobs:
           echo "LLAMA_ORIGIN_PATH=${ORIGIN_DIR}/llama-7b-hf" >> "$GITHUB_ENV"
           echo "BLOOM_ORIGIN_PATH=${ORIGIN_DIR}/bloom-7b1" >> "$GITHUB_ENV"
           echo "ORIGINAL_CHATGLM2_6B_PATH=${ORIGIN_DIR}/chatglm2-6b" >> "$GITHUB_ENV"
-          echo "ORIGINAL_REPLIT_CODE_PATH=${ORIGIN_DIR}/replit-code-v1-3b" >> "$GITHUB_ENV"
+          echo "ORIGINAL_CODESHELL_7B_PATH=${ORIGIN_DIR}/CodeShell-7B-Chat" >> "$GITHUB_ENV"
           echo "ORIGINAL_WHISPER_TINY_PATH=${ORIGIN_DIR}/whisper-tiny" >> "$GITHUB_ENV"
           echo "MISTRAL_ORIGIN_PATH=${ORIGIN_DIR}/Mistral-7B-v0.1" >> "$GITHUB_ENV"
           echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
@@ -157,13 +157,13 @@ jobs:
           # fi
           if [ ! -d $ORIGINAL_CHATGLM2_6B_PATH ]; then
             echo "Directory $ORIGINAL_CHATGLM2_6B_PATH not found. Downloading from FTP server..."
-            echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR"            
+            echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR"
             wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR
           fi
-          if [ ! -d $ORIGINAL_REPLIT_CODE_PATH ]; then
-            echo "Directory $ORIGINAL_REPLIT_CODE_PATH not found. Downloading from FTP server..."
-            echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/replit-code-v1-3b -P $ORIGIN_DIR"
-            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/replit-code-v1-3b -P $ORIGIN_DIR
+          if [ ! -d $ORIGINAL_CODESHELL_7B_PATH ]; then
+            echo "Directory $ORIGINAL_CODESHELL_7B_PATH not found. Downloading from FTP server..."
+            echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/CodeShell-7B-Chat -P $ORIGIN_DIR"
+            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/CodeShell-7B-Chat -P $ORIGIN_DIR
           fi
           if [ ! -d $ORIGINAL_WHISPER_TINY_PATH ]; then
             echo "Directory $ORIGINAL_WHISPER_TINY_PATH not found. Downloading from FTP server..."
@@ -226,7 +226,7 @@ jobs:
         shell: bash
         run: |
           pip install llama-index-readers-file llama-index-vector-stores-postgres llama-index-embeddings-huggingface
-          pip install transformers==4.36.0
+          pip install transformers==4.36.2
           pip install "pydantic>=2.0.0"
           bash python/llm/test/run-llm-llamaindex-tests.sh
       - name: Run sentence-transformers uninstallation
@@ -234,6 +234,7 @@ jobs:
         shell: bash
         run: |
           pip uninstall sentence-transformers -y || true
+
   llm-unit-test-on-arc:
     needs: [setup-python-version, llm-cpp-build]
     strategy:
@@ -364,8 +365,6 @@ jobs:
           fi
           python -m pip install datasets librosa soundfile einops tiktoken transformers_stream_generator
           bash python/llm/test/run-llm-inference-tests-gpu.sh
-          python -m pip install transformers==4.34.0 
-          bash python/llm/test/run-llm-inference-tests-gpu-434.sh
 
       - name: Run LLM example tests
         shell: bash
@@ -428,7 +427,7 @@ jobs:
             pip install --pre --upgrade ipex-llm[xpu_2.0] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
             source /home/arda/intel/oneapi/setvars.sh
           fi
-          pip install transformers==4.36.0
+          pip install transformers==4.36.2
           pip install "pydantic>=2.0.0"
           bash python/llm/test/run-llm-llamaindex-tests-gpu.sh
       - name: Run sentence-transformers uninstallation
diff --git a/python/llm/setup.py b/python/llm/setup.py
index e2a180c7f95..6cf022747f3 100644
--- a/python/llm/setup.py
+++ b/python/llm/setup.py
@@ -53,7 +53,7 @@
 
 cpu_torch_version = ["torch==2.1.2+cpu;platform_system=='Linux'", "torch==2.1.2;platform_system=='Windows'"]
 CONVERT_DEP = ['numpy == 1.26.4', # lastet 2.0.0b1 will cause error
-               'transformers == 4.31.0', 'sentencepiece', 'tokenizers == 0.13.3',
+               'transformers == 4.36.2', 'sentencepiece', 'tokenizers == 0.15.2',
                # TODO: Support accelerate 0.22.0
                'accelerate == 0.21.0', 'tabulate'] + cpu_torch_version
 
@@ -279,10 +279,9 @@ def setup_package():
 
     # Add internal requires for llama-index
     llama_index_requires = copy.deepcopy(all_requires)
-    for exclude_require in ['transformers == 4.31.0', 'tokenizers == 0.13.3'] + cpu_torch_version:
+    for exclude_require in cpu_torch_version:
         llama_index_requires.remove(exclude_require)
     llama_index_requires += ["torch<2.2.0",
-                             "transformers>=4.34.0,<4.39.0",
                              "sentence-transformers~=2.6.1"]
 
 
diff --git a/python/llm/src/ipex_llm/optimize.py b/python/llm/src/ipex_llm/optimize.py
index d69895ec2c2..86db591ca9b 100644
--- a/python/llm/src/ipex_llm/optimize.py
+++ b/python/llm/src/ipex_llm/optimize.py
@@ -47,7 +47,8 @@ def _save_low_bit(self, save_dir, *args, **kwargs):
     if isinstance(self, PreTrainedModel):
         # We borrowed this method to adapt to Transformer model cases
         # as much as possible, and later we may merge these two situations
-        self.save_pretrained(save_dir)
+        kwargs['safe_serialization'] = False
+        self.save_pretrained(save_dir, *args, **kwargs)
     else:
         # TODO: For the lowbit model still larger than 8GB,
         #       save it into shards.
diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml
index 47f74b20e7e..895588ce4e4 100644
--- a/python/llm/test/benchmark/arc-perf-test.yaml
+++ b/python/llm/test/benchmark/arc-perf-test.yaml
@@ -10,13 +10,14 @@ repo_id:
   - 'databricks/dolly-v1-6b'
   - 'databricks/dolly-v2-7b'
   - 'databricks/dolly-v2-12b'
-  - 'internlm/internlm-chat-7b-8k'
+  - 'internlm/internlm-chat-7b'
   - 'Qwen/Qwen-7B-Chat'
   - 'BAAI/AquilaChat-7B'
   - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'baichuan-inc/Baichuan2-13B-Chat-4bit'
   - 'bigscience/bloomz-7b1'
-  - 'fnlp/moss-moon-003-sft-4bit'
+  # - 'fnlp/moss-moon-003-sft-4bit'  # disabled: moss-moon-003-sft does not work with transformers 4.34+
+  - 'mistralai/Mistral-7B-v0.1'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
 num_trials: 3
@@ -31,7 +32,7 @@ test_api:
   - "transformer_int4_gpu"  # on Intel GPU
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
 exclude:
-  - 'fnlp/moss-moon-003-sft-4bit:1024'
-  - 'fnlp/moss-moon-003-sft-4bit:2048'
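+  # - 'fnlp/moss-moon-003-sft-4bit:1024'  # not needed while moss-moon-003-sft-4bit is disabled in repo_id
+  # - 'fnlp/moss-moon-003-sft-4bit:2048'  # not needed while moss-moon-003-sft-4bit is disabled in repo_id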
   - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048'
   - 'bigscience/bloomz-7b1:2048'
diff --git a/python/llm/test/benchmark/arc-perf-transformers-434.yaml b/python/llm/test/benchmark/arc-perf-transformers-434.yaml
deleted file mode 100644
index 1389e44ab5a..00000000000
--- a/python/llm/test/benchmark/arc-perf-transformers-434.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-# For the models that require transformers 4.34.0
-repo_id:
-  - 'mistralai/Mistral-7B-v0.1'
-local_model_hub: '/mnt/disk1/models'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '32-32'
-  - '1024-128'
-  - '2048-256'
-test_api:
-  - "transformer_int4_gpu"  # on Intel GPU
-cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128.yaml b/python/llm/test/benchmark/igpu-perf/1024-128.yaml
index df27dde503d..5584aba3413 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128.yaml
@@ -12,10 +12,11 @@ repo_id:
   - 'WisdomShell/CodeShell-7B-Chat'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
-  - 'liuhaotian/llava-v1.5-7b'
+  # - 'liuhaotian/llava-v1.5-7b'  # disabled: cannot be loaded via AutoModelForCausalLM with transformers 4.36+
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_434.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_434.yaml
deleted file mode 100644
index b4b1e9b7a4f..00000000000
--- a/python/llm/test/benchmark/igpu-perf/1024-128_434.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-repo_id:
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '1024-128'
-test_api:
-  - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
index 7425cd45306..a073c5cb77c 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
@@ -12,10 +12,11 @@ repo_id:
   - 'WisdomShell/CodeShell-7B-Chat'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
-  - 'liuhaotian/llava-v1.5-7b'
+  # - 'liuhaotian/llava-v1.5-7b'  # disabled: cannot be loaded via AutoModelForCausalLM with transformers 4.36+
   # - 'RWKV/rwkv-4-world-7b'
   # - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml
deleted file mode 100644
index 57f0a3d3c8e..00000000000
--- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-repo_id:
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '1024-128'
-test_api:
-  - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
index 1afe8567600..fee01274064 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
@@ -12,10 +12,11 @@ repo_id:
   - 'WisdomShell/CodeShell-7B-Chat'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
-  - 'liuhaotian/llava-v1.5-7b'
+  # - 'liuhaotian/llava-v1.5-7b'  # disabled: cannot be loaded via AutoModelForCausalLM with transformers 4.36+
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml
deleted file mode 100644
index 51453bd1b6a..00000000000
--- a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-repo_id:
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '1024-128'
-test_api:
-  - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
diff --git a/python/llm/test/benchmark/igpu-perf/2048-256.yaml b/python/llm/test/benchmark/igpu-perf/2048-256.yaml
index 0fabc75e580..7e64f188964 100644
--- a/python/llm/test/benchmark/igpu-perf/2048-256.yaml
+++ b/python/llm/test/benchmark/igpu-perf/2048-256.yaml
@@ -12,10 +12,11 @@ repo_id:
   - 'WisdomShell/CodeShell-7B-Chat'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
-  - 'liuhaotian/llava-v1.5-7b'
+  # - 'liuhaotian/llava-v1.5-7b'  # disabled: cannot be loaded via AutoModelForCausalLM with transformers 4.36+
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/2048-256_434.yaml b/python/llm/test/benchmark/igpu-perf/2048-256_434.yaml
deleted file mode 100644
index b16e5493017..00000000000
--- a/python/llm/test/benchmark/igpu-perf/2048-256_434.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-repo_id:
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '2048-256'
-test_api:
-  - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
diff --git a/python/llm/test/benchmark/igpu-perf/32-32.yaml b/python/llm/test/benchmark/igpu-perf/32-32.yaml
index 681c8a69ce5..20f6cb7b571 100644
--- a/python/llm/test/benchmark/igpu-perf/32-32.yaml
+++ b/python/llm/test/benchmark/igpu-perf/32-32.yaml
@@ -12,10 +12,11 @@ repo_id:
   - 'WisdomShell/CodeShell-7B-Chat'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
-  - 'liuhaotian/llava-v1.5-7b'
+  # - 'liuhaotian/llava-v1.5-7b'  # disabled: cannot be loaded via AutoModelForCausalLM with transformers 4.36+
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 3
 num_trials: 5
diff --git a/python/llm/test/benchmark/igpu-perf/32-32_434.yaml b/python/llm/test/benchmark/igpu-perf/32-32_434.yaml
deleted file mode 100644
index 6b5c4229b54..00000000000
--- a/python/llm/test/benchmark/igpu-perf/32-32_434.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-repo_id:
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 3
-num_trials: 5
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '32-32'
-test_api:
-  - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
diff --git a/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml b/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml
index aa9158bdd13..92b12750dbb 100644
--- a/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml
+++ b/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml
@@ -6,7 +6,7 @@ repo_id:
   - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'baichuan-inc/Baichuan2-13B-Chat'
   - 'Qwen/Qwen-14B-Chat'
-local_model_hub: '/models'
+local_model_hub: '/mnt/disk1/models'
 warm_up: 1
 num_trials: 3
 num_beams: 1 # default to greedy search
diff --git a/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml b/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml
index 38aeb375910..f8c75489659 100644
--- a/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml
+++ b/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml
@@ -6,7 +6,7 @@ repo_id:
   - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'baichuan-inc/Baichuan2-13B-Chat'
   - 'Qwen/Qwen-14B-Chat'
-local_model_hub: '/models'
+local_model_hub: '/mnt/disk1/models'
 warm_up: 3
 num_trials: 50
 num_beams: 1 # default to greedy search
diff --git a/python/llm/test/inference/test_transformers_api.py b/python/llm/test/inference/test_transformers_api.py
index 1a72801cc1a..f16773c62c3 100644
--- a/python/llm/test/inference/test_transformers_api.py
+++ b/python/llm/test/inference/test_transformers_api.py
@@ -49,16 +49,16 @@ def test_transformers_auto_model_int4(self):
         print('Prompt:', input_str)
         print('Output:', output_str)
         print(f'Inference time: {end-st} s')
-        res = 'Paris' in output_str        
+        res = 'Paris' in output_str
         self.assertTrue(res)
 
     def test_transformers_auto_model_for_causal_lm_int4(self):
-        model_path = os.environ.get('ORIGINAL_REPLIT_CODE_PATH')
+        model_path = os.environ.get('ORIGINAL_CODESHELL_7B_PATH')
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         input_str = 'def hello():\n  print("hello world")\n'
         model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True)
         with torch.inference_mode():
-            
+
             st = time.time()
             input_ids = tokenizer.encode(input_str, return_tensors="pt")
             output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
@@ -67,7 +67,7 @@ def test_transformers_auto_model_for_causal_lm_int4(self):
         print('Prompt:', input_str)
         print('Output:', output_str)
         print(f'Inference time: {end-st} s')
-        res = '\nhello()' in output_str        
+        res = '\nhello()' in output_str
         self.assertTrue(res)
         
 
@@ -86,7 +86,7 @@ def test_transformers_auto_model_for_speech_seq2seq_int4(self):
             predicted_ids = model.generate(input_features)
             # decode token ids to text
             transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
-            end = time.time()        
+            end = time.time()
         print('Output:', transcription)
         print(f'Inference time: {end-st} s')
         res = 'Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.' in transcription[0]
@@ -108,7 +108,7 @@ def test_transformers_chatglm_for_causallm(self):
         print('Prompt:', input_str)
         print('Output:', output_str)
         print(f'Inference time: {end-st} s')
-        res = 'Paris' in output_str        
+        res = 'Paris' in output_str
         self.assertTrue(res)
 
 @pytest.mark.parametrize('prompt, answer', [
@@ -116,6 +116,7 @@ def test_transformers_chatglm_for_causallm(self):
     ])
 @pytest.mark.parametrize('Model, Tokenizer, model_path',[
     (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH')),
+    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH')),
     ])
 def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
     tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -123,7 +124,7 @@ def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
                                   load_in_4bit=True,
                                   optimize_model=True,
                                   trust_remote_code=True)
-    
+
     with tempfile.TemporaryDirectory() as tempdir:
         model.save_low_bit(tempdir)
         loaded_model = Model.load_low_bit(tempdir,
@@ -143,9 +144,10 @@ def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
     (AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA_ORIGIN_PATH'), prompt),
     (AutoModelForCausalLM, AutoTokenizer, os.environ.get('BLOOM_ORIGIN_PATH'), prompt),
     (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH'), prompt),
-    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_REPLIT_CODE_PATH'), prompt)
+    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_CODESHELL_7B_PATH'), prompt),
+    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH'), prompt)
 ])
-    
+
 def test_optimize_model(Model, Tokenizer, model_path, prompt):
     tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
     input_ids = tokenizer.encode(prompt, return_tensors="pt")
diff --git a/python/llm/test/inference/test_transformesr_api_434.py b/python/llm/test/inference/test_transformesr_api_434.py
deleted file mode 100644
index 4de49e660ae..00000000000
--- a/python/llm/test/inference/test_transformesr_api_434.py
+++ /dev/null
@@ -1,80 +0,0 @@
-#
-# Copyright 2016 The BigDL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import os
-import pytest
-import tempfile
-import torch
-
-from ipex_llm.transformers import AutoModelForCausalLM
-from transformers import AutoTokenizer
-
-
-mistral_model_path = os.environ.get('MISTRAL_ORIGIN_PATH')
-
-prompt = "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun"
-
-@pytest.mark.parametrize("Model, Tokenizer, model_path, prompt", [
-    (AutoModelForCausalLM, AutoTokenizer, mistral_model_path, prompt)
-])
-    
-def test_optimize_model(Model, Tokenizer, model_path, prompt):
-    tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
-    input_ids = tokenizer.encode(prompt, return_tensors="pt")
-
-    model = Model.from_pretrained(model_path,
-                                load_in_4bit=True,
-                                optimize_model=False,
-                                trust_remote_code=True)
-    logits_base_model = (model(input_ids)).logits
-
-    model = Model.from_pretrained(model_path,
-                                load_in_4bit=True,
-                                optimize_model=True,
-                                trust_remote_code=True)
-    logits_optimized_model = (model(input_ids)).logits
-    diff = abs(logits_base_model - logits_optimized_model).flatten()
-
-    assert any(diff) is False
-
-@pytest.mark.parametrize('prompt, answer', [
-    ('What is the capital of France?\n\n', 'Paris')
-    ])
-@pytest.mark.parametrize('Model, Tokenizer, model_path',[
-    (AutoModelForCausalLM, AutoTokenizer, mistral_model_path),
-    ])
-def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
-    tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
-    model = Model.from_pretrained(model_path,
-                                  load_in_4bit=True,
-                                  optimize_model=True,
-                                  trust_remote_code=True)
-    
-    with tempfile.TemporaryDirectory() as tempdir:
-        model.save_low_bit(tempdir)
-        loaded_model = Model.load_low_bit(tempdir,
-                                          optimize_model=True,
-                                          trust_remote_code=True)
-
-        with torch.inference_mode():
-            input_ids = tokenizer.encode(prompt, return_tensors="pt")
-            output = loaded_model.generate(input_ids, max_new_tokens=32)
-            output_str = tokenizer.decode(output[0], skip_special_tokens=True)
-
-            assert answer in output_str
-
-if __name__ == '__main__':
-    pytest.main([__file__])
diff --git a/python/llm/test/inference_gpu/test_transformers_api_attention.py b/python/llm/test/inference_gpu/test_transformers_api_attention.py
index b03ddaf9d2e..149d81a34c8 100644
--- a/python/llm/test/inference_gpu/test_transformers_api_attention.py
+++ b/python/llm/test/inference_gpu/test_transformers_api_attention.py
@@ -104,8 +104,8 @@ def replace_forward_hook(module, input, output, layer_name):
                     if isinstance(t1, torch.Tensor) and isinstance(t2, torch.Tensor):
                         # 'attn_output' is of type torch.Tensor.
                         attn_output_diff.append(t1 - t2)
-                    else:
-                        # 'past_key_value'is of type tuple as default.
+                    elif isinstance(t1, tuple) and isinstance(t2, tuple):
+                        # 'past_key_value' is a tuple: compare it element by element
                         for i, (t3, t4) in enumerate(zip(t1, t2)):
                             if model.config.architectures[0] == "ChatGLMModel" and \
                                     hasattr(model.config, 'padded_vocab_size') and \
@@ -114,6 +114,10 @@ def replace_forward_hook(module, input, output, layer_name):
                                 # We need to narrow it here.
                                 t4 = t4[:, :, 15:17, :]
                             attn_output_diff.append(t3 - t4)
+                    else:
+                        # 'past_key_value' is a Cache object: compare the (key, value) pair of its last layer
+                        attn_output_diff.append(t1[-1][0] - t2[-1][0])
+                        attn_output_diff.append(t1[-1][1] - t2[-1][1])
 
             max_diff_tensor = [torch.max(item).item() for item in attn_output_diff]
             print(max_diff_tensor)
diff --git a/python/llm/test/inference_gpu/test_transformers_api_mlp.py b/python/llm/test/inference_gpu/test_transformers_api_mlp.py
index e3273ad574e..70ba2e7b9f6 100644
--- a/python/llm/test/inference_gpu/test_transformers_api_mlp.py
+++ b/python/llm/test/inference_gpu/test_transformers_api_mlp.py
@@ -96,9 +96,14 @@ def replace_forward_hook(module, input, output, layer_name):
             for i, (t1, t2) in enumerate(zip(layer_tensor, opt_layer_tensor)):
                 if isinstance(t1, torch.Tensor) and isinstance(t2, torch.Tensor):
                     MLP_output_diff.append(t1 - t2)
-                else:
+                elif isinstance(t1, tuple) and isinstance(t2, tuple):
+                    # 'past_key_value' is a tuple: compare it element by element
                     for i, (t3, t4) in enumerate(zip(t1, t2)):
                         MLP_output_diff.append(t3 - t4)
+                else:
+                    # 'past_key_value' is a Cache object: compare the (key, value) pair of its last layer
+                    MLP_output_diff.append(t1[-1][0] - t2[-1][0])
+                    MLP_output_diff.append(t1[-1][1] - t2[-1][1])
 
             max_diff_tensor = [torch.max(item).item() for item in MLP_output_diff]
             print(max_diff_tensor)
diff --git a/python/llm/test/langchain/test_transformers_api.py b/python/llm/test/langchain/test_transformers_api.py
index cbaaa1e0ba7..ad139c74dc6 100644
--- a/python/llm/test/langchain/test_transformers_api.py
+++ b/python/llm/test/langchain/test_transformers_api.py
@@ -38,7 +38,7 @@
 class Test_Langchain_Transformers_API(TestCase):
     def setUp(self):
         self.auto_model_path = os.environ.get('ORIGINAL_CHATGLM2_6B_PATH')
-        self.auto_causal_model_path = os.environ.get('ORIGINAL_REPLIT_CODE_PATH')
+        self.auto_causal_model_path = os.environ.get('ORIGINAL_CODESHELL_7B_PATH')
         self.llama_model_path = os.environ.get('LLAMA_ORIGIN_PATH')
         self.bloom_model_path = os.environ.get('BLOOM_ORIGIN_PATH')
         thread_num = os.environ.get('THREAD_NUM')
@@ -79,12 +79,12 @@ def test_transformers_llama_embeddings(self):
 
     def test_qa_chain(self):
         texts = '''
-            AI is a machine’s ability to perform the cognitive functions 
-            we associate with human minds, such as perceiving, reasoning, 
+            AI is a machine’s ability to perform the cognitive functions
+            we associate with human minds, such as perceiving, reasoning,
             learning, interacting with an environment, problem solving,
-            and even exercising creativity. You’ve probably interacted 
-            with AI even if you didn’t realize it—voice assistants like Siri 
-            and Alexa are founded on AI technology, as are some customer 
+            and even exercising creativity. You’ve probably interacted
+            with AI even if you didn’t realize it—voice assistants like Siri
+            and Alexa are founded on AI technology, as are some customer
             service chatbots that pop up to help you navigate websites.
             '''
         text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
@@ -102,16 +102,16 @@ def test_qa_chain(self):
         res = "AI" in output
         self.assertTrue(res)
 
-    
+
     """
     def test_qa_chain_causalLM(self):
         texts = '''
-            AI is a machine’s ability to perform the cognitive functions 
-            we associate with human minds, such as perceiving, reasoning, 
+            AI is a machine’s ability to perform the cognitive functions
+            we associate with human minds, such as perceiving, reasoning,
             learning, interacting with an environment, problem solving,
-            and even exercising creativity. You’ve probably interacted 
-            with AI even if you didn’t realize it—voice assistants like Siri 
-            and Alexa are founded on AI technology, as are some customer 
+            and even exercising creativity. You’ve probably interacted
+            with AI even if you didn’t realize it—voice assistants like Siri
+            and Alexa are founded on AI technology, as are some customer
             service chatbots that pop up to help you navigate websites.
             '''
         text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
@@ -129,7 +129,7 @@ def test_qa_chain_causalLM(self):
         res = "AI" in output
         self.assertTrue(res)
     """
-    
+
     def test_embed_kwargs(self):
         embeddings = TransformersEmbeddings.from_model_id(model_id=self.llama_model_path)
         encode_kwargs =  {"truncation": True, "max_length": 512}
diff --git a/python/llm/test/run-llm-inference-tests-gpu-434.sh b/python/llm/test/run-llm-inference-tests-gpu-434.sh
deleted file mode 100644
index 91a1676ddf8..00000000000
--- a/python/llm/test/run-llm-inference-tests-gpu-434.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT}
-export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src
-export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/inference_gpu
-
-export USE_XETLA=OFF
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-export DEVICE='xpu'
-
-set -e
-
-echo "# Start testing inference"
-start=$(date "+%s")
-
-# if [ -z "$THREAD_NUM" ]; then
-#   THREAD_NUM=2
-# fi
-# export OMP_NUM_THREADS=$THREAD_NUM
-export BIGDL_LLM_XMX_DISABLED=1
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_attention.py -v -s -k "Mistral"
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_mlp.py -v -s -k "Mistral"
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_RMSNorm.py -v -s -k "Mistral"
-unset BIGDL_LLM_XMX_DISABLED
-
-now=$(date "+%s")
-time=$((now-start))
-
-echo "Bigdl-llm gpu inference tests for transformers 4.34.0 finished"
-echo "Time used:$time seconds"
diff --git a/python/llm/test/run-llm-inference-tests-gpu.sh b/python/llm/test/run-llm-inference-tests-gpu.sh
index ea1abb519f4..5e48c0df876 100644
--- a/python/llm/test/run-llm-inference-tests-gpu.sh
+++ b/python/llm/test/run-llm-inference-tests-gpu.sh
@@ -21,9 +21,9 @@ pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api.py -v -s
 pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_layernorm.py -v -s
 export BIGDL_LLM_XMX_DISABLED=1
 pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_final_logits.py -v -s
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_attention.py -v -s -k "not Mistral"
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_mlp.py -v -s -k "not Mistral"
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_RMSNorm.py -v -s -k "not Mistral"
+pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_attention.py -v -s
+pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_mlp.py -v -s
+pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_RMSNorm.py -v -s
 unset BIGDL_LLM_XMX_DISABLED
 
 now=$(date "+%s")
diff --git a/python/llm/test/run-llm-inference-tests.sh b/python/llm/test/run-llm-inference-tests.sh
index e53528dbb56..d3c3c0690ef 100644
--- a/python/llm/test/run-llm-inference-tests.sh
+++ b/python/llm/test/run-llm-inference-tests.sh
@@ -18,10 +18,6 @@ export OMP_NUM_THREADS=$THREAD_NUM
 python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_transformers_api.py -v
 python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_optimize_model_api.py -v
 
-python -m pip install transformers==4.34.0
-python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_transformesr_api_434.py -v
-python -m pip install transformers==4.31.0
-
 now=$(date "+%s")
 time=$((now-start))