diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index 544170195ff..e3e1993a9c0 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -164,12 +164,6 @@ jobs: shell: bash run: | pip install --upgrade datasets==2.14.6 - if [ "${{ matrix.model_name }}" = "Mistral-7B-v0.1" ]; then - pip install --upgrade transformers==4.36 - else - pip install --upgrade transformers==4.31 - fi - - name: Run harness shell: bash diff --git a/.github/workflows/llm-ppl-evaluation.yml b/.github/workflows/llm-ppl-evaluation.yml index 7ad621f91e3..7c2037ff318 100644 --- a/.github/workflows/llm-ppl-evaluation.yml +++ b/.github/workflows/llm-ppl-evaluation.yml @@ -144,16 +144,11 @@ jobs: echo "MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/" >> "$GITHUB_ENV" MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/ wget -r -nH -nc --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR} - + - name: Upgrade packages shell: bash run: | - pip install --upgrade datasets==2.14.6 - if [ "${{ matrix.model_name }}" = "Mistral-7B-v0.1" ]; then - pip install --upgrade transformers==4.36 - else - pip install --upgrade transformers==4.31 - fi + pip install --upgrade datasets==2.14.6 - name: Run perplexity shell: bash diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml index 07c200ecf14..73098d4dffa 100644 --- a/.github/workflows/llm_performance_tests.yml +++ b/.github/workflows/llm_performance_tests.yml @@ -87,12 +87,11 @@ jobs: source /opt/intel/oneapi/setvars.sh bash python/llm/test/run-llm-install-tests.sh - - name: Test on xpu(transformers==4.31.0) + - name: Test on xpu(transformers==4.36.2) shell: bash run: | date_for_test_version=$(date -d yesterday +%Y-%m-%d) sed -i "s/date.today()/\"$date_for_test_version\"/g" python/llm/dev/benchmark/all-in-one/run.py - source /opt/intel/oneapi/setvars.sh export USE_XETLA=OFF export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 @@ -104,20 +103,6 @@ jobs: sed -i 's/{today}/{today}_test1/g' run.py python run.py - - name: Test on xpu(transformers==4.34.0) - shell: bash - run: | - source /opt/intel/oneapi/setvars.sh - export USE_XETLA=OFF - export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 - # upgrade transformers for model Mistral-7B-v0.1 - python -m pip install transformers==4.34.0 - cp python/llm/test/benchmark/arc-perf-transformers-434.yaml python/llm/dev/benchmark/all-in-one/config.yaml - cd python/llm/dev/benchmark/all-in-one - # change csv name - sed -i 's/test1/test2/g' run.py - python run.py - - name: Test on xpu(transformers==4.37.0) shell: bash run: | @@ -129,7 +114,7 @@ jobs: cp python/llm/test/benchmark/arc-perf-transformers-437.yaml python/llm/dev/benchmark/all-in-one/config.yaml cd python/llm/dev/benchmark/all-in-one # change csv name - sed -i 's/test2/test3/g' run.py + sed -i 's/test1/test2/g' run.py python run.py - name: Concat csv and generate html @@ -151,7 +136,7 @@ jobs: run: | cd python/llm/dev/benchmark/all-in-one python ../../../test/benchmark/check_results.py -c test1 -y ../../../test/benchmark/arc-perf-test.yaml - python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-434.yaml + python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-437.yaml find . -name "*test*.csv" -delete if [ ${{ github.event_name }} == "schedule" ] || [ ${{ github.event_name }} == "workflow_dispatch" ]; then curl -T ./*.csv ${LLM_FTP_URL}/llm/nightly_perf/gpu/ @@ -279,6 +264,7 @@ jobs: exit 1 fi + - name: Test on core ${{ matrix.platform }} shell: bash run: | @@ -325,8 +311,8 @@ jobs: # - name: Prepare for install ipex-llm from source # shell: bash # run: | - # sed -i 's/"bigdl-core-xe-21==" + VERSION + "/"bigdl-core-xe-21/g' python/llm/setup.py - # sed -i 's/"bigdl-core-xe-21==" + VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py + # sed -i 's/"bigdl-core-xe-21==" + CORE_XE_VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py + # sed -i 's/"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION/"bigdl-core-xe-esimd-21"/g' python/llm/setup.py # - name: Install ipex-llm and other related packages (install from source) # shell: cmd @@ -426,33 +412,10 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for Mistral (32-32) - shell: bash - run: | - sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_434.yaml - - - name: Test on igpu for Mistral (32-32) - shell: cmd - run: | - call conda activate igpu-perf - pip install transformers==4.34.0 - - set SYCL_CACHE_PERSISTENT=1 - set BIGDL_LLM_XMX_DISABLED=1 - - cd python\llm\dev\benchmark\all-in-one - move ..\..\..\test\benchmark\igpu-perf\32-32_434.yaml config.yaml - set PYTHONIOENCODING=utf-8 - python run.py >> %CSV_SAVE_PATH%\32-32\log\%LOG_FILE% 2>&1 - if %ERRORLEVEL% neq 0 (exit /b 1) - - call conda deactivate - - name: Prepare igpu perf test for Qwen1.5 (32-32) shell: bash run: | - sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_437.yaml - name: Test on igpu for Qwen1.5 (32-32) @@ -495,14 +458,14 @@ jobs: shell: bash run: | sed -i 's/32-32/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128.yaml - name: Test on igpu (1024-128) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.31.0 + pip install transformers==4.36.2 set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 @@ -517,33 +480,10 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for Mistral (1024-128) - shell: bash - run: | - sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_434.yaml - - - name: Test on igpu for Mistral (1024-128) - shell: cmd - run: | - call conda activate igpu-perf - pip install transformers==4.34.0 - - set SYCL_CACHE_PERSISTENT=1 - set BIGDL_LLM_XMX_DISABLED=1 - - cd python\llm\dev\benchmark\all-in-one - move ..\..\..\test\benchmark\igpu-perf\1024-128_434.yaml config.yaml - set PYTHONIOENCODING=utf-8 - python run.py >> %CSV_SAVE_PATH%\1024-128\log\%LOG_FILE% 2>&1 - if %ERRORLEVEL% neq 0 (exit /b 1) - - call conda deactivate - - name: Prepare igpu perf test for Qwen 1.5 (1024-128) shell: bash run: | - sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_437.yaml - name: Test on igpu for Qwen 1.5 (1024-128) @@ -585,14 +525,14 @@ jobs: shell: bash run: | sed -i 's/1024-128/2048-256/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256.yaml - name: Test on igpu (2048-256) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.31.0 + pip install transformers==4.36.2 set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 @@ -607,33 +547,10 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for Mistral (2048-256) - shell: bash - run: | - sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_434.yaml - - - name: Test on igpu for Mistral (2048-256) - shell: cmd - run: | - call conda activate igpu-perf - pip install transformers==4.34.0 - - set SYCL_CACHE_PERSISTENT=1 - set BIGDL_LLM_XMX_DISABLED=1 - - cd python\llm\dev\benchmark\all-in-one - move ..\..\..\test\benchmark\igpu-perf\2048-256_434.yaml config.yaml - set PYTHONIOENCODING=utf-8 - python run.py >> %CSV_SAVE_PATH%\2048-256\log\%LOG_FILE% 2>&1 - if %ERRORLEVEL% neq 0 (exit /b 1) - - call conda deactivate - - name: Prepare igpu perf test for Qwen 1.5 (2048-256) shell: bash run: | - sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_437.yaml - name: Test on igpu for Qwen 1.5 (2048-256) @@ -675,14 +592,14 @@ jobs: shell: bash run: | sed -i 's/2048-256/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml - name: Test on igpu (load_low_bit 1024-128) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.31.0 + pip install transformers==4.36.2 set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 @@ -697,33 +614,10 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for Mistral (load_low_bit 1024-128) - shell: bash - run: | - sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml - - - name: Test on igpu for Mistral (load_low_bit 1024-128) - shell: cmd - run: | - call conda activate igpu-perf - pip install transformers==4.34.0 - - set SYCL_CACHE_PERSISTENT=1 - set BIGDL_LLM_XMX_DISABLED=1 - - cd python\llm\dev\benchmark\all-in-one - move ..\..\..\test\benchmark\igpu-perf\1024-128_loadlowbit_434.yaml config.yaml - set PYTHONIOENCODING=utf-8 - python run.py >> %CSV_SAVE_PATH%\1024-128_loadlowbit\log\%LOG_FILE% 2>&1 - if %ERRORLEVEL% neq 0 (exit /b 1) - - call conda deactivate - - name: Prepare igpu perf test for Qwen 1.5 (load_low_bit 1024-128) shell: bash run: | - sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_437.yaml - name: Test on igpu for Qwen 1.5 (load_low_bit 1024-128) @@ -763,14 +657,14 @@ jobs: - name: Prepare igpu perf test (int4+fp16 1024-128) shell: bash run: | - sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml - name: Test on igpu (int4+fp16 1024-128) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.31.0 + pip install transformers==4.36.2 set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 @@ -785,33 +679,10 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for Mistral (int4+fp16 1024-128) - shell: bash - run: | - sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml - - - name: Test on igpu for Mistral (int4+fp16 1024-128) - shell: cmd - run: | - call conda activate igpu-perf - pip install transformers==4.34.0 - - set SYCL_CACHE_PERSISTENT=1 - set BIGDL_LLM_XMX_DISABLED=1 - - cd python\llm\dev\benchmark\all-in-one - move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_434.yaml config.yaml - set PYTHONIOENCODING=utf-8 - python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1 - if %ERRORLEVEL% neq 0 (exit /b 1) - - call conda deactivate - - name: Prepare igpu perf test for Qwen 1.5 (int4+fp16 1024-128) shell: bash run: | - sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_437.yaml - name: Test on igpu for Qwen 1.5 (int4+fp16 1024-128) diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml index 6d3dd610006..d565be1347b 100644 --- a/.github/workflows/llm_unit_tests.yml +++ b/.github/workflows/llm_unit_tests.yml @@ -99,7 +99,7 @@ jobs: echo "LLAMA_ORIGIN_PATH=${ORIGIN_DIR}/llama-7b-hf" >> "$GITHUB_ENV" echo "BLOOM_ORIGIN_PATH=${ORIGIN_DIR}/bloom-7b1" >> "$GITHUB_ENV" echo "ORIGINAL_CHATGLM2_6B_PATH=${ORIGIN_DIR}/chatglm2-6b" >> "$GITHUB_ENV" - echo "ORIGINAL_REPLIT_CODE_PATH=${ORIGIN_DIR}/replit-code-v1-3b" >> "$GITHUB_ENV" + echo "ORIGINAL_CODESHELL_7B_PATH=${ORIGIN_DIR}/CodeShell-7B-Chat" >> "$GITHUB_ENV" echo "ORIGINAL_WHISPER_TINY_PATH=${ORIGIN_DIR}/whisper-tiny" >> "$GITHUB_ENV" echo "MISTRAL_ORIGIN_PATH=${ORIGIN_DIR}/Mistral-7B-v0.1" >> "$GITHUB_ENV" echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV" @@ -157,13 +157,13 @@ jobs: # fi if [ ! -d $ORIGINAL_CHATGLM2_6B_PATH ]; then echo "Directory $ORIGINAL_CHATGLM2_6B_PATH not found. Downloading from FTP server..." - echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR" + echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR" wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR fi - if [ ! -d $ORIGINAL_REPLIT_CODE_PATH ]; then - echo "Directory $ORIGINAL_REPLIT_CODE_PATH not found. Downloading from FTP server..." - echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/replit-code-v1-3b -P $ORIGIN_DIR" - wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/replit-code-v1-3b -P $ORIGIN_DIR + if [ ! -d $ORIGINAL_CODESHELL_7B_PATH ]; then + echo "Directory $ORIGINAL_CODESHELL_7B_PATH not found. Downloading from FTP server..." + echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/CodeShell-7B-Chat -P $ORIGIN_DIR" + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/CodeShell-7B-Chat -P $ORIGIN_DIR fi if [ ! -d $ORIGINAL_WHISPER_TINY_PATH ]; then echo "Directory $ORIGINAL_WHISPER_TINY_PATH not found. Downloading from FTP server..." @@ -226,7 +226,7 @@ jobs: shell: bash run: | pip install llama-index-readers-file llama-index-vector-stores-postgres llama-index-embeddings-huggingface - pip install transformers==4.36.0 + pip install transformers==4.36.2 pip install "pydantic>=2.0.0" bash python/llm/test/run-llm-llamaindex-tests.sh - name: Run sentence-transformers uninstallation @@ -234,6 +234,7 @@ jobs: shell: bash run: | pip uninstall sentence-transformers -y || true + llm-unit-test-on-arc: needs: [setup-python-version, llm-cpp-build] strategy: @@ -364,8 +365,6 @@ jobs: fi python -m pip install datasets librosa soundfile einops tiktoken transformers_stream_generator bash python/llm/test/run-llm-inference-tests-gpu.sh - python -m pip install transformers==4.34.0 - bash python/llm/test/run-llm-inference-tests-gpu-434.sh - name: Run LLM example tests shell: bash @@ -428,7 +427,7 @@ jobs: pip install --pre --upgrade ipex-llm[xpu_2.0] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ source /home/arda/intel/oneapi/setvars.sh fi - pip install transformers==4.36.0 + pip install transformers==4.36.2 pip install "pydantic>=2.0.0" bash python/llm/test/run-llm-llamaindex-tests-gpu.sh - name: Run sentence-transformers uninstallation diff --git a/python/llm/setup.py b/python/llm/setup.py index e2a180c7f95..6cf022747f3 100644 --- a/python/llm/setup.py +++ b/python/llm/setup.py @@ -53,7 +53,7 @@ cpu_torch_version = ["torch==2.1.2+cpu;platform_system=='Linux'", "torch==2.1.2;platform_system=='Windows'"] CONVERT_DEP = ['numpy == 1.26.4', # lastet 2.0.0b1 will cause error - 'transformers == 4.31.0', 'sentencepiece', 'tokenizers == 0.13.3', + 'transformers == 4.36.2', 'sentencepiece', 'tokenizers == 0.15.2', # TODO: Support accelerate 0.22.0 'accelerate == 0.21.0', 'tabulate'] + cpu_torch_version @@ -279,10 +279,9 @@ def setup_package(): # Add internal requires for llama-index llama_index_requires = copy.deepcopy(all_requires) - for exclude_require in ['transformers == 4.31.0', 'tokenizers == 0.13.3'] + cpu_torch_version: + for exclude_require in cpu_torch_version: llama_index_requires.remove(exclude_require) llama_index_requires += ["torch<2.2.0", - "transformers>=4.34.0,<4.39.0", "sentence-transformers~=2.6.1"] diff --git a/python/llm/src/ipex_llm/optimize.py b/python/llm/src/ipex_llm/optimize.py index d69895ec2c2..86db591ca9b 100644 --- a/python/llm/src/ipex_llm/optimize.py +++ b/python/llm/src/ipex_llm/optimize.py @@ -47,7 +47,8 @@ def _save_low_bit(self, save_dir, *args, **kwargs): if isinstance(self, PreTrainedModel): # We borrowed this method to adapt to Transformer model cases # as much as possible, and later we may merge these two situations - self.save_pretrained(save_dir) + kwargs['safe_serialization'] = False + self.save_pretrained(save_dir, *args, **kwargs) else: # TODO: For the lowbit model still larger than 8GB, # save it into shards. diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml index 47f74b20e7e..895588ce4e4 100644 --- a/python/llm/test/benchmark/arc-perf-test.yaml +++ b/python/llm/test/benchmark/arc-perf-test.yaml @@ -10,13 +10,14 @@ repo_id: - 'databricks/dolly-v1-6b' - 'databricks/dolly-v2-7b' - 'databricks/dolly-v2-12b' - - 'internlm/internlm-chat-7b-8k' + - 'internlm/internlm-chat-7b' - 'Qwen/Qwen-7B-Chat' - 'BAAI/AquilaChat-7B' - 'baichuan-inc/Baichuan2-7B-Chat' - 'baichuan-inc/Baichuan2-13B-Chat-4bit' - 'bigscience/bloomz-7b1' - - 'fnlp/moss-moon-003-sft-4bit' +# - 'fnlp/moss-moon-003-sft-4bit' # moss-moon-003-sft cannot work on transformers 4.34+ + - 'mistralai/Mistral-7B-v0.1' local_model_hub: '/mnt/disk1/models' warm_up: 1 num_trials: 3 @@ -31,7 +32,7 @@ test_api: - "transformer_int4_gpu" # on Intel GPU cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) exclude: - - 'fnlp/moss-moon-003-sft-4bit:1024' - - 'fnlp/moss-moon-003-sft-4bit:2048' +# - 'fnlp/moss-moon-003-sft-4bit:1024' +# - 'fnlp/moss-moon-003-sft-4bit:2048' - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048' - 'bigscience/bloomz-7b1:2048' diff --git a/python/llm/test/benchmark/arc-perf-transformers-434.yaml b/python/llm/test/benchmark/arc-perf-transformers-434.yaml deleted file mode 100644 index 1389e44ab5a..00000000000 --- a/python/llm/test/benchmark/arc-perf-transformers-434.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# For the models that require transformers 4.34.0 -repo_id: - - 'mistralai/Mistral-7B-v0.1' -local_model_hub: '/mnt/disk1/models' -warm_up: 1 -num_trials: 3 -num_beams: 1 # default to greedy search -low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) -batch_size: 1 # default to 1 -in_out_pairs: - - '32-32' - - '1024-128' - - '2048-256' -test_api: - - "transformer_int4_gpu" # on Intel GPU -cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) diff --git a/python/llm/test/benchmark/igpu-perf/1024-128.yaml b/python/llm/test/benchmark/igpu-perf/1024-128.yaml index df27dde503d..5584aba3413 100644 --- a/python/llm/test/benchmark/igpu-perf/1024-128.yaml +++ b/python/llm/test/benchmark/igpu-perf/1024-128.yaml @@ -12,10 +12,11 @@ repo_id: - 'WisdomShell/CodeShell-7B-Chat' - 'tiiuae/falcon-7b-instruct-with-patch' - 'mosaicml/mpt-7b-chat' - - 'liuhaotian/llava-v1.5-7b' +# - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+ - 'RWKV/rwkv-4-world-7b' - 'RWKV/rwkv-5-world-7b' - 'IEITYuan/Yuan2-2B-hf' + - 'mistralai/Mistral-7B-Instruct-v0.1' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_434.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_434.yaml deleted file mode 100644 index b4b1e9b7a4f..00000000000 --- a/python/llm/test/benchmark/igpu-perf/1024-128_434.yaml +++ /dev/null @@ -1,13 +0,0 @@ -repo_id: - - 'mistralai/Mistral-7B-Instruct-v0.1' -local_model_hub: 'path to your local model hub' -warm_up: 1 -num_trials: 3 -num_beams: 1 # default to greedy search -low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) -batch_size: 1 # default to 1 -in_out_pairs: - - '1024-128' -test_api: - - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) -cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml index 7425cd45306..a073c5cb77c 100644 --- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml +++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml @@ -12,10 +12,11 @@ repo_id: - 'WisdomShell/CodeShell-7B-Chat' - 'tiiuae/falcon-7b-instruct-with-patch' - 'mosaicml/mpt-7b-chat' - - 'liuhaotian/llava-v1.5-7b' +# - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+ # - 'RWKV/rwkv-4-world-7b' # - 'RWKV/rwkv-5-world-7b' - 'IEITYuan/Yuan2-2B-hf' + - 'mistralai/Mistral-7B-Instruct-v0.1' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml deleted file mode 100644 index 57f0a3d3c8e..00000000000 --- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml +++ /dev/null @@ -1,13 +0,0 @@ -repo_id: - - 'mistralai/Mistral-7B-Instruct-v0.1' -local_model_hub: 'path to your local model hub' -warm_up: 1 -num_trials: 3 -num_beams: 1 # default to greedy search -low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) -batch_size: 1 # default to 1 -in_out_pairs: - - '1024-128' -test_api: - - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer -cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml index 1afe8567600..fee01274064 100644 --- a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml +++ b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml @@ -12,10 +12,11 @@ repo_id: - 'WisdomShell/CodeShell-7B-Chat' - 'tiiuae/falcon-7b-instruct-with-patch' - 'mosaicml/mpt-7b-chat' - - 'liuhaotian/llava-v1.5-7b' +# - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+ - 'RWKV/rwkv-4-world-7b' - 'RWKV/rwkv-5-world-7b' - 'IEITYuan/Yuan2-2B-hf' + - 'mistralai/Mistral-7B-Instruct-v0.1' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml deleted file mode 100644 index 51453bd1b6a..00000000000 --- a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml +++ /dev/null @@ -1,13 +0,0 @@ -repo_id: - - 'mistralai/Mistral-7B-Instruct-v0.1' -local_model_hub: 'path to your local model hub' -warm_up: 1 -num_trials: 3 -num_beams: 1 # default to greedy search -low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) -batch_size: 1 # default to 1 -in_out_pairs: - - '1024-128' -test_api: - - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) -cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) diff --git a/python/llm/test/benchmark/igpu-perf/2048-256.yaml b/python/llm/test/benchmark/igpu-perf/2048-256.yaml index 0fabc75e580..7e64f188964 100644 --- a/python/llm/test/benchmark/igpu-perf/2048-256.yaml +++ b/python/llm/test/benchmark/igpu-perf/2048-256.yaml @@ -12,10 +12,11 @@ repo_id: - 'WisdomShell/CodeShell-7B-Chat' - 'tiiuae/falcon-7b-instruct-with-patch' - 'mosaicml/mpt-7b-chat' - - 'liuhaotian/llava-v1.5-7b' +# - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+ - 'RWKV/rwkv-4-world-7b' - 'RWKV/rwkv-5-world-7b' - 'IEITYuan/Yuan2-2B-hf' + - 'mistralai/Mistral-7B-Instruct-v0.1' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/2048-256_434.yaml b/python/llm/test/benchmark/igpu-perf/2048-256_434.yaml deleted file mode 100644 index b16e5493017..00000000000 --- a/python/llm/test/benchmark/igpu-perf/2048-256_434.yaml +++ /dev/null @@ -1,13 +0,0 @@ -repo_id: - - 'mistralai/Mistral-7B-Instruct-v0.1' -local_model_hub: 'path to your local model hub' -warm_up: 1 -num_trials: 3 -num_beams: 1 # default to greedy search -low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) -batch_size: 1 # default to 1 -in_out_pairs: - - '2048-256' -test_api: - - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) -cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) diff --git a/python/llm/test/benchmark/igpu-perf/32-32.yaml b/python/llm/test/benchmark/igpu-perf/32-32.yaml index 681c8a69ce5..20f6cb7b571 100644 --- a/python/llm/test/benchmark/igpu-perf/32-32.yaml +++ b/python/llm/test/benchmark/igpu-perf/32-32.yaml @@ -12,10 +12,11 @@ repo_id: - 'WisdomShell/CodeShell-7B-Chat' - 'tiiuae/falcon-7b-instruct-with-patch' - 'mosaicml/mpt-7b-chat' - - 'liuhaotian/llava-v1.5-7b' +# - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+ - 'RWKV/rwkv-4-world-7b' - 'RWKV/rwkv-5-world-7b' - 'IEITYuan/Yuan2-2B-hf' + - 'mistralai/Mistral-7B-Instruct-v0.1' local_model_hub: 'path to your local model hub' warm_up: 3 num_trials: 5 diff --git a/python/llm/test/benchmark/igpu-perf/32-32_434.yaml b/python/llm/test/benchmark/igpu-perf/32-32_434.yaml deleted file mode 100644 index 6b5c4229b54..00000000000 --- a/python/llm/test/benchmark/igpu-perf/32-32_434.yaml +++ /dev/null @@ -1,13 +0,0 @@ -repo_id: - - 'mistralai/Mistral-7B-Instruct-v0.1' -local_model_hub: 'path to your local model hub' -warm_up: 3 -num_trials: 5 -num_beams: 1 # default to greedy search -low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) -batch_size: 1 # default to 1 -in_out_pairs: - - '32-32' -test_api: - - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) -cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) diff --git a/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml b/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml index aa9158bdd13..92b12750dbb 100644 --- a/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml +++ b/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml @@ -6,7 +6,7 @@ repo_id: - 'baichuan-inc/Baichuan2-7B-Chat' - 'baichuan-inc/Baichuan2-13B-Chat' - 'Qwen/Qwen-14B-Chat' -local_model_hub: '/models' +local_model_hub: '/mnt/disk1/models' warm_up: 1 num_trials: 3 num_beams: 1 # default to greedy search diff --git a/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml b/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml index 38aeb375910..f8c75489659 100644 --- a/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml +++ b/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml @@ -6,7 +6,7 @@ repo_id: - 'baichuan-inc/Baichuan2-7B-Chat' - 'baichuan-inc/Baichuan2-13B-Chat' - 'Qwen/Qwen-14B-Chat' -local_model_hub: '/models' +local_model_hub: '/mnt/disk1/models' warm_up: 3 num_trials: 50 num_beams: 1 # default to greedy search diff --git a/python/llm/test/inference/test_transformers_api.py b/python/llm/test/inference/test_transformers_api.py index 1a72801cc1a..f16773c62c3 100644 --- a/python/llm/test/inference/test_transformers_api.py +++ b/python/llm/test/inference/test_transformers_api.py @@ -49,16 +49,16 @@ def test_transformers_auto_model_int4(self): print('Prompt:', input_str) print('Output:', output_str) print(f'Inference time: {end-st} s') - res = 'Paris' in output_str + res = 'Paris' in output_str self.assertTrue(res) def test_transformers_auto_model_for_causal_lm_int4(self): - model_path = os.environ.get('ORIGINAL_REPLIT_CODE_PATH') + model_path = os.environ.get('ORIGINAL_CODESHELL_7B_PATH') tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) input_str = 'def hello():\n print("hello world")\n' model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True) with torch.inference_mode(): - + st = time.time() input_ids = tokenizer.encode(input_str, return_tensors="pt") output = model.generate(input_ids, do_sample=False, max_new_tokens=32) @@ -67,7 +67,7 @@ def test_transformers_auto_model_for_causal_lm_int4(self): print('Prompt:', input_str) print('Output:', output_str) print(f'Inference time: {end-st} s') - res = '\nhello()' in output_str + res = '\nhello()' in output_str self.assertTrue(res) @@ -86,7 +86,7 @@ def test_transformers_auto_model_for_speech_seq2seq_int4(self): predicted_ids = model.generate(input_features) # decode token ids to text transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False) - end = time.time() + end = time.time() print('Output:', transcription) print(f'Inference time: {end-st} s') res = 'Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.' in transcription[0] @@ -108,7 +108,7 @@ def test_transformers_chatglm_for_causallm(self): print('Prompt:', input_str) print('Output:', output_str) print(f'Inference time: {end-st} s') - res = 'Paris' in output_str + res = 'Paris' in output_str self.assertTrue(res) @pytest.mark.parametrize('prompt, answer', [ @@ -116,6 +116,7 @@ def test_transformers_chatglm_for_causallm(self): ]) @pytest.mark.parametrize('Model, Tokenizer, model_path',[ (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH')), + (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH')), ]) def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer): tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True) @@ -123,7 +124,7 @@ def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer): load_in_4bit=True, optimize_model=True, trust_remote_code=True) - + with tempfile.TemporaryDirectory() as tempdir: model.save_low_bit(tempdir) loaded_model = Model.load_low_bit(tempdir, @@ -143,9 +144,10 @@ def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer): (AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA_ORIGIN_PATH'), prompt), (AutoModelForCausalLM, AutoTokenizer, os.environ.get('BLOOM_ORIGIN_PATH'), prompt), (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH'), prompt), - (AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_REPLIT_CODE_PATH'), prompt) + (AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_CODESHELL_7B_PATH'), prompt), + (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH'), prompt) ]) - + def test_optimize_model(Model, Tokenizer, model_path, prompt): tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True) input_ids = tokenizer.encode(prompt, return_tensors="pt") diff --git a/python/llm/test/inference/test_transformesr_api_434.py b/python/llm/test/inference/test_transformesr_api_434.py deleted file mode 100644 index 4de49e660ae..00000000000 --- a/python/llm/test/inference/test_transformesr_api_434.py +++ /dev/null @@ -1,80 +0,0 @@ -# -# Copyright 2016 The BigDL Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os -import pytest -import tempfile -import torch - -from ipex_llm.transformers import AutoModelForCausalLM -from transformers import AutoTokenizer - - -mistral_model_path = os.environ.get('MISTRAL_ORIGIN_PATH') - -prompt = "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" - -@pytest.mark.parametrize("Model, Tokenizer, model_path, prompt", [ - (AutoModelForCausalLM, AutoTokenizer, mistral_model_path, prompt) -]) - -def test_optimize_model(Model, Tokenizer, model_path, prompt): - tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True) - input_ids = tokenizer.encode(prompt, return_tensors="pt") - - model = Model.from_pretrained(model_path, - load_in_4bit=True, - optimize_model=False, - trust_remote_code=True) - logits_base_model = (model(input_ids)).logits - - model = Model.from_pretrained(model_path, - load_in_4bit=True, - optimize_model=True, - trust_remote_code=True) - logits_optimized_model = (model(input_ids)).logits - diff = abs(logits_base_model - logits_optimized_model).flatten() - - assert any(diff) is False - -@pytest.mark.parametrize('prompt, answer', [ - ('What is the capital of France?\n\n', 'Paris') - ]) -@pytest.mark.parametrize('Model, Tokenizer, model_path',[ - (AutoModelForCausalLM, AutoTokenizer, mistral_model_path), - ]) -def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer): - tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True) - model = Model.from_pretrained(model_path, - load_in_4bit=True, - optimize_model=True, - trust_remote_code=True) - - with tempfile.TemporaryDirectory() as tempdir: - model.save_low_bit(tempdir) - loaded_model = Model.load_low_bit(tempdir, - optimize_model=True, - trust_remote_code=True) - - with torch.inference_mode(): - input_ids = tokenizer.encode(prompt, return_tensors="pt") - output = loaded_model.generate(input_ids, max_new_tokens=32) - output_str = tokenizer.decode(output[0], skip_special_tokens=True) - - assert answer in output_str - -if __name__ == '__main__': - pytest.main([__file__]) diff --git a/python/llm/test/inference_gpu/test_transformers_api_attention.py b/python/llm/test/inference_gpu/test_transformers_api_attention.py index b03ddaf9d2e..149d81a34c8 100644 --- a/python/llm/test/inference_gpu/test_transformers_api_attention.py +++ b/python/llm/test/inference_gpu/test_transformers_api_attention.py @@ -104,8 +104,8 @@ def replace_forward_hook(module, input, output, layer_name): if isinstance(t1, torch.Tensor) and isinstance(t2, torch.Tensor): # 'attn_output' is of type torch.Tensor. attn_output_diff.append(t1 - t2) - else: - # 'past_key_value'is of type tuple as default. + elif isinstance(t1, tuple) and isinstance(t2, tuple): + # if 'past_key_value'is of type tuple for i, (t3, t4) in enumerate(zip(t1, t2)): if model.config.architectures[0] == "ChatGLMModel" and \ hasattr(model.config, 'padded_vocab_size') and \ @@ -114,6 +114,10 @@ def replace_forward_hook(module, input, output, layer_name): # We need to narrow it here. t4 = t4[:, :, 15:17, :] attn_output_diff.append(t3 - t4) + else: + # if 'past_key_value'is of type Cache, get last layer cache pair (key, value) + attn_output_diff.append(t1[-1][0] - t2[-1][0]) + attn_output_diff.append(t1[-1][1] - t2[-1][1]) max_diff_tensor = [torch.max(item).item() for item in attn_output_diff] print(max_diff_tensor) diff --git a/python/llm/test/inference_gpu/test_transformers_api_mlp.py b/python/llm/test/inference_gpu/test_transformers_api_mlp.py index e3273ad574e..70ba2e7b9f6 100644 --- a/python/llm/test/inference_gpu/test_transformers_api_mlp.py +++ b/python/llm/test/inference_gpu/test_transformers_api_mlp.py @@ -96,9 +96,14 @@ def replace_forward_hook(module, input, output, layer_name): for i, (t1, t2) in enumerate(zip(layer_tensor, opt_layer_tensor)): if isinstance(t1, torch.Tensor) and isinstance(t2, torch.Tensor): MLP_output_diff.append(t1 - t2) - else: + elif isinstance(t1, tuple) and isinstance(t2, tuple): + # if 'past_key_value'is of type tuple for i, (t3, t4) in enumerate(zip(t1, t2)): MLP_output_diff.append(t3 - t4) + else: + # if 'past_key_value'is of type Cache, get last layer cache pair (key, value) + MLP_output_diff.append(t1[-1][0] - t2[-1][0]) + MLP_output_diff.append(t1[-1][1] - t2[-1][1]) max_diff_tensor = [torch.max(item).item() for item in MLP_output_diff] print(max_diff_tensor) diff --git a/python/llm/test/langchain/test_transformers_api.py b/python/llm/test/langchain/test_transformers_api.py index cbaaa1e0ba7..ad139c74dc6 100644 --- a/python/llm/test/langchain/test_transformers_api.py +++ b/python/llm/test/langchain/test_transformers_api.py @@ -38,7 +38,7 @@ class Test_Langchain_Transformers_API(TestCase): def setUp(self): self.auto_model_path = os.environ.get('ORIGINAL_CHATGLM2_6B_PATH') - self.auto_causal_model_path = os.environ.get('ORIGINAL_REPLIT_CODE_PATH') + self.auto_causal_model_path = os.environ.get('ORIGINAL_CODESHELL_7B_PATH') self.llama_model_path = os.environ.get('LLAMA_ORIGIN_PATH') self.bloom_model_path = os.environ.get('BLOOM_ORIGIN_PATH') thread_num = os.environ.get('THREAD_NUM') @@ -79,12 +79,12 @@ def test_transformers_llama_embeddings(self): def test_qa_chain(self): texts = ''' - AI is a machine’s ability to perform the cognitive functions - we associate with human minds, such as perceiving, reasoning, + AI is a machine’s ability to perform the cognitive functions + we associate with human minds, such as perceiving, reasoning, learning, interacting with an environment, problem solving, - and even exercising creativity. You’ve probably interacted - with AI even if you didn’t realize it—voice assistants like Siri - and Alexa are founded on AI technology, as are some customer + and even exercising creativity. You’ve probably interacted + with AI even if you didn’t realize it—voice assistants like Siri + and Alexa are founded on AI technology, as are some customer service chatbots that pop up to help you navigate websites. ''' text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) @@ -102,16 +102,16 @@ def test_qa_chain(self): res = "AI" in output self.assertTrue(res) - + """ def test_qa_chain_causalLM(self): texts = ''' - AI is a machine’s ability to perform the cognitive functions - we associate with human minds, such as perceiving, reasoning, + AI is a machine’s ability to perform the cognitive functions + we associate with human minds, such as perceiving, reasoning, learning, interacting with an environment, problem solving, - and even exercising creativity. You’ve probably interacted - with AI even if you didn’t realize it—voice assistants like Siri - and Alexa are founded on AI technology, as are some customer + and even exercising creativity. You’ve probably interacted + with AI even if you didn’t realize it—voice assistants like Siri + and Alexa are founded on AI technology, as are some customer service chatbots that pop up to help you navigate websites. ''' text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) @@ -129,7 +129,7 @@ def test_qa_chain_causalLM(self): res = "AI" in output self.assertTrue(res) """ - + def test_embed_kwargs(self): embeddings = TransformersEmbeddings.from_model_id(model_id=self.llama_model_path) encode_kwargs = {"truncation": True, "max_length": 512} diff --git a/python/llm/test/run-llm-inference-tests-gpu-434.sh b/python/llm/test/run-llm-inference-tests-gpu-434.sh deleted file mode 100644 index 91a1676ddf8..00000000000 --- a/python/llm/test/run-llm-inference-tests-gpu-434.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT} -export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src -export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/inference_gpu - -export USE_XETLA=OFF -export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 -export DEVICE='xpu' - -set -e - -echo "# Start testing inference" -start=$(date "+%s") - -# if [ -z "$THREAD_NUM" ]; then -# THREAD_NUM=2 -# fi -# export OMP_NUM_THREADS=$THREAD_NUM -export BIGDL_LLM_XMX_DISABLED=1 -pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_attention.py -v -s -k "Mistral" -pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_mlp.py -v -s -k "Mistral" -pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_RMSNorm.py -v -s -k "Mistral" -unset BIGDL_LLM_XMX_DISABLED - -now=$(date "+%s") -time=$((now-start)) - -echo "Bigdl-llm gpu inference tests for transformers 4.34.0 finished" -echo "Time used:$time seconds" diff --git a/python/llm/test/run-llm-inference-tests-gpu.sh b/python/llm/test/run-llm-inference-tests-gpu.sh index ea1abb519f4..5e48c0df876 100644 --- a/python/llm/test/run-llm-inference-tests-gpu.sh +++ b/python/llm/test/run-llm-inference-tests-gpu.sh @@ -21,9 +21,9 @@ pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api.py -v -s pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_layernorm.py -v -s export BIGDL_LLM_XMX_DISABLED=1 pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_final_logits.py -v -s -pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_attention.py -v -s -k "not Mistral" -pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_mlp.py -v -s -k "not Mistral" -pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_RMSNorm.py -v -s -k "not Mistral" +pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_attention.py -v -s +pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_mlp.py -v -s +pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_RMSNorm.py -v -s unset BIGDL_LLM_XMX_DISABLED now=$(date "+%s") diff --git a/python/llm/test/run-llm-inference-tests.sh b/python/llm/test/run-llm-inference-tests.sh index e53528dbb56..d3c3c0690ef 100644 --- a/python/llm/test/run-llm-inference-tests.sh +++ b/python/llm/test/run-llm-inference-tests.sh @@ -18,10 +18,6 @@ export OMP_NUM_THREADS=$THREAD_NUM python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_transformers_api.py -v python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_optimize_model_api.py -v -python -m pip install transformers==4.34.0 -python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_transformesr_api_434.py -v -python -m pip install transformers==4.31.0 - now=$(date "+%s") time=$((now-start))