diff --git a/.github/actions/llm/cli-test-windows/action.yml b/.github/actions/llm/cli-test-windows/action.yml deleted file mode 100644 index 4ebd5a0d3fc..00000000000 --- a/.github/actions/llm/cli-test-windows/action.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: "llm-cli Flow Verification (Windows)" -description: "Verify the llm-cli flow on Windows" - -runs: - using: "composite" - steps: - - name: Test llama llm-cli - shell: powershell - run: | - llm-cli.ps1 -t $env:THREAD_NUM -n 256 -x llama -m $env:LLAMA_INT4_CKPT_PATH -p 'Once upon a time,' - - - name: Test gptneox llm-cli - shell: powershell - run: | - llm-cli.ps1 -t $env:THREAD_NUM -n 256 -x gptneox -m $env:GPTNEOX_INT4_CKPT_PATH -p 'Once upon a time,' - - - name: Test bloom llm-cli - shell: powershell - run: | - llm-cli.ps1 -t $env:THREAD_NUM -n 256 -x bloom -m $env:BLOOM_INT4_CKPT_PATH -p 'Once upon a time,' - - # - name: Test starcoder llm-cli - # shell: powershell - # run: | - # llm-cli.ps1 -t $env:THREAD_NUM -x starcoder -m $env:STARCODER_INT4_CKPT_PATH -p 'def check_odd(' \ No newline at end of file diff --git a/.github/workflows/llm-c-evaluation.yml b/.github/workflows/llm-c-evaluation.yml index 9ca18276c75..cb890f19eec 100644 --- a/.github/workflows/llm-c-evaluation.yml +++ b/.github/workflows/llm-c-evaluation.yml @@ -10,12 +10,12 @@ permissions: # Controls when the action will run. on: - schedule: - - cron: "00 15 * * *" # GMT time, 15:00 GMT == 23:00 Beijing Time - pull_request: - branches: [main] - paths: - - ".github/workflows/llm-c-evaluation.yml" + # schedule: + # - cron: "00 15 * * *" # GMT time, 15:00 GMT == 23:00 Beijing Time + # pull_request: + # branches: [main] + # paths: + # - ".github/workflows/llm-c-evaluation.yml" # Allows you to run this workflow manually from the Actions tab workflow_dispatch: inputs: @@ -204,7 +204,7 @@ jobs: pip install pandas==1.5.3 - name: Download ceval results - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4.1.7 with: name: ceval_results path: results @@ -259,7 +259,7 @@ jobs: fi - name: Download ceval results - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4.1.7 with: name: results_${{ needs.set-matrix.outputs.date }} path: ${{ env.ACC_FOLDER }} diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index e3e1993a9c0..839393bb49c 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -10,12 +10,12 @@ permissions: # Controls when the action will run. 
on: - schedule: - - cron: "30 12 * * *" # GMT time, 12:30 GMT == 20:30 China - pull_request: - branches: [main] - paths: - - ".github/workflows/llm-harness-evaluation.yml" + # schedule: + # - cron: "30 12 * * *" # GMT time, 12:30 GMT == 20:30 China + # pull_request: + # branches: [main] + # paths: + # - ".github/workflows/llm-harness-evaluation.yml" # Allows you to run this workflow manually from the Actions tab workflow_dispatch: inputs: @@ -220,7 +220,7 @@ jobs: pip install --upgrade pip pip install jsonlines pytablewriter regex - name: Download all results - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4.1.7 with: name: harness_results path: results @@ -260,7 +260,7 @@ jobs: fi - name: Download harness results - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4.1.7 with: name: harness_results path: ${{ env.ACC_FOLDER}}/${{ env.DATE }} diff --git a/.github/workflows/llm-ppl-evaluation.yml b/.github/workflows/llm-ppl-evaluation.yml index 7c2037ff318..6a64502ffbb 100644 --- a/.github/workflows/llm-ppl-evaluation.yml +++ b/.github/workflows/llm-ppl-evaluation.yml @@ -10,12 +10,12 @@ permissions: # Controls when the action will run. on: - schedule: - - cron: "00 12 * * *" # GMT time, 12:00 GMT == 20:00 China - pull_request: - branches: [main] - paths: - - ".github/workflows/llm-ppl-evaluation.yml" + # schedule: + # - cron: "00 12 * * *" # GMT time, 12:00 GMT == 20:00 China + # pull_request: + # branches: [main] + # paths: + # - ".github/workflows/llm-ppl-evaluation.yml" # Allows you to run this workflow manually from the Actions tab workflow_dispatch: inputs: @@ -206,7 +206,7 @@ jobs: pip install --upgrade pip pip install jsonlines pytablewriter regex - name: Download all results - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4.1.7 with: name: ppl_results path: results @@ -245,7 +245,7 @@ jobs: fi - name: Download ppl results - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4.1.7 with: name: ppl_results path: ${{ env.ACC_FOLDER}}/${{ env.DATE }} diff --git a/.github/workflows/llm-whisper-evaluation.yml b/.github/workflows/llm-whisper-evaluation.yml index e60eadbf1df..538e10e56b0 100644 --- a/.github/workflows/llm-whisper-evaluation.yml +++ b/.github/workflows/llm-whisper-evaluation.yml @@ -10,12 +10,12 @@ permissions: # Controls when the action will run. 
on: - schedule: - - cron: "00 13 * * *" # GMT time, 13:00 GMT == 21:00 China - pull_request: - branches: [main] - paths: - - ".github/workflows/llm-whisper-evaluation.yml" + # schedule: + # - cron: "00 13 * * *" # GMT time, 13:00 GMT == 21:00 China + # pull_request: + # branches: [main] + # paths: + # - ".github/workflows/llm-whisper-evaluation.yml" # Allows you to run this workflow manually from the Actions tab workflow_dispatch: inputs: @@ -176,14 +176,14 @@ jobs: - name: Download all results for nightly run if: github.event_name == 'schedule' - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4.1.7 with: name: whisper_results path: ${{ env.NIGHTLY_FOLDER}}/${{ env.OUTPUT_PATH }} - name: Download all results for pr run if: github.event_name == 'pull_request' - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4.1.7 with: name: whisper_results path: ${{ env.PR_FOLDER}}/${{ env.OUTPUT_PATH }} diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml index 9185e4282d5..44c3fb2ec7e 100644 --- a/.github/workflows/llm_performance_tests.yml +++ b/.github/workflows/llm_performance_tests.yml @@ -130,10 +130,11 @@ jobs: if: ${{ github.event.schedule || (github.event_name == 'workflow_dispatch' && (inputs.checkout-ref == 'main')) }} shell: bash run: | - pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ test_version_date=`date -d 'yesterday' '+%Y%m%d'` - if ! pip show ipex-llm | grep $test_version_date; then - echo "Did not install ipex-llm with excepted version $test_version_date" + test_version=2.2.0b$test_version_date + pip install --pre --upgrade ipex-llm[xpu]==$test_version --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ + if ! 
pip show ipex-llm | grep $test_version; then + echo "Did not install ipex-llm with expected version $test_version" exit 1 fi @@ -153,7 +154,8 @@ jobs: source /opt/intel/oneapi/setvars.sh export USE_XETLA=OFF export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 - cp python/llm/test/benchmark/arc-perf-test.yaml python/llm/dev/benchmark/all-in-one/config.yaml + pip install transformers==4.41.2 trl + cp python/llm/test/benchmark/arc-perf-transformers-436.yaml python/llm/dev/benchmark/all-in-one/config.yaml cd python/llm/dev/benchmark/all-in-one mkdir test_batch1 mkdir test_batch2 @@ -167,7 +169,7 @@ jobs: mv *.csv test_batch1 # batch_size 2 cd ../../../../../ - cp python/llm/test/benchmark/arc-perf-test-batch2.yaml python/llm/dev/benchmark/all-in-one/config.yaml + cp python/llm/test/benchmark/arc-perf-transformers-436-batch2.yaml python/llm/dev/benchmark/all-in-one/config.yaml cd python/llm/dev/benchmark/all-in-one # change csv name sed -i 's/batch1/batch2/g' run.py @@ -175,7 +177,7 @@ jobs: mv *.csv test_batch2 # batch_size 4 cd ../../../../../ - cp python/llm/test/benchmark/arc-perf-test-batch4.yaml python/llm/dev/benchmark/all-in-one/config.yaml + cp python/llm/test/benchmark/arc-perf-transformers-436-batch4.yaml python/llm/dev/benchmark/all-in-one/config.yaml cd python/llm/dev/benchmark/all-in-one # change csv name sed -i 's/batch2/batch4/g' run.py @@ -188,8 +190,8 @@ jobs: source /opt/intel/oneapi/setvars.sh export USE_XETLA=OFF export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 - # upgrade transformers for model Qwen/Qwen1.5-7B-Chat - python -m pip install transformers==4.37.0 + # upgrade for default transformers version + python -m pip install transformers==4.41.2 trl # batch_size 1 cp python/llm/test/benchmark/arc-perf-transformers-437.yaml python/llm/dev/benchmark/all-in-one/config.yaml cd python/llm/dev/benchmark/all-in-one @@ -221,7 +223,7 @@ jobs: export USE_XETLA=OFF export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 # upgrade transformers for model Qwen/Qwen1.5-MoE-A2.7B-Chat - python -m pip install transformers==4.40.0 + python -m pip install transformers==4.41.2 python -m pip install trl # batch_size 1 cp python/llm/test/benchmark/arc-perf-transformers-440.yaml python/llm/dev/benchmark/all-in-one/config.yaml @@ -314,7 +316,7 @@ jobs: run: | # batch_size 1 cd python/llm/dev/benchmark/all-in-one/test_batch1 - python ../../../../test/benchmark/check_results.py -c test1 -y ../../../../test/benchmark/arc-perf-test.yaml + python ../../../../test/benchmark/check_results.py -c test1 -y ../../../../test/benchmark/arc-perf-transformers-436.yaml python ../../../../test/benchmark/check_results.py -c test2 -y ../../../../test/benchmark/arc-perf-transformers-437.yaml python ../../../../test/benchmark/check_results.py -c test3 -y ../../../../test/benchmark/arc-perf-transformers-440.yaml find . -name "*test*.csv" -delete @@ -327,7 +329,7 @@ jobs: rm -r test_batch1 # batch_size 2 cd test_batch2 - python ../../../../test/benchmark/check_results.py -c test1 -y ../../../../test/benchmark/arc-perf-test-batch2.yaml + python ../../../../test/benchmark/check_results.py -c test1 -y ../../../../test/benchmark/arc-perf-transformers-436-batch2.yaml python ../../../../test/benchmark/check_results.py -c test2 -y ../../../../test/benchmark/arc-perf-transformers-437-batch2.yaml find . 
-name "*test*.csv" -delete if [[ ${{ github.event_name }} == "schedule" ]]; then @@ -339,7 +341,7 @@ jobs: rm -r test_batch2 # batch_size 4 cd test_batch4 - python ../../../../test/benchmark/check_results.py -c test1 -y ../../../../test/benchmark/arc-perf-test-batch4.yaml + python ../../../../test/benchmark/check_results.py -c test1 -y ../../../../test/benchmark/arc-perf-transformers-436-batch4.yaml python ../../../../test/benchmark/check_results.py -c test2 -y ../../../../test/benchmark/arc-perf-transformers-437-batch4.yaml find . -name "*test*.csv" -delete if [[ ${{ github.event_name }} == "schedule" ]]; then @@ -384,7 +386,6 @@ jobs: python -m pip install --upgrade einops python -m pip install --upgrade tiktoken python -m pip install --upgrade transformers_stream_generator - # specific for test on certain commits - name: Download llm binary if: ${{ github.event_name == 'workflow_dispatch' && (inputs.checkout-ref != 'main') }} @@ -398,10 +399,11 @@ jobs: if: ${{ github.event.schedule || (github.event_name == 'workflow_dispatch' && (inputs.checkout-ref == 'main')) }} shell: bash run: | - pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu test_version_date=`date -d 'yesterday' '+%Y%m%d'` - if ! pip show ipex-llm | grep $test_version_date; then - echo "Did not install ipex-llm with excepted version $test_version_date" + test_version=2.2.0b$test_version_date + pip install --pre --upgrade ipex-llm[all]==$test_version --extra-index-url https://download.pytorch.org/whl/cpu + if ! pip show ipex-llm | grep $test_version; then + echo "Did not install ipex-llm with expected version $test_version" exit 1 fi @@ -417,6 +419,8 @@ jobs: export https_proxy=${HTTPS_PROXY} source ipex-llm-init -t export OMP_NUM_THREADS=48 + # upgrade for default transformers version + python -m pip install transformers==4.41.2 trl # hide time info sed -i 's/str(end - st)/"xxxxxx"/g' run.py python run.py @@ -478,10 +482,11 @@ jobs: if: ${{ github.event.schedule || (github.event_name == 'workflow_dispatch' && (inputs.checkout-ref == 'main')) }} shell: bash run: | - pip install --pre --upgrade ipex-llm[all] test_version_date=`date -d 'yesterday' '+%Y%m%d'` - if ! pip show ipex-llm | grep $test_version_date; then - echo "Did not install ipex-llm with excepted version $test_version_date" + test_version=2.2.0b$test_version_date + pip install --pre --upgrade ipex-llm[all]==$test_version + if ! 
pip show ipex-llm | grep $test_version; then + echo "Did not install ipex-llm with expected version $test_version" exit 1 fi @@ -496,6 +501,8 @@ jobs: cd python/llm/dev/benchmark/all-in-one export http_proxy=${HTTP_PROXY} export https_proxy=${HTTPS_PROXY} + # upgrade for default transformers version + python -m pip install transformers==4.41.2 trl # hide time info sed -i 's/str(end - st)/"xxxxxx"/g' run.py python run.py @@ -554,7 +561,7 @@ jobs: pip install --upgrade pip pip install --upgrade wheel pip install --upgrade omegaconf pandas - pip install --upgrade tiktoken einops transformers_stream_generator matplotlib + pip install --upgrade tiktoken einops transformers_stream_generator matplotlib trl cd python\llm python setup.py clean --all bdist_wheel --win @@ -572,7 +579,8 @@ jobs: shell: bash run: | test_version_date=`date -d 'yesterday' '+%Y%m%d'` - echo "TEST_VERSION_DATE=${test_version_date}" >> "$GITHUB_ENV" + test_version=2.2.0b$test_version_date + echo "TEST_VERSION=${test_version}" >> "$GITHUB_ENV" - name: Install ipex-llm and other related packages (install from pypi) if: ${{ github.event.schedule || (github.event_name == 'workflow_dispatch' && (inputs.checkout-ref == 'main')) }} @@ -586,10 +594,10 @@ jobs: pip install --upgrade omegaconf pandas pip install --upgrade tiktoken einops transformers_stream_generator matplotlib - pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ - pip show ipex-llm | findstr %TEST_VERSION_DATE% + pip install --pre --upgrade ipex-llm[xpu]==%TEST_VERSION% --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ + pip show ipex-llm | findstr %TEST_VERSION% if %ERRORLEVEL% neq 0 ( - echo "Did not install ipex-llm with excepted version %TEST_VERSION_DATE%" + echo "Did not install ipex-llm with expected version %TEST_VERSION%" exit /b 1 ) pip list @@ -649,6 +657,7 @@ jobs: shell: cmd run: | call conda activate igpu-perf + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 REM for llava @@ -664,23 +673,23 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for transformers 4.37 (32-32 int4+fp16) + - name: Prepare igpu perf test for transformers 4.36 (32-32 int4+fp16) shell: bash run: | sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_437.yaml + sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_436.yaml - - name: Test on igpu for transformers 4.37 (32-32 int4+fp16) + - name: Test on igpu for transformers 4.36 (32-32 int4+fp16) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.37.0 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 cd python\llm\dev\benchmark\all-in-one - move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_437.yaml config.yaml + move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_436.yaml config.yaml set PYTHONIOENCODING=utf-8 python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1 if %ERRORLEVEL% neq 0 (exit /b 1) @@ -699,7 +708,7 @@ jobs: shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.38.2 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 @@ -714,6 +723,33 @@ jobs: call conda deactivate + - name: Prepare igpu perf test for 
transformers 4.43 (32-32 int4+fp16) + shell: bash + run: | + sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_443.yaml + + - name: Test on igpu for transformers 4.43 (32-32 int4+fp16) + shell: cmd + run: | + call conda activate igpu-perf + pip install transformers==4.43.1 + pip install trl + + set SYCL_CACHE_PERSISTENT=1 + set BIGDL_LLM_XMX_DISABLED=1 + + cd python\llm\dev\benchmark\all-in-one + move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_443.yaml config.yaml + set PYTHONIOENCODING=utf-8 + python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1 + if %ERRORLEVEL% neq 0 if %ERRORLEVEL% neq -1073740791 (exit /b 1) + python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4 + if %ERRORLEVEL% neq 0 (exit /b 1) + + pip uninstall trl -y + call conda deactivate + - name: Concat csv and generate html (32-32 int4+fp16) shell: cmd run: | @@ -737,14 +773,14 @@ jobs: shell: bash run: | sed -i 's/32-32/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i 's/{today}_test4/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml - name: Test on igpu (1024-128 int4+fp16) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.36.2 + pip install transformers==4.41.0 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 @@ -761,23 +797,23 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for transformers 4.37 (1024-128 int4+fp16) + - name: Prepare igpu perf test for transformers 4.36 (1024-128 int4+fp16) shell: bash run: | sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_437.yaml + sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_436.yaml - - name: Test on igpu for transformers 4.37 (1024-128 int4+fp16) + - name: Test on igpu for transformers 4.36 (1024-128 int4+fp16) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.37.0 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 cd python\llm\dev\benchmark\all-in-one - move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_437.yaml config.yaml + move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_436.yaml config.yaml set PYTHONIOENCODING=utf-8 python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1 if %ERRORLEVEL% neq 0 (exit /b 1) @@ -785,7 +821,7 @@ jobs: if %ERRORLEVEL% neq 0 (exit /b 1) call conda deactivate - + - name: Prepare igpu perf test for transformers 4.38 (1024-128 int4+fp16) shell: bash run: | @@ -796,7 +832,7 @@ jobs: shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.38.2 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 @@ -811,6 +847,33 @@ jobs: call conda deactivate + - name: Prepare igpu perf test for transformers 4.43 (1024-128 int4+fp16) + shell: bash + run: | + sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" 
python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_443.yaml + + - name: Test on igpu for transformers 4.43 (1024-128 int4+fp16) + shell: cmd + run: | + call conda activate igpu-perf + pip install transformers==4.43.1 + pip install trl + + set SYCL_CACHE_PERSISTENT=1 + set BIGDL_LLM_XMX_DISABLED=1 + + cd python\llm\dev\benchmark\all-in-one + move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_443.yaml config.yaml + set PYTHONIOENCODING=utf-8 + python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1 + if %ERRORLEVEL% neq 0 (exit /b 1) + python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4 + if %ERRORLEVEL% neq 0 (exit /b 1) + + pip uninstall trl -y + call conda deactivate + - name: Concat csv and generate html (1024-128 int4+fp16) shell: cmd run: | @@ -833,14 +896,14 @@ jobs: shell: bash run: | sed -i 's/1024-128/2048-256/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i 's/{today}_test4/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml - name: Test on igpu (2048-256 int4+fp16) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.36.2 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 @@ -857,23 +920,23 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for transformers 4.37 (2048-256 int4+fp16) + - name: Prepare igpu perf test for transformers 4.36 (2048-256 int4+fp16) shell: bash run: | sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_437.yaml + sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_436.yaml - - name: Test on igpu for transformers 4.37 (2048-256 int4+fp16) + - name: Test on igpu for transformers 4.36 (2048-256 int4+fp16) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.37.0 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 cd python\llm\dev\benchmark\all-in-one - move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_437.yaml config.yaml + move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_436.yaml config.yaml set PYTHONIOENCODING=utf-8 python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1 if %ERRORLEVEL% neq 0 (exit /b 1) @@ -881,7 +944,7 @@ jobs: if %ERRORLEVEL% neq 0 (exit /b 1) call conda deactivate - + - name: Prepare igpu perf test for transformers 4.38 (2048-256 int4+fp16) shell: bash run: | @@ -892,7 +955,7 @@ jobs: shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.38.2 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 @@ -907,6 +970,33 @@ jobs: call conda deactivate + - name: Prepare igpu perf test for transformers 4.43 (2048-256 int4+fp16) + shell: bash + run: | + sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_443.yaml + + - name: Test on igpu for transformers 4.43 (2048-256 int4+fp16) + shell: cmd + run: | + call conda activate igpu-perf + pip install 
transformers==4.43.1 + pip install trl + + set SYCL_CACHE_PERSISTENT=1 + set BIGDL_LLM_XMX_DISABLED=1 + + cd python\llm\dev\benchmark\all-in-one + move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_443.yaml config.yaml + set PYTHONIOENCODING=utf-8 + python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1 + if %ERRORLEVEL% neq 0 (exit /b 1) + python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4 + if %ERRORLEVEL% neq 0 (exit /b 1) + + pip uninstall trl -y + call conda deactivate + - name: Concat csv and generate html (2048-256 int4+fp16) shell: cmd run: | @@ -929,14 +1019,14 @@ jobs: shell: bash run: | sed -i 's/2048-256/3072-384/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i 's/{today}_test4/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml - name: Test on igpu (3072-384 int4+fp16) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.36.2 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 @@ -953,23 +1043,23 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for transformers 4.37 (3072-384 int4+fp16) + - name: Prepare igpu perf test for transformers 4.36 (3072-384 int4+fp16) shell: bash run: | sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_437.yaml + sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_436.yaml - - name: Test on igpu for transformers 4.37 (3072-384 int4+fp16) + - name: Test on igpu for transformers 4.36 (3072-384 int4+fp16) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.37.0 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 cd python\llm\dev\benchmark\all-in-one - move ..\..\..\test\benchmark\igpu-perf\3072-384_int4_fp16_437.yaml config.yaml + move ..\..\..\test\benchmark\igpu-perf\3072-384_int4_fp16_436.yaml config.yaml set PYTHONIOENCODING=utf-8 python run.py >> %CSV_SAVE_PATH%\3072-384_int4_fp16\log\%LOG_FILE% 2>&1 if %ERRORLEVEL% neq 0 (exit /b 1) @@ -988,7 +1078,7 @@ jobs: shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.38.2 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 @@ -1003,6 +1093,33 @@ jobs: call conda deactivate + - name: Prepare igpu perf test for transformers 4.43 (3072-384 int4+fp16) + shell: bash + run: | + sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_443.yaml + + - name: Test on igpu for transformers 4.43 (3072-384 int4+fp16) + shell: cmd + run: | + call conda activate igpu-perf + pip install transformers==4.43.1 + pip install trl + + set SYCL_CACHE_PERSISTENT=1 + set BIGDL_LLM_XMX_DISABLED=1 + + cd python\llm\dev\benchmark\all-in-one + move ..\..\..\test\benchmark\igpu-perf\3072-384_int4_fp16_443.yaml config.yaml + set PYTHONIOENCODING=utf-8 + python run.py >> %CSV_SAVE_PATH%\3072-384_int4_fp16\log\%LOG_FILE% 2>&1 + if %ERRORLEVEL% neq 0 (exit /b 1) + python 
..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4 + if %ERRORLEVEL% neq 0 (exit /b 1) + + pip uninstall trl -y + call conda deactivate + - name: Concat csv and generate html (3072-384 int4+fp16) shell: cmd run: | @@ -1025,14 +1142,14 @@ jobs: shell: bash run: | sed -i 's/3072-384/4096-512/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i 's/{today}_test4/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16.yaml - name: Test on igpu (4096-512 int4+fp16) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.36.2 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 @@ -1049,23 +1166,23 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for transformers 4.37 (4096-512 int4+fp16) + - name: Prepare igpu perf test for transformers 4.38 (4096-512 int4+fp16) shell: bash run: | sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16_437.yaml + sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16_438.yaml - - name: Test on igpu for transformers 4.37 (4096-512 int4+fp16) + - name: Test on igpu for transformers 4.38 (4096-512 int4+fp16) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.37.0 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 cd python\llm\dev\benchmark\all-in-one - move ..\..\..\test\benchmark\igpu-perf\4096-512_int4_fp16_437.yaml config.yaml + move ..\..\..\test\benchmark\igpu-perf\4096-512_int4_fp16_438.yaml config.yaml set PYTHONIOENCODING=utf-8 python run.py >> %CSV_SAVE_PATH%\4096-512_int4_fp16\log\%LOG_FILE% 2>&1 if %ERRORLEVEL% neq 0 (exit /b 1) @@ -1074,29 +1191,31 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for transformers 4.38 (4096-512 int4+fp16) + - name: Prepare igpu perf test for transformers 4.43 (4096-512 int4+fp16) shell: bash run: | sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16_438.yaml + sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16_443.yaml - - name: Test on igpu for transformers 4.38 (4096-512 int4+fp16) + - name: Test on igpu for transformers 4.43 (4096-512 int4+fp16) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.38.2 + pip install transformers==4.43.1 + pip install trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 cd python\llm\dev\benchmark\all-in-one - move ..\..\..\test\benchmark\igpu-perf\4096-512_int4_fp16_438.yaml config.yaml + move ..\..\..\test\benchmark\igpu-perf\4096-512_int4_fp16_443.yaml config.yaml set PYTHONIOENCODING=utf-8 python run.py >> %CSV_SAVE_PATH%\4096-512_int4_fp16\log\%LOG_FILE% 2>&1 if %ERRORLEVEL% neq 0 (exit /b 1) python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test3 if %ERRORLEVEL% neq 0 (exit /b 1) + pip uninstall trl -y call conda deactivate - name: Concat csv and generate html (4096-512 int4+fp16) @@ -1128,7 
+1247,7 @@ jobs: shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.36.2 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 @@ -1145,23 +1264,23 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for transformers 4.37 (load_low_bit 1024-128 int4+fp16) + - name: Prepare igpu perf test for transformers 4.36 (load_low_bit 1024-128 int4+fp16) shell: bash run: | sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_437.yaml + sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_436.yaml - - name: Test on igpu for transformers 4.37 (load_low_bit 1024-128 int4+fp16) + - name: Test on igpu for transformers 4.36 (load_low_bit 1024-128 int4+fp16) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.37.0 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 cd python\llm\dev\benchmark\all-in-one - move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_loadlowbit_437.yaml config.yaml + move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_loadlowbit_436.yaml config.yaml set PYTHONIOENCODING=utf-8 python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16_loadlowbit\log\%LOG_FILE% 2>&1 if %ERRORLEVEL% neq 0 (exit /b 1) @@ -1180,7 +1299,7 @@ jobs: shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.38.2 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 @@ -1195,6 +1314,33 @@ jobs: call conda deactivate + - name: Prepare igpu perf test for transformers 4.43 (load_low_bit 1024-128 int4+fp16) + shell: bash + run: | + sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_443.yaml + + - name: Test on igpu for transformers 4.43 (load_low_bit 1024-128 int4+fp16) + shell: cmd + run: | + call conda activate igpu-perf + pip install transformers==4.43.1 + pip install trl + + set SYCL_CACHE_PERSISTENT=1 + set BIGDL_LLM_XMX_DISABLED=1 + + cd python\llm\dev\benchmark\all-in-one + move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_loadlowbit_443.yaml config.yaml + set PYTHONIOENCODING=utf-8 + python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16_loadlowbit\log\%LOG_FILE% 2>&1 + if %ERRORLEVEL% neq 0 (exit /b 1) + python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4 + if %ERRORLEVEL% neq 0 (exit /b 1) + + pip uninstall trl -y + call conda deactivate + - name: Concat csv and generate html (load_low_bit 1024-128 int4+fp16) shell: cmd run: | @@ -1216,14 +1362,14 @@ jobs: - name: Prepare igpu perf test (1024-128) shell: bash run: | - sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i 's/{today}_test4/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128.yaml - name: Test on igpu (1024-128) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.36.2 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 @@ -1240,23 +1386,23 @@ jobs: call conda deactivate - - name: 
Prepare igpu perf test for transformers 4.37 (1024-128) + - name: Prepare igpu perf test for transformers 4.36 (1024-128) shell: bash run: | sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py - sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_437.yaml + sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_436.yaml - - name: Test on igpu for transformers 4.37 (1024-128) + - name: Test on igpu for transformers 4.36 (1024-128) shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.37.0 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 cd python\llm\dev\benchmark\all-in-one - move ..\..\..\test\benchmark\igpu-perf\1024-128_437.yaml config.yaml + move ..\..\..\test\benchmark\igpu-perf\1024-128_436.yaml config.yaml set PYTHONIOENCODING=utf-8 python run.py >> %CSV_SAVE_PATH%\1024-128\log\%LOG_FILE% 2>&1 if %ERRORLEVEL% neq 0 (exit /b 1) @@ -1275,7 +1421,7 @@ jobs: shell: cmd run: | call conda activate igpu-perf - pip install transformers==4.38.2 + pip install transformers==4.41.2 trl set SYCL_CACHE_PERSISTENT=1 set BIGDL_LLM_XMX_DISABLED=1 @@ -1290,6 +1436,33 @@ jobs: call conda deactivate + - name: Prepare igpu perf test for transformers 4.43 (1024-128) + shell: bash + run: | + sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py + sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_443.yaml + + - name: Test on igpu for transformers 4.43 (1024-128) + shell: cmd + run: | + call conda activate igpu-perf + pip install transformers==4.43.1 + pip install trl + + set SYCL_CACHE_PERSISTENT=1 + set BIGDL_LLM_XMX_DISABLED=1 + + cd python\llm\dev\benchmark\all-in-one + move ..\..\..\test\benchmark\igpu-perf\1024-128_443.yaml config.yaml + set PYTHONIOENCODING=utf-8 + python run.py >> %CSV_SAVE_PATH%\1024-128\log\%LOG_FILE% 2>&1 + if %ERRORLEVEL% neq 0 (exit /b 1) + python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4 + if %ERRORLEVEL% neq 0 (exit /b 1) + + pip uninstall trl -y + call conda deactivate + - name: Concat csv and generate html (1024-128) shell: cmd run: | @@ -1331,4 +1504,3 @@ jobs: # shell: cmd # run: | # call conda env remove -n igpu-perf -y - diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml index 942ab06558e..2268f031f88 100644 --- a/.github/workflows/llm_unit_tests.yml +++ b/.github/workflows/llm_unit_tests.yml @@ -215,9 +215,37 @@ jobs: - name: Run LLM cli test (Linux) if: runner.os == 'Linux' uses: ./.github/actions/llm/cli-test-linux + + - name: Setup Python Path + if: runner.os == 'Windows' + shell: bash + run: | + # Get Python interpreter path + python_path=$(python -c 'import sys; print(sys.executable)') + python_dir=$(dirname "$python_path") + scripts_dir="$python_dir/Scripts" + + # Set environment variables + echo "PYTHON_DIR=$python_dir" >> $GITHUB_ENV + echo "SCRIPTS_DIR=$scripts_dir" >> $GITHUB_ENV + - name: Run LLM cli test (Windows) if: runner.os == 'Windows' - uses: ./.github/actions/llm/cli-test-windows + shell: powershell + run: | + # Retrieve environment variables + $pythonDir = $env:PYTHON_DIR + $scriptsDir = $env:SCRIPTS_DIR + + # Update PATH + $env:PATH = "$pythonDir;$scriptsDir;$env:PATH" + + # Run tests + llm-cli.ps1 -t $env:THREAD_NUM -n 256 -x llama -m $env:LLAMA_INT4_CKPT_PATH -p 'Once upon a time,' 
+ llm-cli.ps1 -t $env:THREAD_NUM -n 256 -x gptneox -m $env:GPTNEOX_INT4_CKPT_PATH -p 'Once upon a time,' + llm-cli.ps1 -t $env:THREAD_NUM -n 256 -x bloom -m $env:BLOOM_INT4_CKPT_PATH -p 'Once upon a time,' + # llm-cli.ps1 -t $env:THREAD_NUM -x starcoder -m $env:STARCODER_INT4_CKPT_PATH -p 'def check_odd(' + - name: Run LLM inference test shell: bash run: | diff --git a/.github/workflows/manually_build.yml b/.github/workflows/manually_build.yml index bc84d9f094e..dd4d4d7b04d 100644 --- a/.github/workflows/manually_build.yml +++ b/.github/workflows/manually_build.yml @@ -24,9 +24,9 @@ on: # - ipex-llm-finetune-qlora-cpu-k8s # - ipex-llm-finetune-xpu # tag: - # description: 'docker image tag (e.g. 2.1.0-SNAPSHOT)' + # description: 'docker image tag (e.g. 2.2.0-SNAPSHOT)' # required: true - # default: '2.1.0-SNAPSHOT' + # default: '2.2.0-SNAPSHOT' # type: string workflow_call: inputs: @@ -40,9 +40,9 @@ on: default: 'all' type: string tag: - description: 'docker image tag (e.g. 2.1.0-SNAPSHOT)' + description: 'docker image tag (e.g. 2.2.0-SNAPSHOT)' required: true - default: '2.1.0-SNAPSHOT' + default: '2.2.0-SNAPSHOT' type: string public: description: "if the docker image push to public docker hub" diff --git a/.github/workflows/release-ipex-llm.yaml b/.github/workflows/release-ipex-llm.yaml index 5a2284db1f0..272aa1a8342 100644 --- a/.github/workflows/release-ipex-llm.yaml +++ b/.github/workflows/release-ipex-llm.yaml @@ -4,9 +4,9 @@ on: workflow_dispatch: inputs: version: - description: 'ipex-llm version (e.g. 2.1.0b1)' + description: 'ipex-llm version (e.g. 2.2.0b1)' required: true - default: '2.1.0b0' + default: '2.2.0b0' type: string permissions: @@ -26,7 +26,7 @@ jobs: - name: set release version env: - DEFAULT_VERSION: '2.1.0b0' + DEFAULT_VERSION: '2.2.0b0' run: | echo "RELEASE_VERSION=${{ github.event.inputs.version || env.DEFAULT_VERSION }}" >> $GITHUB_ENV diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index 68c89557552..e6ac2662b92 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -15,7 +15,7 @@ on: required: false type: string release-version: - description: 'ipex-llm version (e.g. 2.1.0b1)' + description: 'ipex-llm version (e.g. 2.2.0b1)' required: false type: string schedule-event: @@ -60,7 +60,7 @@ jobs: run: | if [[ "${{ inputs.schedule-event }}" == "true" ]]; then export TIMESTAMP=`date '+%Y%m%d'` - export PYPI_VERSION=2.1.0 + export PYPI_VERSION=2.2.0 export RELEASE_VERSION=${PYPI_VERSION}b${TIMESTAMP} else export RELEASE_VERSION=${{ inputs.release-version }} diff --git a/README.md b/README.md index 555180c44fe..b211f604822 100644 --- a/README.md +++ b/README.md @@ -158,9 +158,15 @@ See the demo of running [*Text-Generation-WebUI*](https://ipex-llm.readthedocs.i Please see the **Perplexity** result below (tested on Wikitext dataset using the script [here](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/dev/benchmark/perplexity)). 
|Perplexity |sym_int4 |q4_k |fp6 |fp8_e5m2 |fp8_e4m3 |fp16 | |---------------------------|---------|-------|-------|---------|---------|-------| -|Llama-2-7B-chat-hf |6.3638 |6.2179 |6.0924 |6.1796 |6.0980 |6.0963 | -|Mistral-7B-Instruct-v0.1 |6.0025 |5.9581 |5.8930 |5.8884 |5.8820 |5.8734 | -|Qwen1.5-7B-chat |8.8652 |8.8163 |8.5573 |8.8463 |8.5304 |8.6065 | +|Llama-2-7B-chat-hf |6.364 |6.218 |6.092 |6.180 |6.098 |6.096 | +|Mistral-7B-Instruct-v0.2 |5.365 |5.320 |5.270 |5.273 |5.246 |5.244 | +|Baichuan2-7B-chat |6.734 |6.727 |6.527 |6.539 |6.488 |6.508 | +|Qwen1.5-7B-chat |8.865 |8.816 |8.557 |8.846 |8.530 |8.607 | +|Llama-3.1-8B-Instruct |6.705 |6.566 |6.338 |6.383 |6.325 |6.267 | +|gemma-2-9b-it |7.541 |7.412 |7.269 |7.380 |7.268 |7.270 | +|Baichuan2-13B-Chat |6.313 |6.160 |6.070 |6.145 |6.086 |6.031 | +|Llama-2-13b-chat-hf |5.449 |5.422 |5.341 |5.384 |5.332 |5.329 | +|Qwen1.5-14B-Chat |7.529 |7.520 |7.367 |7.504 |7.297 |7.334 | [^1]: Performance varies by use, configuration and other factors. `ipex-llm` may not optimize to the same degree for non-Intel products. Learn more at www.Intel.com/PerformanceIndex. @@ -270,6 +276,7 @@ Over 50 models have been optimized/verified on `ipex-llm`, including *LLaMA/LLaM | Qwen1.5 | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen1.5) | [link](python/llm/example/GPU/HuggingFace/LLM/qwen1.5) | | Qwen2 | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen2) | [link](python/llm/example/GPU/HuggingFace/LLM/qwen2) | | Qwen-VL | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl) | [link](python/llm/example/GPU/HuggingFace/Multimodal/qwen-vl) | +| Qwen2-Audio | | [link](python/llm/example/GPU/HuggingFace/Multimodal/qwen2-audio) | | Aquila | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila) | [link](python/llm/example/GPU/HuggingFace/LLM/aquila) | | Aquila2 | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2) | [link](python/llm/example/GPU/HuggingFace/LLM/aquila2) | | MOSS | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss) | | @@ -312,6 +319,7 @@ Over 50 models have been optimized/verified on `ipex-llm`, including *LLaMA/LLaM | MiniCPM-V | | [link](python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V) | | MiniCPM-V-2 | | [link](python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2) | | MiniCPM-Llama3-V-2_5 | | [link](python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-Llama3-V-2_5) | +| MiniCPM-V-2_6 | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm-v-2_6) | [link](python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6) | ## Get Support - Please report a bug or raise a feature request by opening a [Github Issue](https://github.com/intel-analytics/ipex-llm/issues) diff --git a/docker/llm/README.md b/docker/llm/README.md index 2af11953439..b50718ff1fd 100644 --- a/docker/llm/README.md +++ b/docker/llm/README.md @@ -13,20 +13,20 @@ You can run IPEX-LLM containers (via docker or k8s) for inference, serving and f #### Pull a IPEX-LLM Docker Image To pull IPEX-LLM Docker images from [Docker Hub](https://hub.docker.com/u/intelanalytics), use the `docker pull` command. 
For instance, to pull the CPU inference image: ```bash -docker pull intelanalytics/ipex-llm-cpu:2.1.0-SNAPSHOT +docker pull intelanalytics/ipex-llm-cpu:2.2.0-SNAPSHOT ``` Available images in hub are: | Image Name | Description | | --- | --- | -| intelanalytics/ipex-llm-cpu:2.1.0-SNAPSHOT | CPU Inference | -| intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT | GPU Inference | -| intelanalytics/ipex-llm-serving-cpu:2.1.0-SNAPSHOT | CPU Serving| -| intelanalytics/ipex-llm-serving-xpu:2.1.0-SNAPSHOT | GPU Serving| -| intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT | CPU Finetuning via Docker| -| intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.1.0-SNAPSHOT|CPU Finetuning via Kubernetes| -| intelanalytics/ipex-llm-finetune-qlora-xpu:2.1.0-SNAPSHOT| GPU Finetuning| +| intelanalytics/ipex-llm-cpu:2.2.0-SNAPSHOT | CPU Inference | +| intelanalytics/ipex-llm-xpu:2.2.0-SNAPSHOT | GPU Inference | +| intelanalytics/ipex-llm-serving-cpu:2.2.0-SNAPSHOT | CPU Serving| +| intelanalytics/ipex-llm-serving-xpu:2.2.0-SNAPSHOT | GPU Serving| +| intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.2.0-SNAPSHOT | CPU Finetuning via Docker| +| intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.2.0-SNAPSHOT|CPU Finetuning via Kubernetes| +| intelanalytics/ipex-llm-finetune-qlora-xpu:2.2.0-SNAPSHOT| GPU Finetuning| #### Run a Container Use `docker run` command to run an IPEX-LLM docker container. For detailed instructions, refer to the [IPEX-LLM Docker Container Guides](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/DockerGuides/index.html). diff --git a/docker/llm/README_backup.md b/docker/llm/README_backup.md index 409461080ed..2b8e73afe8b 100644 --- a/docker/llm/README_backup.md +++ b/docker/llm/README_backup.md @@ -30,14 +30,14 @@ This guide provides step-by-step instructions for installing and using IPEX-LLM Run the following command to pull image: ```bash -docker pull intelanalytics/ipex-llm-cpu:2.1.0-SNAPSHOT +docker pull intelanalytics/ipex-llm-cpu:2.2.0-SNAPSHOT ``` ### 2. Start bigdl-llm-cpu Docker Container ```bash #/bin/bash -export DOCKER_IMAGE=intelanalytics/ipex-llm-cpu:2.1.0-SNAPSHOT +export DOCKER_IMAGE=intelanalytics/ipex-llm-cpu:2.2.0-SNAPSHOT export CONTAINER_NAME=my_container export MODEL_PATH=/llm/models[change to your model path] @@ -156,7 +156,7 @@ Additionally, for examples related to Inference with Speculative Decoding, you c Run the following command to pull image from dockerhub: ```bash -docker pull intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT +docker pull intelanalytics/ipex-llm-xpu:2.2.0-SNAPSHOT ``` ### 2. Start Chat Inference @@ -167,7 +167,7 @@ To map the xpu into the container, you need to specify --device=/dev/dri when bo ```bash #/bin/bash -export DOCKER_IMAGE=intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT +export DOCKER_IMAGE=intelanalytics/ipex-llm-xpu:2.2.0-SNAPSHOT export CONTAINER_NAME=my_container export MODEL_PATH=/llm/models[change to your model path] @@ -189,7 +189,7 @@ Execute a quick performance benchmark by starting the ipex-llm-xpu container, sp To map the XPU into the container, specify `--device=/dev/dri` when booting the container. 
```bash #/bin/bash -export DOCKER_IMAGE=intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT +export DOCKER_IMAGE=intelanalytics/ipex-llm-xpu:2.2.0-SNAPSHOT export CONTAINER_NAME=my_container export MODEL_PATH=/llm/models [change to your model path] @@ -226,7 +226,7 @@ IPEX-LLM is integrated into FastChat so that user can use IPEX-LLM as a serving Run the following command: ```bash -docker pull intelanalytics/ipex-llm-serving-cpu:2.1.0-SNAPSHOT +docker pull intelanalytics/ipex-llm-serving-cpu:2.2.0-SNAPSHOT ``` ### 2. Start ipex-llm-serving-cpu Docker Container @@ -234,7 +234,7 @@ docker pull intelanalytics/ipex-llm-serving-cpu:2.1.0-SNAPSHOT Please be noted that the CPU config is specified for Xeon CPUs, change it accordingly if you are not using a Xeon CPU. ```bash -export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-cpu:2.1.0-SNAPSHOT +export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-cpu:2.2.0-SNAPSHOT export CONTAINER_NAME=my_container export MODEL_PATH=/llm/models[change to your model path] @@ -349,7 +349,7 @@ IPEX-LLM is integrated into FastChat so that user can use IPEX-LLM as a serving Run the following command: ```bash -docker pull intelanalytics/ipex-llm-serving-xpu:2.1.0-SNAPSHOT +docker pull intelanalytics/ipex-llm-serving-xpu:2.2.0-SNAPSHOT ``` ### 2. Start ipex-llm-serving-xpu Docker Container @@ -357,7 +357,7 @@ docker pull intelanalytics/ipex-llm-serving-xpu:2.1.0-SNAPSHOT To map the `xpu` into the container, you need to specify `--device=/dev/dri` when booting the container. ```bash -export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:2.1.0-SNAPSHOT +export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:2.2.0-SNAPSHOT export CONTAINER_NAME=my_container export MODEL_PATH=/llm/models[change to your model path] @@ -473,10 +473,10 @@ You can download directly from Dockerhub like: ```bash # For standalone -docker pull intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT +docker pull intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.2.0-SNAPSHOT # For k8s -docker pull intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.1.0-SNAPSHOT +docker pull intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.2.0-SNAPSHOT ``` Or build the image from source: @@ -489,7 +489,7 @@ export HTTPS_PROXY=your_https_proxy docker build \ --build-arg http_proxy=${HTTP_PROXY} \ --build-arg https_proxy=${HTTPS_PROXY} \ - -t intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT \ + -t intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.2.0-SNAPSHOT \ -f ./Dockerfile . # For k8s @@ -499,7 +499,7 @@ export HTTPS_PROXY=your_https_proxy docker build \ --build-arg http_proxy=${HTTP_PROXY} \ --build-arg https_proxy=${HTTPS_PROXY} \ - -t intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.1.0-SNAPSHOT \ + -t intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.2.0-SNAPSHOT \ -f ./Dockerfile.k8s . ``` @@ -520,7 +520,7 @@ docker run -itd \ -e https_proxy=${HTTPS_PROXY} \ -v $BASE_MODE_PATH:/ipex_llm/model \ -v $DATA_PATH:/ipex_llm/data/alpaca-cleaned \ - intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT + intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.2.0-SNAPSHOT ``` The download and mount of base model and data to a docker container demonstrates a standard fine-tuning process. 
You can skip this step for a quick start, and in this way, the fine-tuning codes will automatically download the needed files: @@ -534,7 +534,7 @@ docker run -itd \ --name=ipex-llm-fintune-qlora-cpu \ -e http_proxy=${HTTP_PROXY} \ -e https_proxy=${HTTPS_PROXY} \ - intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT + intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.2.0-SNAPSHOT ``` However, we do recommend you to handle them manually, because the automatical download can be blocked by Internet access and Huggingface authentication etc. according to different environment, and the manual method allows you to fine-tune in a custom way (with different base model and dataset). @@ -593,7 +593,7 @@ The following shows how to fine-tune LLM with Quantization (QLoRA built on IPEX- Run the following command: ```bash -docker pull intelanalytics/ipex-llm-finetune-xpu:2.1.0-SNAPSHOT +docker pull intelanalytics/ipex-llm-finetune-xpu:2.2.0-SNAPSHOT ``` ### 2. Prepare Base Model, Data and Start Docker Container @@ -606,7 +606,7 @@ export DATA_PATH=your_downloaded_data_path export HTTP_PROXY=your_http_proxy export HTTPS_PROXY=your_https_proxy export CONTAINER_NAME=my_container -export DOCKER_IMAGE=intelanalytics/ipex-llm-finetune-xpu:2.1.0-SNAPSHOT +export DOCKER_IMAGE=intelanalytics/ipex-llm-finetune-xpu:2.2.0-SNAPSHOT docker run -itd \ --net=host \ diff --git a/docker/llm/finetune/lora/cpu/docker/README.md b/docker/llm/finetune/lora/cpu/docker/README.md index de5df38fa44..4dd78ebec84 100644 --- a/docker/llm/finetune/lora/cpu/docker/README.md +++ b/docker/llm/finetune/lora/cpu/docker/README.md @@ -5,7 +5,7 @@ You can download directly from Dockerhub like: ```bash -docker pull intelanalytics/ipex-llm-finetune-lora-cpu:2.1.0-SNAPSHOT +docker pull intelanalytics/ipex-llm-finetune-lora-cpu:2.2.0-SNAPSHOT ``` Or build the image from source: @@ -17,7 +17,7 @@ export HTTPS_PROXY=your_https_proxy docker build \ --build-arg http_proxy=${HTTP_PROXY} \ --build-arg https_proxy=${HTTPS_PROXY} \ - -t intelanalytics/ipex-llm-finetune-lora-cpu:2.1.0-SNAPSHOT \ + -t intelanalytics/ipex-llm-finetune-lora-cpu:2.2.0-SNAPSHOT \ -f ./Dockerfile . 
``` @@ -33,7 +33,7 @@ docker run -itd \ -e WORKER_COUNT_DOCKER=your_worker_count \ -v your_downloaded_base_model_path:/ipex_llm/model \ -v your_downloaded_data_path:/ipex_llm/data/alpaca_data_cleaned_archive.json \ - intelanalytics/ipex-llm-finetune-lora-cpu:2.1.0-SNAPSHOT \ + intelanalytics/ipex-llm-finetune-lora-cpu:2.2.0-SNAPSHOT \ bash ``` diff --git a/docker/llm/finetune/lora/cpu/kubernetes/values.yaml b/docker/llm/finetune/lora/cpu/kubernetes/values.yaml index aebfd76792b..4555bfa383e 100644 --- a/docker/llm/finetune/lora/cpu/kubernetes/values.yaml +++ b/docker/llm/finetune/lora/cpu/kubernetes/values.yaml @@ -1,4 +1,4 @@ -imageName: intelanalytics/ipex-llm-finetune-lora-cpu:2.1.0-SNAPSHOT +imageName: intelanalytics/ipex-llm-finetune-lora-cpu:2.2.0-SNAPSHOT trainerNum: 8 microBatchSize: 8 nfsServerIp: your_nfs_server_ip diff --git a/docker/llm/finetune/qlora/cpu/docker/README.md b/docker/llm/finetune/qlora/cpu/docker/README.md index c50daa8ddf4..4ede345437d 100644 --- a/docker/llm/finetune/qlora/cpu/docker/README.md +++ b/docker/llm/finetune/qlora/cpu/docker/README.md @@ -8,10 +8,10 @@ You can download directly from Dockerhub like: ```bash # For standalone -docker pull intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT +docker pull intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.2.0-SNAPSHOT # For k8s -docker pull intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.1.0-SNAPSHOT +docker pull intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.2.0-SNAPSHOT ``` Or build the image from source: @@ -24,7 +24,7 @@ export HTTPS_PROXY=your_https_proxy docker build \ --build-arg http_proxy=${HTTP_PROXY} \ --build-arg https_proxy=${HTTPS_PROXY} \ - -t intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT \ + -t intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.2.0-SNAPSHOT \ -f ./Dockerfile . # For k8s @@ -34,7 +34,7 @@ export HTTPS_PROXY=your_https_proxy docker build \ --build-arg http_proxy=${HTTP_PROXY} \ --build-arg https_proxy=${HTTPS_PROXY} \ - -t intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.1.0-SNAPSHOT \ + -t intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.2.0-SNAPSHOT \ -f ./Dockerfile.k8s . ``` @@ -55,7 +55,7 @@ docker run -itd \ -e https_proxy=${HTTPS_PROXY} \ -v $BASE_MODE_PATH:/ipex_llm/model \ -v $DATA_PATH:/ipex_llm/data/alpaca-cleaned \ - intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT + intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.2.0-SNAPSHOT ``` The download and mount of base model and data to a docker container demonstrates a standard fine-tuning process. You can skip this step for a quick start, and in this way, the fine-tuning codes will automatically download the needed files: @@ -69,7 +69,7 @@ docker run -itd \ --name=ipex-llm-fintune-qlora-cpu \ -e http_proxy=${HTTP_PROXY} \ -e https_proxy=${HTTPS_PROXY} \ - intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT + intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.2.0-SNAPSHOT ``` However, we do recommend you to handle them manually, because the automatical download can be blocked by Internet access and Huggingface authentication etc. according to different environment, and the manual method allows you to fine-tune in a custom way (with different base model and dataset). 
@@ -130,7 +130,7 @@ docker run -itd \ -e WORKER_COUNT_DOCKER=your_worker_count \ -v your_downloaded_base_model_path:/ipex_llm/model \ -v your_downloaded_data_path:/ipex_llm/data/alpaca_data_cleaned_archive.json \ - intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT + intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.2.0-SNAPSHOT ``` Note that `STANDALONE_DOCKER` is set to **TRUE** here. diff --git a/docker/llm/finetune/qlora/cpu/kubernetes/values.yaml b/docker/llm/finetune/qlora/cpu/kubernetes/values.yaml index ccb85047960..89087c6efe3 100644 --- a/docker/llm/finetune/qlora/cpu/kubernetes/values.yaml +++ b/docker/llm/finetune/qlora/cpu/kubernetes/values.yaml @@ -1,4 +1,4 @@ -imageName: intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.1.0-SNAPSHOT +imageName: intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.2.0-SNAPSHOT trainerNum: 2 microBatchSize: 8 enableGradientCheckpoint: false # true will save more memory but increase latency diff --git a/docker/llm/finetune/xpu/README.md b/docker/llm/finetune/xpu/README.md index d9579d0825e..d21c6e655d8 100644 --- a/docker/llm/finetune/xpu/README.md +++ b/docker/llm/finetune/xpu/README.md @@ -19,7 +19,7 @@ With this docker image, we can use all [ipex-llm finetune examples on Intel GPU] You can download directly from Dockerhub like: ```bash -docker pull intelanalytics/ipex-llm-finetune-xpu:2.1.0-SNAPSHOT +docker pull intelanalytics/ipex-llm-finetune-xpu:2.2.0-SNAPSHOT ``` Or build the image from source: @@ -31,7 +31,7 @@ export HTTPS_PROXY=your_https_proxy docker build \ --build-arg http_proxy=${HTTP_PROXY} \ --build-arg https_proxy=${HTTPS_PROXY} \ - -t intelanalytics/ipex-llm-finetune-xpu:2.1.0-SNAPSHOT \ + -t intelanalytics/ipex-llm-finetune-xpu:2.2.0-SNAPSHOT \ -f ./Dockerfile . ``` @@ -55,7 +55,7 @@ docker run -itd \ -v $BASE_MODE_PATH:/model \ -v $DATA_PATH:/data/alpaca-cleaned \ --shm-size="16g" \ - intelanalytics/ipex-llm-finetune-xpu:2.1.0-SNAPSHOT + intelanalytics/ipex-llm-finetune-xpu:2.2.0-SNAPSHOT ``` The download and mount of base model and data to a docker container demonstrates a standard fine-tuning process. You can skip this step for a quick start, and in this way, the fine-tuning codes will automatically download the needed files: @@ -72,7 +72,7 @@ docker run -itd \ -e http_proxy=${HTTP_PROXY} \ -e https_proxy=${HTTPS_PROXY} \ --shm-size="16g" \ - intelanalytics/ipex-llm-finetune-xpu:2.1.0-SNAPSHOT + intelanalytics/ipex-llm-finetune-xpu:2.2.0-SNAPSHOT ``` However, we do recommend you to handle them manually, because the download can be blocked by Internet access and Huggingface authentication etc. according to different environment, and the manual method allows you to fine-tune in a custom way (with different base model and dataset). diff --git a/docker/llm/inference/cpu/docker/README.md b/docker/llm/inference/cpu/docker/README.md index c8babfacac0..a0f80bdcf9b 100644 --- a/docker/llm/inference/cpu/docker/README.md +++ b/docker/llm/inference/cpu/docker/README.md @@ -6,7 +6,7 @@ docker build \ --build-arg http_proxy=.. \ --build-arg https_proxy=.. \ --build-arg no_proxy=.. \ - --rm --no-cache -t intelanalytics/ipex-llm-cpu:2.1.0-SNAPSHOT . + --rm --no-cache -t intelanalytics/ipex-llm-cpu:2.2.0-SNAPSHOT . 
``` @@ -16,7 +16,7 @@ docker build \ An example could be: ```bash #/bin/bash -export DOCKER_IMAGE=intelanalytics/ipex-llm-cpu:2.1.0-SNAPSHOT +export DOCKER_IMAGE=intelanalytics/ipex-llm-cpu:2.2.0-SNAPSHOT sudo docker run -itd \ --net=host \ @@ -41,7 +41,7 @@ You can download models and bind the model directory from host machine to contai Here is an example: ```bash -export DOCKER_IMAGE=intelanalytics/ipex-llm-cpu:2.1.0-SNAPSHOT +export DOCKER_IMAGE=intelanalytics/ipex-llm-cpu:2.2.0-SNAPSHOT export MODEL_PATH=/home/llm/models sudo docker run -itd \ diff --git a/docker/llm/inference/xpu/docker/README.md b/docker/llm/inference/xpu/docker/README.md index 0ce773c2499..6f9dcec658e 100644 --- a/docker/llm/inference/xpu/docker/README.md +++ b/docker/llm/inference/xpu/docker/README.md @@ -6,7 +6,7 @@ docker build \ --build-arg http_proxy=.. \ --build-arg https_proxy=.. \ --build-arg no_proxy=.. \ - --rm --no-cache -t intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT . + --rm --no-cache -t intelanalytics/ipex-llm-xpu:2.2.0-SNAPSHOT . ``` @@ -17,7 +17,7 @@ To map the `xpu` into the container, you need to specify `--device=/dev/dri` whe An example could be: ```bash #/bin/bash -export DOCKER_IMAGE=intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT +export DOCKER_IMAGE=intelanalytics/ipex-llm-xpu:2.2.0-SNAPSHOT sudo docker run -itd \ --net=host \ diff --git a/docker/llm/serving/cpu/docker/Dockerfile b/docker/llm/serving/cpu/docker/Dockerfile index 045171c284c..f60033b509d 100644 --- a/docker/llm/serving/cpu/docker/Dockerfile +++ b/docker/llm/serving/cpu/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM intelanalytics/ipex-llm-cpu:2.1.0-SNAPSHOT +FROM intelanalytics/ipex-llm-cpu:2.2.0-SNAPSHOT ARG http_proxy ARG https_proxy diff --git a/docker/llm/serving/cpu/docker/README.md b/docker/llm/serving/cpu/docker/README.md index 5aa2cb1355e..0aa2471d267 100644 --- a/docker/llm/serving/cpu/docker/README.md +++ b/docker/llm/serving/cpu/docker/README.md @@ -6,7 +6,7 @@ docker build \ --build-arg http_proxy=.. \ --build-arg https_proxy=.. \ --build-arg no_proxy=.. \ - --rm --no-cache -t intelanalytics/ipex-llm-serving-cpu:2.1.0-SNAPSHOT . + --rm --no-cache -t intelanalytics/ipex-llm-serving-cpu:2.2.0-SNAPSHOT . ``` ### Use the image for doing cpu serving @@ -16,7 +16,7 @@ You could use the following bash script to start the container. 
Please be noted ```bash #/bin/bash -export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-cpu:2.1.0-SNAPSHOT +export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-cpu:2.2.0-SNAPSHOT sudo docker run -itd \ --net=host \ diff --git a/docker/llm/serving/cpu/kubernetes/README.md b/docker/llm/serving/cpu/kubernetes/README.md index 7e8cb0e5b2d..a58a098c08c 100644 --- a/docker/llm/serving/cpu/kubernetes/README.md +++ b/docker/llm/serving/cpu/kubernetes/README.md @@ -2,7 +2,7 @@ ## Image -To deploy IPEX-LLM-serving cpu in Kubernetes environment, please use this image: `intelanalytics/ipex-llm-serving-cpu:2.1.0-SNAPSHOT` +To deploy IPEX-LLM-serving cpu in Kubernetes environment, please use this image: `intelanalytics/ipex-llm-serving-cpu:2.2.0-SNAPSHOT` ## Before deployment @@ -73,7 +73,7 @@ spec: dnsPolicy: "ClusterFirst" containers: - name: fastchat-controller # fixed - image: intelanalytics/ipex-llm-serving-cpu:2.1.0-SNAPSHOT + image: intelanalytics/ipex-llm-serving-cpu:2.2.0-SNAPSHOT imagePullPolicy: IfNotPresent env: - name: CONTROLLER_HOST # fixed @@ -146,7 +146,7 @@ spec: dnsPolicy: "ClusterFirst" containers: - name: fastchat-worker # fixed - image: intelanalytics/ipex-llm-serving-cpu:2.1.0-SNAPSHOT + image: intelanalytics/ipex-llm-serving-cpu:2.2.0-SNAPSHOT imagePullPolicy: IfNotPresent env: - name: CONTROLLER_HOST # fixed diff --git a/docker/llm/serving/cpu/kubernetes/deployment.yaml b/docker/llm/serving/cpu/kubernetes/deployment.yaml index d1aaf5c140e..623f3894e59 100644 --- a/docker/llm/serving/cpu/kubernetes/deployment.yaml +++ b/docker/llm/serving/cpu/kubernetes/deployment.yaml @@ -24,7 +24,7 @@ spec: dnsPolicy: "ClusterFirst" containers: - name: fastchat-controller # fixed - image: intelanalytics/ipex-llm-serving-cpu:2.1.0-SNAPSHOT + image: intelanalytics/ipex-llm-serving-cpu:2.2.0-SNAPSHOT imagePullPolicy: IfNotPresent env: - name: CONTROLLER_HOST # fixed @@ -91,7 +91,7 @@ spec: dnsPolicy: "ClusterFirst" containers: - name: fastchat-worker # fixed - image: intelanalytics/ipex-llm-serving-cpu:2.1.0-SNAPSHOT + image: intelanalytics/ipex-llm-serving-cpu:2.2.0-SNAPSHOT imagePullPolicy: IfNotPresent env: - name: CONTROLLER_HOST # fixed diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile index 2938ce4618b..7f2f41bd95d 100644 --- a/docker/llm/serving/xpu/docker/Dockerfile +++ b/docker/llm/serving/xpu/docker/Dockerfile @@ -17,7 +17,7 @@ RUN cd /tmp/ && \ mv /tmp/torch-ccl/dist/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl /tmp/ -FROM intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT +FROM intelanalytics/ipex-llm-xpu:2.2.0-SNAPSHOT ARG http_proxy ARG https_proxy diff --git a/docker/llm/serving/xpu/docker/README.md b/docker/llm/serving/xpu/docker/README.md index 5b3f00cda9f..d715218d872 100644 --- a/docker/llm/serving/xpu/docker/README.md +++ b/docker/llm/serving/xpu/docker/README.md @@ -6,7 +6,7 @@ docker build \ --build-arg http_proxy=.. \ --build-arg https_proxy=.. \ --build-arg no_proxy=.. \ - --rm --no-cache -t intelanalytics/ipex-llm-serving-xpu:2.1.0-SNAPSHOT . + --rm --no-cache -t intelanalytics/ipex-llm-serving-xpu:2.2.0-SNAPSHOT . 
``` @@ -18,7 +18,7 @@ To map the `xpu` into the container, you need to specify `--device=/dev/dri` whe An example could be: ```bash #/bin/bash -export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:2.1.0-SNAPSHOT +export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:2.2.0-SNAPSHOT sudo docker run -itd \ --net=host \ diff --git a/docker/llm/serving/xpu/docker/vllm_online_benchmark.py b/docker/llm/serving/xpu/docker/vllm_online_benchmark.py index 2db0ff18b99..59bfb588a4e 100644 --- a/docker/llm/serving/xpu/docker/vllm_online_benchmark.py +++ b/docker/llm/serving/xpu/docker/vllm_online_benchmark.py @@ -270,7 +270,7 @@ def benchmark(llm_urls, model, prompt, num_requests, max_concurrent_requests, ma LLM_URLS = [f"http://localhost:{PORT}/v1/completions" for PORT in [8000]] -MODEL = "llm/models/" + model_name +MODEL = "/llm/models/" + model_name MAX_TOKENS = 512 PROMPT = PROMPT_1024 diff --git a/docs/mddocs/DockerGuides/docker_pytorch_inference_gpu.md b/docs/mddocs/DockerGuides/docker_pytorch_inference_gpu.md index 4d0ff0fa3a2..ec73bb84f15 100644 --- a/docs/mddocs/DockerGuides/docker_pytorch_inference_gpu.md +++ b/docs/mddocs/DockerGuides/docker_pytorch_inference_gpu.md @@ -87,7 +87,7 @@ root@arda-arc12:/# sycl-ls > export USE_XETLA=OFF > > # Enable immediate command lists mode for the Level Zero plugin. Improves performance on Intel Arc™ A-Series Graphics and Intel Data Center GPU Max Series; however, it depends on the Linux Kernel, and some Linux kernels may not necessarily provide acceleration. -> # Recommended for use on Intel Arc™ A-Series Graphics and Intel Data Center GPU Max Series, but it depends on the Linux kernel, Non-i915 kernel drivers may cause performance regressions. +> # Recommended for use on Intel Arc™ A-Series Graphics and Intel Data Center GPU Max Series, but it depends on the Linux kernel, Upstream i915 kernel drivers may cause performance regressions. > export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 > > # Controls persistent device compiled code cache. Set to '1' to turn on and '0' to turn off. diff --git a/docs/mddocs/Overview/install_gpu.md b/docs/mddocs/Overview/install_gpu.md index a36154d0c82..ed87fdeb287 100644 --- a/docs/mddocs/Overview/install_gpu.md +++ b/docs/mddocs/Overview/install_gpu.md @@ -171,7 +171,7 @@ IPEX-LLM GPU support on Linux has been verified on: > **Tip**: > - > Please refer to our [driver installation](https://dgpu-docs.intel.com/driver/installation.html) for general purpose GPU capabilities. + > For client GPUs, such as the Intel® Arc™ A-series, please refer to [Client GPU Installation Guide](https://dgpu-docs.intel.com/driver/client/overview.html). For data center GPUs, including Intel® Data Center GPU Max Series and Intel® Data Center GPU Flex Series, please refer to our [Installation for Data Center GPU](https://dgpu-docs.intel.com/driver/installation.html) for general purpose GPU capabilities. > > See [release page](https://dgpu-docs.intel.com/releases/index.html) for latest version. @@ -311,7 +311,7 @@ IPEX-LLM GPU support on Linux has been verified on: > **Tip**: > - > Please refer to our [driver installation](https://dgpu-docs.intel.com/driver/installation.html) for general purpose GPU capabilities. + > For client GPUs, such as the Intel® Arc™ A-series, please refer to [Client GPU Installation Guide](https://dgpu-docs.intel.com/driver/client/overview.html). 
For data center GPUs, including Intel® Data Center GPU Max Series and Intel® Data Center GPU Flex Series, please refer to our [Installation for Data Center GPU](https://dgpu-docs.intel.com/driver/installation.html) for general purpose GPU capabilities. > > See [release page](https://dgpu-docs.intel.com/releases/index.html) for latest version. @@ -623,3 +623,9 @@ The reason for such errors is that oneAPI has not been initialized properly befo * For oneAPI installed using APT or Offline Installer, make sure you execute `setvars.sh` of oneAPI Base Toolkit before running IPEX-LLM. * For PIP-installed oneAPI, activate your working environment and run ``echo $LD_LIBRARY_PATH`` to check if the installation path is properly configured for the environment. If the output does not contain oneAPI path (e.g. ``~/intel/oneapi/lib``), check [Prerequisites](#prerequisites-1) to re-install oneAPI with PIP installer. * Make sure you install matching versions of ipex-llm/pytorch/IPEX and oneAPI Base Toolkit. IPEX-LLM with PyTorch 2.1 should be used with oneAPI Base Toolkit version 2024.0. IPEX-LLM with PyTorch 2.0 should be used with oneAPI Base Toolkit version 2023.2. + +#### 2. `core dump` when running with GPU +If you encounter random `core dump` errors when running with GPU, please remove the out-of-tree driver. +``` +sudo apt purge -y intel-i915-dkms intel-fw-gpu +``` \ No newline at end of file diff --git a/docs/mddocs/Quickstart/continue_quickstart.md b/docs/mddocs/Quickstart/continue_quickstart.md index d3feb289354..239af91fa25 100644 --- a/docs/mddocs/Quickstart/continue_quickstart.md +++ b/docs/mddocs/Quickstart/continue_quickstart.md @@ -33,11 +33,12 @@ Visit [Run Ollama with IPEX-LLM on Intel GPU](./ollama_quickstart.md), and follo > If the `Continue` plugin is not installed on the same machine where Ollama is running (which means `Continue` needs to connect to a remote Ollama service), you must configure the Ollama service to accept connections from any IP address. To achieve this, set or export the environment variable `OLLAMA_HOST=0.0.0.0` before executing the command `ollama serve`. > [!TIP] -> If your local LLM is running on Intel Arc™ A-Series Graphics with Linux OS (Kernel 6.2), it is recommended to additionaly set the following environment variable for optimal performance before executing `ollama serve`: +> If your local LLM is running on Intel Arc™ A-Series Graphics with Linux OS (Kernel 6.2), setting the following environment variable before starting the service may potentially improve performance. > > ```bash > export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 > ``` +> The environment variable `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` determines the usage of immediate command lists for task submission to the GPU. While this mode typically enhances performance, exceptions may occur. Please consider experimenting with and without this environment variable for best performance. For more details, you can refer to [this article](https://www.intel.com/content/www/us/en/developer/articles/guide/level-zero-immediate-command-lists.html). ### 2. Pull and Prepare the Model diff --git a/docs/mddocs/Quickstart/fastchat_quickstart.md b/docs/mddocs/Quickstart/fastchat_quickstart.md index 431457397d5..abae5ccdc4b 100644 --- a/docs/mddocs/Quickstart/fastchat_quickstart.md +++ b/docs/mddocs/Quickstart/fastchat_quickstart.md @@ -60,6 +60,7 @@ python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path REPO_ID_OR_YOU # Available low_bit format including sym_int4, sym_int8, fp16 etc.
source /opt/intel/oneapi/setvars.sh export USE_XETLA=OFF +# [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --low-bit "sym_int4" --trust-remote-code --device "xpu" @@ -87,6 +88,7 @@ python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7 source /opt/intel/oneapi/setvars.sh export ENABLE_SDP_FUSION=1 export SYCL_CACHE_PERSISTENT=1 +# [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 python3 -m ipex_llm.serving.fastchat.ipex_llm_worker --model-path lmsys/vicuna-7b-v1.5 --low-bit "fp16" --trust-remote-code --device "xpu" --speculative ``` @@ -117,10 +119,14 @@ python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MO # On GPU source /opt/intel/oneapi/setvars.sh export USE_XETLA=OFF +# [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 python3 -m ipex_llm.serving.fastchat.vllm_worker --model-path REPO_ID_OR_YOUR_MODEL_PATH --device xpu --load-in-low-bit "sym_int4" --enforce-eager ``` +> [!NOTE] +> The environment variable `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` determines the usage of immediate command lists for task submission to the GPU. While this mode typically enhances performance, exceptions may occur. Please consider experimenting with and without this environment variable for best performance. For more details, you can refer to [this article](https://www.intel.com/content/www/us/en/developer/articles/guide/level-zero-immediate-command-lists.html). + #### Launch multiple workers Sometimes we may want to start multiple workers for the best performance. For running in CPU, you may want to seperate multiple workers in different sockets. Assuming each socket have 48 physicall cores, then you may want to start two workers using the following example: diff --git a/docs/mddocs/Quickstart/graphrag_quickstart.md b/docs/mddocs/Quickstart/graphrag_quickstart.md index 1e104f6df04..52e08ffe060 100644 --- a/docs/mddocs/Quickstart/graphrag_quickstart.md +++ b/docs/mddocs/Quickstart/graphrag_quickstart.md @@ -9,12 +9,16 @@ The [GraphRAG project](https://github.com/microsoft/graphrag) is designed to lev - [Setup Python Environment for GraphRAG](#3-setup-python-environment-for-graphrag) - [Index GraphRAG](#4-index-graphrag) - [Query GraphRAG](#5-query-graphrag) +- [Query GraphRAG](#5-query-graphrag) +- [Troubleshooting](#troubleshooting) ## Quickstart ### 1. Install and Start `Ollama` Service on Intel GPU -Follow the steps in [Run Ollama with IPEX-LLM on Intel GPU Guide](./ollama_quickstart.md) to install and run Ollama on Intel GPU. Ensure that `ollama serve` is running correctly and can be accessed through a local URL (e.g., `https://127.0.0.1:11434`). +Follow the steps in [Run Ollama with IPEX-LLM on Intel GPU Guide](./ollama_quickstart.md) to install `ipex-llm[cpp]==2.1.0` and run Ollama on Intel GPU. Ensure that `ollama serve` is running correctly and can be accessed through a local URL (e.g., `https://127.0.0.1:11434`). 
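A quick way to confirm the service is reachable is to query the root endpoint of the Ollama server; a small sketch, assuming the default port `11434`:

```bash
# A running Ollama server answers its root endpoint with a short status message
# (typically "Ollama is running"); a connection error here means the service is not up yet.
curl http://127.0.0.1:11434
```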
+ +**Please note that for GraphRAG, we highly recommend using the stable version of ipex-llm through `pip install ipex-llm[cpp]==2.1.0`**. ### 2. Prepare LLM and Embedding Model @@ -57,6 +61,7 @@ conda create -n graphrag-local-ollama python=3.10 conda activate graphrag-local-ollama pip install -e . +pip install future pip install ollama pip install plotly @@ -64,6 +69,9 @@ pip install plotly in which `pip install ollama` is for enabling restful APIs through python, and `pip install plotly` is for visualizing the knowledge graph. +> [!NOTE] +> Please note that the Python environment for GraphRAG setup here is separate from the one for Ollama server on Intel GPUs. + ### 4. Index GraphRAG The environment is now ready for GraphRAG with local LLMs and embedding models running on Intel GPUs. Before querying GraphRAG, it is necessary to first index GraphRAG, which could be a resource-intensive operation. @@ -114,24 +122,25 @@ Perpare the input corpus, and then initialize the workspace: #### Update `settings.yml` In the `settings.yml` file inside the `ragtest` folder, add the configuration `request_timeout: 1800.0` for `llm`. Besides, if you would like to use LLMs or embedding models other than `mistral` or `nomic-embed-text`, you are required to update the `settings.yml` in `ragtest` folder accordingly: -> -> ```yml -> llm: -> api_key: ${GRAPHRAG_API_KEY} -> type: openai_chat -> model: mistral # change it accordingly if using another LLM -> model_supports_json: true -> request_timeout: 1800.0 # add this configuration; you could also increase the request_timeout -> api_base: http://localhost:11434/v1 -> -> embeddings: -> async_mode: threaded -> llm: -> api_key: ${GRAPHRAG_API_KEY} -> type: openai_embedding -> model: nomic_embed_text # change it accordingly if using another embedding model -> api_base: http://localhost:11434/api -> ``` + + +```yml +llm: + api_key: ${GRAPHRAG_API_KEY} + type: openai_chat + model: mistral # change it accordingly if using another LLM + model_supports_json: true + request_timeout: 1800.0 # add this configuration; you could also increase the request_timeout + api_base: http://localhost:11434/v1 + +embeddings: + async_mode: threaded + llm: + api_key: ${GRAPHRAG_API_KEY} + type: openai_embedding + model: nomic_embed_text # change it accordingly if using another embedding model + api_base: http://localhost:11434/api +``` #### Conduct GraphRAG indexing @@ -197,3 +206,55 @@ The Transformer model has been very successful in various natural language proce Since its initial introduction, the Transformer model has been further developed and improved upon. Variants of the Transformer architecture, such as BERT (Bidirectional Encoder Representations from Transformers) and RoBERTa (Robustly Optimized BERT Pretraining Approach), have achieved state-of-the-art performance on a wide range of natural language processing tasks [Data: Reports (1, 2, 34, 46, 64, +more)]. ``` + +### Troubleshooting + +#### `failed to find free space in the KV cache, retrying with smaller n_batch` when conducting GraphRAG Indexing, and `JSONDecodeError` when querying GraphRAG + +If you observe the Ollama server log showing `failed to find free space in the KV cache, retrying with smaller n_batch` while conducting GraphRAG indexing, and receive `JSONDecodeError` when querying GraphRAG, try to increase the context length for the LLM model and index/query GraphRAG again. + +Here is how to make the LLM model support a larger context.
To do this, we need to first create a file named `Modelfile`: + +``` +FROM mistral:latest +PARAMETER num_ctx 4096 +``` + +> [!TIP] +> Here we increase `num_ctx` to 4096 as an example. You could adjust it accordingly. + +and then use the following commands to create a new model in Ollama named `mistral:latest-nctx4096`: + +- For **Linux users**: + + ```bash + ./ollama create mistral:latest-nctx4096 -f Modelfile + ``` + +- For **Windows users**: + + Please run the following command in Miniforge or Anaconda Prompt. + + ```cmd + ollama create mistral:latest-nctx4096 -f Modelfile + ``` + +Finally, update `settings.yml` inside the `ragtest` folder to use `llm` model `mistral:latest-nctx4096`: + +```yml +llm: + api_key: ${GRAPHRAG_API_KEY} + type: openai_chat + model: mistral:latest-nctx4096 # change it accordingly if using another LLM, or LLM model with larger num_ctx + model_supports_json: true + request_timeout: 1800.0 # add this configuration; you could also increase the request_timeout + api_base: http://localhost:11434/v1 + +embeddings: + async_mode: threaded + llm: + api_key: ${GRAPHRAG_API_KEY} + type: openai_embedding + model: nomic_embed_text # change it accordingly if using another embedding model + api_base: http://localhost:11434/api +``` \ No newline at end of file diff --git a/docs/mddocs/Quickstart/install_linux_gpu.md b/docs/mddocs/Quickstart/install_linux_gpu.md index a71ef9efc37..88574d8db19 100644 --- a/docs/mddocs/Quickstart/install_linux_gpu.md +++ b/docs/mddocs/Quickstart/install_linux_gpu.md @@ -34,13 +34,17 @@ IPEX-LLM currently supports the Ubuntu 20.04 operating system and later, and sup ```bash sudo apt-get update + + # Install out-of-tree driver sudo apt-get -y install \ gawk \ dkms \ linux-headers-$(uname -r) \ libc6-dev + sudo apt install intel-i915-dkms intel-fw-gpu - sudo apt-get install -y gawk libc6-dev udev\ + # Install Compute Runtime + sudo apt-get install -y udev \ intel-opencl-icd intel-level-zero-gpu level-zero \ intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \ libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ @@ -82,13 +86,17 @@ IPEX-LLM currently supports the Ubuntu 20.04 operating system and later, and sup ```bash sudo apt-get update + + # Install out-of-tree driver sudo apt-get -y install \ gawk \ dkms \ linux-headers-$(uname -r) \ libc6-dev + sudo apt install -y intel-i915-dkms intel-fw-gpu - sudo apt-get install -y gawk libc6-dev udev\ + # Install Compute Runtime + sudo apt-get install -y udev \ intel-opencl-icd intel-level-zero-gpu level-zero \ intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \ libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ @@ -234,8 +242,9 @@ To use GPU acceleration on Linux, several environment variables are required or # Recommended Environment Variables for optimal performance export USE_XETLA=OFF - export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 export SYCL_CACHE_PERSISTENT=1 + # [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation + export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 ``` - For **Intel Data Center GPU Max**: @@ -249,9 +258,10 @@ To use GPU acceleration on Linux, several environment variables are required or # Recommended Environment Variables for optimal performance export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so - export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 export 
SYCL_CACHE_PERSISTENT=1 export ENABLE_SDP_FUSION=1 + # [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation + export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 ``` Please note that `libtcmalloc.so` can be installed by `conda install -c conda-forge -y gperftools=2.10` @@ -259,6 +269,8 @@ To use GPU acceleration on Linux, several environment variables are required or > [!NOTE] > Please refer to [this guide](../Overview/install_gpu.md#runtime-configuration-1) for more details regarding runtime configuration. +> [!NOTE] +> The environment variable `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` determines the usage of immediate command lists for task submission to the GPU. While this mode typically enhances performance, exceptions may occur. Please consider experimenting with and without this environment variable for best performance. For more details, you can refer to [this article](https://www.intel.com/content/www/us/en/developer/articles/guide/level-zero-immediate-command-lists.html). ## A Quick Example diff --git a/docs/mddocs/Quickstart/llama3_llamacpp_ollama_quickstart.md b/docs/mddocs/Quickstart/llama3_llamacpp_ollama_quickstart.md index 40fcd31f7bf..0d5858746aa 100644 --- a/docs/mddocs/Quickstart/llama3_llamacpp_ollama_quickstart.md +++ b/docs/mddocs/Quickstart/llama3_llamacpp_ollama_quickstart.md @@ -51,6 +51,7 @@ To use GPU acceleration, several environment variables are required or recommend ```bash source /opt/intel/oneapi/setvars.sh export SYCL_CACHE_PERSISTENT=1 + # [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 # [optional] if you want to run on single GPU, use below command to limit GPU may improve performance export ONEAPI_DEVICE_SELECTOR=level_zero:0 @@ -62,12 +63,16 @@ To use GPU acceleration, several environment variables are required or recommend ```cmd set SYCL_CACHE_PERSISTENT=1 + rem under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation set SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 ``` > [!TIP] > When your machine has multi GPUs and you want to run on one of them, you need to set `ONEAPI_DEVICE_SELECTOR=level_zero:[gpu_id]`, here `[gpu_id]` varies based on your requirement. For more details, you can refer to [this section](../Overview/KeyFeatures/multi_gpus_selection.md#2-oneapi-device-selector). +> [!NOTE] +> The environment variable `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` determines the usage of immediate command lists for task submission to the GPU. While this mode typically enhances performance, exceptions may occur. Please consider experimenting with and without this environment variable for best performance. For more details, you can refer to [this article](https://www.intel.com/content/www/us/en/developer/articles/guide/level-zero-immediate-command-lists.html). + ##### Run llama3 Under your current directory, exceuting below command to do inference with Llama3: @@ -75,7 +80,7 @@ Under your current directory, exceuting below command to do inference with Llama - For **Linux users**: ```bash - ./main -m /Meta-Llama-3-8B-Instruct-Q4_K_M.gguf -n 32 --prompt "Once upon a time, there existed a little girl who liked to have adventures. 
She wanted to go to places and meet new people, and have fun doing something" -t 8 -e -ngl 33 --color --no-mmap + ./llama-cli -m /Meta-Llama-3-8B-Instruct-Q4_K_M.gguf -n 32 --prompt "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun doing something" -c 1024 -t 8 -e -ngl 33 --color --no-mmap ``` - For **Windows users**: @@ -83,7 +88,7 @@ Under your current directory, exceuting below command to do inference with Llama Please run the following command in Miniforge Prompt. ```cmd - main -m /Meta-Llama-3-8B-Instruct-Q4_K_M.gguf -n 32 --prompt "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun doing something" -e -ngl 33 --color --no-mmap + llama-cli -m /Meta-Llama-3-8B-Instruct-Q4_K_M.gguf -n 32 --prompt "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun doing something" -c 1024 -e -ngl 33 --color --no-mmap ``` Under your current directory, you can also execute below command to have interactive chat with Llama3: @@ -91,7 +96,7 @@ Under your current directory, you can also execute below command to have interac - For **Linux users**: ```bash - ./main -ngl 33 --interactive-first --color -e --in-prefix '<|start_header_id|>user<|end_header_id|>\n\n' --in-suffix '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' -r '<|eot_id|>' -m /Meta-Llama-3-8B-Instruct-Q4_K_M.gguf + ./llama-cli -ngl 33 --interactive-first --color -e --in-prefix '<|start_header_id|>user<|end_header_id|>\n\n' --in-suffix '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' -r '<|eot_id|>' -m /Meta-Llama-3-8B-Instruct-Q4_K_M.gguf -c 1024 ``` - For **Windows users**: @@ -99,7 +104,7 @@ Under your current directory, you can also execute below command to have interac Please run the following command in Miniforge Prompt. 
```cmd - main -ngl 33 --interactive-first --color -e --in-prefix "<|start_header_id|>user<|end_header_id|>\n\n" --in-suffix "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" -r "<|eot_id|>" -m /Meta-Llama-3-8B-Instruct-Q4_K_M.gguf + llama-cli -ngl 33 --interactive-first --color -e --in-prefix "<|start_header_id|>user<|end_header_id|>\n\n" --in-suffix "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" -r "<|eot_id|>" -m /Meta-Llama-3-8B-Instruct-Q4_K_M.gguf -c 1024 ``` Below is a sample output on Intel Arc GPU: @@ -131,6 +136,7 @@ Launch the Ollama service: export OLLAMA_NUM_GPU=999 source /opt/intel/oneapi/setvars.sh export SYCL_CACHE_PERSISTENT=1 + # [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 # [optional] if you want to run on single GPU, use below command to limit GPU may improve performance export ONEAPI_DEVICE_SELECTOR=level_zero:0 @@ -147,6 +153,7 @@ Launch the Ollama service: set ZES_ENABLE_SYSMAN=1 set OLLAMA_NUM_GPU=999 set SYCL_CACHE_PERSISTENT=1 + rem under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation set SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 ollama serve @@ -160,6 +167,8 @@ Launch the Ollama service: > [!TIP] > When your machine has multi GPUs and you want to run on one of them, you need to set `ONEAPI_DEVICE_SELECTOR=level_zero:[gpu_id]`, here `[gpu_id]` varies based on your requirement. For more details, you can refer to [this section](../Overview/KeyFeatures/multi_gpus_selection.md#2-oneapi-device-selector). +> [!NOTE] +> The environment variable `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` determines the usage of immediate command lists for task submission to the GPU. While this mode typically enhances performance, exceptions may occur. Please consider experimenting with and without this environment variable for best performance. For more details, you can refer to [this article](https://www.intel.com/content/www/us/en/developer/articles/guide/level-zero-immediate-command-lists.html). ##### 2.2.2 Using Ollama Run Llama3 diff --git a/docs/mddocs/Quickstart/llama_cpp_quickstart.md b/docs/mddocs/Quickstart/llama_cpp_quickstart.md index 9b27c481ef1..cb0bdedb437 100644 --- a/docs/mddocs/Quickstart/llama_cpp_quickstart.md +++ b/docs/mddocs/Quickstart/llama_cpp_quickstart.md @@ -1,6 +1,6 @@ # Run llama.cpp with IPEX-LLM on Intel GPU -[ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp) prvoides fast LLM inference in in pure C++ across a variety of hardware; you can now use the C++ interface of [`ipex-llm`](https://github.com/intel-analytics/ipex-llm) as an accelerated backend for `llama.cpp` running on Intel **GPU** *(e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max)*. +[ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp) prvoides fast LLM inference in pure C++ across a variety of hardware; you can now use the C++ interface of [`ipex-llm`](https://github.com/intel-analytics/ipex-llm) as an accelerated backend for `llama.cpp` running on Intel **GPU** *(e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max)*. See the demo of running LLaMA2-7B on Intel Arc GPU below. @@ -14,9 +14,9 @@ See the demo of running LLaMA2-7B on Intel Arc GPU below. 
> [!NOTE] -> `ipex-llm[cpp]==2.5.0b20240527` is consistent with [c780e75](https://github.com/ggerganov/llama.cpp/commit/c780e75305dba1f67691a8dc0e8bc8425838a452) of llama.cpp. +> `ipex-llm[cpp]==2.2.0b20240826` is consistent with [62bfef5](https://github.com/ggerganov/llama.cpp/commit/62bfef5194d5582486d62da3db59bf44981b7912) of llama.cpp. > -> Our latest version is consistent with [62bfef5](https://github.com/ggerganov/llama.cpp/commit/62bfef5194d5582486d62da3db59bf44981b7912) of llama.cpp. +> Our latest version is consistent with [a1631e5](https://github.com/ggerganov/llama.cpp/commit/a1631e53f6763e17da522ba219b030d8932900bd) of llama.cpp. ## Table of Contents - [Prerequisites](./llama_cpp_quickstart.md#0-prerequisites) @@ -25,8 +25,6 @@ See the demo of running LLaMA2-7B on Intel Arc GPU below. - [Example: Running community GGUF models with IPEX-LLM](./llama_cpp_quickstart.md#3-example-running-community-gguf-models-with-ipex-llm) - [Troubleshooting](./llama_cpp_quickstart.md#troubleshooting) - - ## Quick Start This quickstart guide walks you through installing and running `llama.cpp` with `ipex-llm`. @@ -117,6 +115,7 @@ To use GPU acceleration, several environment variables are required or recommend ```bash source /opt/intel/oneapi/setvars.sh export SYCL_CACHE_PERSISTENT=1 + # [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 # [optional] if you want to run on single GPU, use below command to limit GPU may improve performance export ONEAPI_DEVICE_SELECTOR=level_zero:0 @@ -128,12 +127,16 @@ To use GPU acceleration, several environment variables are required or recommend ```cmd set SYCL_CACHE_PERSISTENT=1 + rem under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation set SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 ``` > [!TIP] > When your machine has multi GPUs and you want to run on one of them, you need to set `ONEAPI_DEVICE_SELECTOR=level_zero:[gpu_id]`, here `[gpu_id]` varies based on your requirement. For more details, you can refer to [this section](../Overview/KeyFeatures/multi_gpus_selection.md#2-oneapi-device-selector). +> [!NOTE] +> The environment variable `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` determines the usage of immediate command lists for task submission to the GPU. While this mode typically enhances performance, exceptions may occur. Please consider experimenting with and without this environment variable for best performance. For more details, you can refer to [this article](https://www.intel.com/content/www/us/en/developer/articles/guide/level-zero-immediate-command-lists.html). + ### 3. Example: Running community GGUF models with IPEX-LLM Here we provide a simple example to show how to run a community GGUF model with IPEX-LLM. @@ -146,7 +149,7 @@ Before running, you should download or copy community GGUF model to your current - For **Linux users**: ```bash - ./main -m mistral-7b-instruct-v0.1.Q4_K_M.gguf -n 32 --prompt "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" -t 8 -e -ngl 99 --color + ./llama-cli -m mistral-7b-instruct-v0.1.Q4_K_M.gguf -n 32 --prompt "Once upon a time, there existed a little girl who liked to have adventures. 
She wanted to go to places and meet new people, and have fun" -c 1024 -t 8 -e -ngl 99 --color ``` > **Note**: @@ -158,7 +161,7 @@ Before running, you should download or copy community GGUF model to your current Please run the following command in Miniforge Prompt. ```cmd - main -m mistral-7b-instruct-v0.1.Q4_K_M.gguf -n 32 --prompt "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" -t 8 -e -ngl 99 --color + llama-cli -m mistral-7b-instruct-v0.1.Q4_K_M.gguf -n 32 --prompt "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" -c 1024 -t 8 -e -ngl 99 --color ``` > **Note**: @@ -168,24 +171,10 @@ Before running, you should download or copy community GGUF model to your current #### Sample Output ``` Log start -main: build = 1 (38bcbd4) -main: built with Intel(R) oneAPI DPC++/C++ Compiler 2024.0.0 (2024.0.0.20231017) for x86_64-unknown-linux-gnu -main: seed = 1710359960 -ggml_init_sycl: GGML_SYCL_DEBUG: 0 -ggml_init_sycl: GGML_SYCL_F16: no -found 8 SYCL devices: -|ID| Name |compute capability|Max compute units|Max work group|Max sub group|Global mem size| -|--|---------------------------------------------|------------------|-----------------|--------------|-------------|---------------| -| 0| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| -| 1| Intel(R) FPGA Emulation Device| 1.2| 32| 67108864| 64| 67181625344| -| 2| 13th Gen Intel(R) Core(TM) i9-13900K| 3.0| 32| 8192| 64| 67181625344| -| 3| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| -| 4| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| -| 5| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53745299456| -| 6| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| -| 7| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53745299456| -detect 2 SYCL GPUs: [0,6] with Max compute units:512 -llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from ~/mistral-7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V2) +main: build = 1 (6f4ec98) +main: built with MSVC 19.39.33519.0 for +main: seed = 1724921424 +llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from D:\gguf-models\mistral-7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V2) llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. llama_model_loader: - kv 0: general.architecture str = llama llama_model_loader: - kv 1: general.name str = mistralai_mistral-7b-instruct-v0.1 @@ -210,18 +199,21 @@ llama_model_loader: - kv 19: general.quantization_version u32 llama_model_loader: - type f32: 65 tensors llama_model_loader: - type q4_K: 193 tensors llama_model_loader: - type q6_K: 33 tensors -llm_load_vocab: special tokens definition check successful ( 259/32000 ). 
+llm_load_vocab: special tokens cache size = 3 +llm_load_vocab: token to piece cache size = 0.1637 MB llm_load_print_meta: format = GGUF V2 llm_load_print_meta: arch = llama llm_load_print_meta: vocab type = SPM llm_load_print_meta: n_vocab = 32000 llm_load_print_meta: n_merges = 0 +llm_load_print_meta: vocab_only = 0 llm_load_print_meta: n_ctx_train = 32768 llm_load_print_meta: n_embd = 4096 +llm_load_print_meta: n_layer = 32 llm_load_print_meta: n_head = 32 llm_load_print_meta: n_head_kv = 8 -llm_load_print_meta: n_layer = 32 llm_load_print_meta: n_rot = 128 +llm_load_print_meta: n_swa = 0 llm_load_print_meta: n_embd_head_k = 128 llm_load_print_meta: n_embd_head_v = 128 llm_load_print_meta: n_gqa = 4 @@ -231,98 +223,135 @@ llm_load_print_meta: f_norm_eps = 0.0e+00 llm_load_print_meta: f_norm_rms_eps = 1.0e-05 llm_load_print_meta: f_clamp_kqv = 0.0e+00 llm_load_print_meta: f_max_alibi_bias = 0.0e+00 +llm_load_print_meta: f_logit_scale = 0.0e+00 llm_load_print_meta: n_ff = 14336 llm_load_print_meta: n_expert = 0 llm_load_print_meta: n_expert_used = 0 -llm_load_print_meta: causal attm = 1 +llm_load_print_meta: causal attn = 1 llm_load_print_meta: pooling type = 0 llm_load_print_meta: rope type = 0 llm_load_print_meta: rope scaling = linear llm_load_print_meta: freq_base_train = 10000.0 llm_load_print_meta: freq_scale_train = 1 -llm_load_print_meta: n_yarn_orig_ctx = 32768 +llm_load_print_meta: n_ctx_orig_yarn = 32768 llm_load_print_meta: rope_finetuned = unknown llm_load_print_meta: ssm_d_conv = 0 llm_load_print_meta: ssm_d_inner = 0 llm_load_print_meta: ssm_d_state = 0 llm_load_print_meta: ssm_dt_rank = 0 +llm_load_print_meta: ssm_dt_b_c_rms = 0 llm_load_print_meta: model type = 7B llm_load_print_meta: model ftype = Q4_K - Medium llm_load_print_meta: model params = 7.24 B -llm_load_print_meta: model size = 4.07 GiB (4.83 BPW) +llm_load_print_meta: model size = 4.07 GiB (4.83 BPW) llm_load_print_meta: general.name = mistralai_mistral-7b-instruct-v0.1 llm_load_print_meta: BOS token = 1 '' llm_load_print_meta: EOS token = 2 '' llm_load_print_meta: UNK token = 0 '' llm_load_print_meta: LF token = 13 '<0x0A>' -get_memory_info: [warning] ext_intel_free_memory is not supported (export/set ZES_ENABLE_SYSMAN=1 to support), use total memory as free memory -get_memory_info: [warning] ext_intel_free_memory is not supported (export/set ZES_ENABLE_SYSMAN=1 to support), use total memory as free memory -llm_load_tensors: ggml ctx size = 0.33 MiB +llm_load_print_meta: max token length = 48 +ggml_sycl_init: GGML_SYCL_FORCE_MMQ: no +ggml_sycl_init: SYCL_USE_XMX: yes +ggml_sycl_init: found 1 SYCL devices: +llm_load_tensors: ggml ctx size = 0.27 MiB llm_load_tensors: offloading 32 repeating layers to GPU llm_load_tensors: offloading non-repeating layers to GPU llm_load_tensors: offloaded 33/33 layers to GPU -llm_load_tensors: SYCL0 buffer size = 2113.28 MiB -llm_load_tensors: SYCL6 buffer size = 1981.77 MiB -llm_load_tensors: SYCL_Host buffer size = 70.31 MiB -............................................................................................... +llm_load_tensors: SYCL0 buffer size = 4095.05 MiB +llm_load_tensors: CPU buffer size = 70.31 MiB +.............................................................................................. 
llama_new_context_with_model: n_ctx = 512 +llama_new_context_with_model: n_batch = 512 +llama_new_context_with_model: n_ubatch = 512 +llama_new_context_with_model: flash_attn = 0 llama_new_context_with_model: freq_base = 10000.0 llama_new_context_with_model: freq_scale = 1 -llama_kv_cache_init: SYCL0 KV buffer size = 34.00 MiB -llama_kv_cache_init: SYCL6 KV buffer size = 30.00 MiB +[SYCL] call ggml_check_sycl +ggml_check_sycl: GGML_SYCL_DEBUG: 0 +ggml_check_sycl: GGML_SYCL_F16: no +found 1 SYCL devices: +| | | | |Max | |Max |Global | | +| | | | |compute|Max work|sub |mem | | +|ID| Device Type| Name|Version|units |group |group|size | Driver version| +|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------| +| 0| [level_zero:gpu:0]| Intel Arc Graphics| 1.3| 112| 1024| 32| 13578M| 1.3.27504| +llama_kv_cache_init: SYCL0 KV buffer size = 64.00 MiB llama_new_context_with_model: KV self size = 64.00 MiB, K (f16): 32.00 MiB, V (f16): 32.00 MiB -llama_new_context_with_model: SYCL_Host input buffer size = 10.01 MiB -llama_new_context_with_model: SYCL0 compute buffer size = 73.00 MiB -llama_new_context_with_model: SYCL6 compute buffer size = 73.00 MiB -llama_new_context_with_model: SYCL_Host compute buffer size = 8.00 MiB -llama_new_context_with_model: graph splits (measure): 3 -system_info: n_threads = 8 / 32 | AVX = 1 | AVX_VNNI = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | -sampling: - repeat_last_n = 64, repeat_penalty = 1.100, frequency_penalty = 0.000, presence_penalty = 0.000 +llama_new_context_with_model: SYCL_Host output buffer size = 0.12 MiB +llama_new_context_with_model: SYCL0 compute buffer size = 81.00 MiB +llama_new_context_with_model: SYCL_Host compute buffer size = 9.01 MiB +llama_new_context_with_model: graph nodes = 902 +llama_new_context_with_model: graph splits = 2 + +system_info: n_threads = 8 / 18 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | +sampling: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 top_k = 40, tfs_z = 1.000, top_p = 0.950, min_p = 0.050, typical_p = 1.000, temp = 0.800 mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 -sampling order: -CFG -> Penalties -> top_k -> tfs_z -> typical_p -> top_p -> min_p -> temperature -generate: n_ctx = 512, n_batch = 512, n_predict = 32, n_keep = 1 - Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun exploring the world around her. Her parents were kind and let her do what she wanted, as long as she stayed safe. -One day, the little -llama_print_timings: load time = 10096.78 ms +sampling order: +CFG -> Penalties -> top_k -> tfs_z -> typical_p -> top_p -> min_p -> temperature +generate: n_ctx = 512, n_batch = 2048, n_predict = 32, n_keep = 1 + + + Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun exploring the world. She lived in a small village where there weren't many opportunities for adventures, but that didn't stop her. 
She would often read +llama_print_timings: load time = xxxx ms llama_print_timings: sample time = x.xx ms / 32 runs ( xx.xx ms per token, xx.xx tokens per second) llama_print_timings: prompt eval time = xx.xx ms / 31 tokens ( xx.xx ms per token, xx.xx tokens per second) llama_print_timings: eval time = xx.xx ms / 31 runs ( xx.xx ms per token, xx.xx tokens per second) llama_print_timings: total time = xx.xx ms / 62 tokens Log end + ``` ### Troubleshooting -#### Fail to quantize model +#### 1. Unable to run the initialization script +If you are unable to run `init-llama-cpp.bat`, please make sure you have installed `ipex-llm[cpp]` in your conda environment. If you have installed it, please check if you have activated the correct conda environment. Also, if you are using Windows, please make sure you have run the script with administrator privilege in prompt terminal. + +#### 2. `DeviceList is empty. -30 (PI_ERROR_INVALID_VALUE)` error +On Linux, this error happens when devices starting with `[ext_oneapi_level_zero]` are not found. Please make sure you have installed level-zero, and have sourced `/opt/intel/oneapi/setvars.sh` before running the command. + +#### 3. `Prompt is too long` error +If you encounter `main: prompt is too long (xxx tokens, max xxx)`, please increase the `-c` parameter to set a larger size of context. + +#### 4. `gemm: cannot allocate memory on host` error / `could not create an engine` error +If you meet `oneapi::mkl::oneapi::mkl::blas::gemm: cannot allocate memory on host` error, or `could not create an engine` on Linux, this is probably caused by pip installed OneAPI dependencies. You should prevent installing like `pip install dpcpp-cpp-rt==2024.0.2 mkl-dpcpp==2024.0.0 onednn==2024.0.0`, and instead use `apt` to install on Linux. Please refer to [this guide](./install_linux_gpu.md) for more details. + +#### 5. Fail to quantize model If you encounter `main: failed to quantize model from xxx`, please make sure you have created related output directory. -#### Program hang during model loading +#### 6. Program hang during model loading If your program hang after `llm_load_tensors: SYCL_Host buffer size = xx.xx MiB`, you can add `--no-mmap` in your command. -#### How to set `-ngl` parameter +#### 7. How to set `-ngl` parameter `-ngl` means the number of layers to store in VRAM. If your VRAM is enough, we recommend putting all layers on GPU, you can just set `-ngl` to a large number like 999 to achieve this goal. If `-ngl` is set to 0, it means that the entire model will run on CPU. If `-ngl` is set to greater than 0 and less than model layers, then it's mixed GPU + CPU scenario. -#### How to specificy GPU +#### 8. How to specificy GPU If your machine has multi GPUs, `llama.cpp` will default use all GPUs which may slow down your inference for model which can run on single GPU. You can add `-sm none` in your command to use one GPU only. Also, you can use `ONEAPI_DEVICE_SELECTOR=level_zero:[gpu_id]` to select device before excuting your command, more details can refer to [here](../Overview/KeyFeatures/multi_gpus_selection.md#2-oneapi-device-selector). -#### Program crash with Chinese prompt +#### 9. Program crash with Chinese prompt If you run the llama.cpp program on Windows and find that your program crashes or outputs abnormally when accepting Chinese prompts, you can open `Region->Administrative->Change System locale..`, check `Beta: Use Unicode UTF-8 for worldwide language support` option and then restart your computer. 
For detailed instructions on how to do this, see [this issue](https://github.com/intel-analytics/ipex-llm/issues/10989#issuecomment-2105600469). -#### sycl7.dll not found error +#### 10. sycl7.dll not found error If you meet `System Error: sycl7.dll not found` on Windows or you meet similar error on Linux, please check: 1. if you have installed conda and if you are in the right conda environment which has pip installed oneapi dependencies on Windows 2. if you have executed `source /opt/intel/oneapi/setvars.sh` on Linux -#### Check driver first when you meet garbage output +#### 11. Check driver first when you meet garbage output If you meet garbage output, please check if your GPU driver version is >= [31.0.101.5522](https://www.intel.cn/content/www/cn/zh/download/785597/823163/intel-arc-iris-xe-graphics-windows.html). If not, please follow the instructions in [this section](./install_linux_gpu.md#install-gpu-driver) to update your GPU driver. +#### 12. Why my program can't find sycl device +If you meet `GGML_ASSERT: C:/Users/Administrator/actions-runner/cpp-release/_work/llm.cpp/llm.cpp/llama-cpp-bigdl/ggml-sycl.cpp:18283: main_gpu_id > [!NOTE] -> `ipex-llm[cpp]==2.5.0b20240527` is consistent with [v0.1.34](https://github.com/ollama/ollama/releases/tag/v0.1.34) of ollama. +> `ipex-llm[cpp]==2.2.0b20240826` is consistent with [v0.1.39](https://github.com/ollama/ollama/releases/tag/v0.1.39) of ollama. > -> Our current version is consistent with [v0.1.39](https://github.com/ollama/ollama/releases/tag/v0.1.39) of ollama. +> Our current version is consistent with [v0.3.6](https://github.com/ollama/ollama/releases/tag/v0.3.6) of ollama. ## Table of Contents - [Install IPEX-LLM for Ollama](./ollama_quickstart.md#1-install-ipex-llm-for-ollama) @@ -72,6 +72,7 @@ You may launch the Ollama service as below: export ZES_ENABLE_SYSMAN=1 source /opt/intel/oneapi/setvars.sh export SYCL_CACHE_PERSISTENT=1 + # [optional] under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 # [optional] if you want to run on single GPU, use below command to limit GPU may improve performance export ONEAPI_DEVICE_SELECTOR=level_zero:0 @@ -88,6 +89,7 @@ You may launch the Ollama service as below: set no_proxy=localhost,127.0.0.1 set ZES_ENABLE_SYSMAN=1 set SYCL_CACHE_PERSISTENT=1 + rem under most circumstances, the following environment variable may improve performance, but sometimes this may also cause performance degradation set SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 ollama serve @@ -102,6 +104,10 @@ You may launch the Ollama service as below: > [!TIP] > When your machine has multi GPUs and you want to run on one of them, you need to set `ONEAPI_DEVICE_SELECTOR=level_zero:[gpu_id]`, here `[gpu_id]` varies based on your requirement. For more details, you can refer to [this section](../Overview/KeyFeatures/multi_gpus_selection.md#2-oneapi-device-selector). +> [!NOTE] +> The environment variable `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` determines the usage of immediate command lists for task submission to the GPU. While this mode typically enhances performance, exceptions may occur. Please consider experimenting with and without this environment variable for best performance. For more details, you can refer to [this article](https://www.intel.com/content/www/us/en/developer/articles/guide/level-zero-immediate-command-lists.html). 
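As a concrete way to run that comparison, the sketch below does two passes on Linux with everything else kept identical; the model name and prompt are placeholders, and the token rates printed by `--verbose` can serve as the metric:

```bash
# Pass 1: immediate command lists enabled
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
./ollama serve &                 # start the service in the background
SERVE_PID=$!
sleep 5                          # give the server a moment to come up
./ollama run mistral "Why is the sky blue?" --verbose   # note the eval rate reported at the end
kill $SERVE_PID

# Pass 2: immediate command lists disabled
unset SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS
./ollama serve &
SERVE_PID=$!
sleep 5
./ollama run mistral "Why is the sky blue?" --verbose   # compare against the first pass
kill $SERVE_PID
```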
+ + The console will display messages similar to the following: @@ -185,22 +191,24 @@ An example process of interacting with model with `ollama run example` looks lik ### Troubleshooting +#### 1. Unable to run the initialization script +If you are unable to run `init-ollama.bat`, please make sure you have installed `ipex-llm[cpp]` in your conda environment. If you have installed it, please check if you have activated the correct conda environment. Also, if you are using Windows, please make sure you have run the script with administrator privilege in prompt terminal. -#### Why model is always loaded again after several minutes +#### 2. Why model is always loaded again after several minutes Ollama will unload model from gpu memory in every 5 minutes as default. For latest version of ollama, you could set `OLLAMA_KEEP_ALIVE=-1` to keep the model loaded in memory. Reference issue: https://github.com/intel-analytics/ipex-llm/issues/11608 -#### `exit status 0xc0000135` error when executing `ollama serve` +#### 3. `exit status 0xc0000135` error when executing `ollama serve` When executing `ollama serve`, if you meet `llama runner process has terminated: exit status 0xc0000135` on Windows or you meet `ollama_llama_server: error while loading shared libraries: libmkl_core.so.2: cannot open shared object file` on Linux, this is most likely caused by the lack of sycl dependency. Please check: 1. if you have installed conda and if you are in the right conda environment which has pip installed oneapi dependencies on Windows 2. if you have executed `source /opt/intel/oneapi/setvars.sh` on Linux -#### Program hang during initial model loading stage +#### 4. Program hang during initial model loading stage When launching `ollama serve` for the first time on Windows, it may get stuck during the model loading phase. If you notice that the program is hanging for a long time during the first run, you can manually input a space or other characters on the server side to ensure the program is running. -#### How to distinguish the community version of Ollama from the ipex-llm version of Ollama +#### 5. How to distinguish the community version of Ollama from the ipex-llm version of Ollama In the server log of community version of Ollama, you may see `source=payload_common.go:139 msg="Dynamic LLM libraries [rocm_v60000 cpu_avx2 cuda_v11 cpu cpu_avx]"`. But in the server log of ipex-llm version of Ollama, you should only see `source=payload.go:44 msg="Dynamic LLM libraries [cpu cpu_avx cpu_avx2]"`. -#### Ollama hang when multiple different questions is asked or context is long +#### 6. Ollama hang when multiple different questions is asked or context is long If you find ollama hang when multiple different questions is asked or context is long, and you see `update_slots : failed to free spaces in the KV cache` in the server log, this could be because that sometimes the LLM context is larger than the default `n_ctx` value, you may increase the `n_ctx` and try it again. 
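For raising `n_ctx`, the same Modelfile approach shown in the GraphRAG troubleshooting above applies; a minimal sketch with a placeholder model name and context size:

```bash
# Create a Modelfile that bumps the context window, then register it as a new model;
# afterwards, run or serve that new model instead of the original one.
cat > Modelfile <<'EOF'
FROM mistral:latest
PARAMETER num_ctx 8192
EOF
./ollama create mistral:latest-nctx8192 -f Modelfile
```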
\ No newline at end of file diff --git a/docs/mddocs/Quickstart/vLLM_quickstart.md b/docs/mddocs/Quickstart/vLLM_quickstart.md index 764b35c10a7..0982ba6b769 100644 --- a/docs/mddocs/Quickstart/vLLM_quickstart.md +++ b/docs/mddocs/Quickstart/vLLM_quickstart.md @@ -171,11 +171,12 @@ Below shows an example output using `Qwen1.5-7B-Chat` with low-bit format `sym_i > [!TIP] -> If your local LLM is running on Intel Arc™ A-Series Graphics with Linux OS (Kernel 6.2), it is recommended to additionaly set the following environment variable for optimal performance before starting the service: +> If your local LLM is running on Intel Arc™ A-Series Graphics with Linux OS (Kernel 6.2), setting the following environment variable before starting the service may potentially improve performance. > > ```bash > export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 > ``` +> The environment variable `SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS` determines the usage of immediate command lists for task submission to the GPU. While this mode typically enhances performance, exceptions may occur. Please consider experimenting with and without this environment variable for best performance. For more details, you can refer to [this article](https://www.intel.com/content/www/us/en/developer/articles/guide/level-zero-immediate-command-lists.html). ### 4. About Tensor Parallel diff --git a/docs/readthedocs/source/doc/LLM/Quickstart/install_linux_gpu.md b/docs/readthedocs/source/doc/LLM/Quickstart/install_linux_gpu.md index b4e2c3d501d..58e92a49437 100644 --- a/docs/readthedocs/source/doc/LLM/Quickstart/install_linux_gpu.md +++ b/docs/readthedocs/source/doc/LLM/Quickstart/install_linux_gpu.md @@ -25,13 +25,17 @@ IPEX-LLM currently supports the Ubuntu 20.04 operating system and later, and sup ```bash sudo apt-get update + + # Install out-of-tree driver sudo apt-get -y install \ gawk \ dkms \ linux-headers-$(uname -r) \ libc6-dev + sudo apt install intel-i915-dkms intel-fw-gpu - sudo apt-get install -y gawk libc6-dev udev\ + # Install Compute Runtime + sudo apt-get install -y udev \ intel-opencl-icd intel-level-zero-gpu level-zero \ intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \ libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ @@ -73,13 +77,17 @@ IPEX-LLM currently supports the Ubuntu 20.04 operating system and later, and sup ```bash sudo apt-get update + + # Install out-of-tree driver sudo apt-get -y install \ gawk \ dkms \ linux-headers-$(uname -r) \ libc6-dev + sudo apt install intel-i915-dkms intel-fw-gpu - sudo apt-get install -y gawk libc6-dev udev\ + # Install Compute Runtime + sudo apt-get install -y udev \ intel-opencl-icd intel-level-zero-gpu level-zero \ intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \ libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ diff --git a/python/llm/dev/benchmark/all-in-one/README.md b/python/llm/dev/benchmark/all-in-one/README.md index f1927e8e7d1..d17332b472c 100644 --- a/python/llm/dev/benchmark/all-in-one/README.md +++ b/python/llm/dev/benchmark/all-in-one/README.md @@ -4,6 +4,8 @@ All in one benchmark test allows users to test all the benchmarks and record the Before running, make sure to have [ipex-llm](../../../../../README.md) installed. 
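For reference, the usual flow with this benchmark is to adjust `config.yaml` (see the snippet below) and then launch the test script; a minimal sketch, assuming the `run.py` entry script that sits alongside `config.yaml` in this folder:

```bash
# Edit config.yaml first (model path, in_out_pairs, test_api entries), then:
cd python/llm/dev/benchmark/all-in-one
python run.py   # runs every configured test_api and records the results
```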
+> The prompts for benchmarking are from datasets [abisee/cnn_dailymail](https://huggingface.co/datasets/abisee/cnn_dailymail), [Open-Orca/OpenOrca](https://huggingface.co/datasets/Open-Orca/OpenOrca), [THUDM/LongBench](https://huggingface.co/datasets/THUDM/LongBench), etc. + ## Dependencies ```bash diff --git a/python/llm/dev/benchmark/all-in-one/config.yaml b/python/llm/dev/benchmark/all-in-one/config.yaml index db302e7364f..e94d001e6fd 100644 --- a/python/llm/dev/benchmark/all-in-one/config.yaml +++ b/python/llm/dev/benchmark/all-in-one/config.yaml @@ -11,6 +11,7 @@ low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) batch_size: 1 # default to 1 in_out_pairs: - '32-32' + - '960-64' - '1024-128' test_api: - "transformer_int4_fp16_gpu" # on Intel GPU, transformer-like API, (qtype=int4), (dtype=fp16) @@ -37,5 +38,6 @@ test_api: # - "transformers_int4_npu_win" # on Intel NPU for Windows, transformer-like API, (qtype=int4) cpu_embedding: False # whether put embedding to CPU streaming: False # whether output in streaming way (only available now for gpu win related test_api) +optimize_model: False # whether apply further optimization on NPU (only available now for transformers_int4_npu_win test_api) use_fp16_torch_dtype: True # whether use fp16 for non-linear layer (only available now for "pipeline_parallel_gpu" test_api) task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' diff --git a/python/llm/dev/benchmark/all-in-one/prompt/continuation/longbench_2k.txt b/python/llm/dev/benchmark/all-in-one/prompt/continuation/longbench_2k.txt new file mode 100644 index 00000000000..ce70bcba3b6 --- /dev/null +++ b/python/llm/dev/benchmark/all-in-one/prompt/continuation/longbench_2k.txt @@ -0,0 +1,8 @@ +You are given a report by a government agency. Write a one-page summary of the report. + +Report: +Justice Management Division (JMD) JMD provides the Federal Bureau of Prisons senior management with guidance as it relates to Department of Justice (DOJ) policy for all matters pertaining to organization, management, and administration, including the use of human capital flexibilities such as retention incentives. BOP is responsible for incarcerating all federal offenders sentenced to prison. To carry out its mission, BOP, under the oversight of DOJ’s JMD, manages the human resource operations of its institutions, including the use of retention incentives. BOP administers, monitors, and oversees retention incentives through its Central Office, regional offices, and institutions. Central Office. The Central Office serves as BOP’s headquarters and provides oversight of BOP operations and program areas. Within the Central Office is BOP’s Human Resource Management Division (HRMD) which is responsible for developing, implementing and administering human resource policies and programs, including the use of retention incentives that meet OPM and DOJ requirements. In addition, the Central Office’s Program Review Division (PRD) is responsible for assessing BOP programs, including human resources, to ensure that they are managed and operated effectively. Regional offices. BOP has six regional offices that cover the Mid- Atlantic, North Central, Northeast, South Central, Southeast, and Western regions of the United States. These offices, each led by a regional director, oversee the operations of the 122 federal institutions within their respective geographic regions of the country. 
According to BOP officials, regional office staff also provide local level oversight of institutions’ human capital programs, such as retention incentives, among other things. Institutions. BOP institutions are managed by a warden and other officials, including an executive assistant and associate warden who generally provide overall direction and, in part, administer the institution’s human capital policies, including policies on retention incentives. Correctional services staff represent the largest segment of each institution’s workforce and are responsible for the correctional treatment, custody, and supervision of inmates. Non-correctional services staff include, among others, those employees assigned to non-correctional services management, facility operations, and the health services unit. Workers in health services and psychology services are responsible for providing inmates with medical, dental, and mental health services and include, for example, dentists, pharmacists, physicians, nurses, psychologists, and drug treatment specialists. The Federal Employees Pay Comparability Act of 1990 first authorized OPM to allow federal agencies to give incentives, including retention incentives, to employees. The Federal Workforce Flexibility Act of 2004 provided federal agencies increased flexibilities regarding these incentives. For example, individual retention incentives that were capped at 25 percent of an employee’s basic pay rate could be increased up to 50 percent in cases of critical agency need with OPM’s approval. Generally, under OPM regulations, an agency is authorized to pay a retention incentive to employees. This happens when the agency determines that the unusually high or unique qualifications of the employee or a special need of the agency for the employee’s services makes it essential to retain the employee and that the employee would be likely to leave federal service in the absence of an incentive. In addition, OPM requires agencies to develop plans for using retention incentives outlining, in part, the required documentation for justifying the retention incentive and any criteria for determining the amount of incentive and the length of the service period. Generally, agencies must require that employees sign a written service agreement that outlines the terms of the service such as the employee’s agreement to remain a certain length of time with the agency. Additionally, according to OPM regulations, to qualify for a retention incentive, each employee must have a performance rating of at least “fully successful” or an agency’s equivalent performance rating. BOP funds the majority of its retention incentives through its Salaries and Expenses appropriation account which represented almost 93 percent of BOP’s budget in FY 2016. According to BOP officials, BOP’s Central Office allocates funding from the Salaries and Expenses account to the regional offices. These regional offices then determine how to allocate their budget among various salary and expense activities, including retention incentives. HRMD delegates retention incentive determinations to each institution. 
In accordance with OPM requirements and BOP’s October 2016 Program Statement on Compensation, the wardens make retention incentive requests based on documented evidence that the employee possesses unusually high or unique qualifications or meets a special need of the agency and has a performance rating of at least “successful or its equivalent.” These incentives are calculated as a percentage of the employee’s basic pay and are disbursed in installments to the employee each pay period. In addition to retention incentives, BOP has authority to provide other compensation-based human capital flexibilities to employees, in certain circumstances. The following summarizes some of the compensation- based human capital flexibilities that BOP uses in addition to retention incentives, to retain and recruit staff: Recruitment and relocation incentives. BOP pays recruitment incentives to new hires and relocation incentives to current employees who elect to move to a different geographic area, when a position is likely to be difficult to fill in the absence of an incentive. Student loan repayments. Using this authority, BOP may repay federally-insured student loans to attract job candidates or retain current employees. Special salary rates. With OPM approval, BOP may establish higher rates of pay for an occupation or group of occupations nationwide or in a local area when it finds the government’s recruitment or retention efforts are, or would likely become, significantly handicapped without those higher rates. Physicians and dental comparability allowances. Comparability allowances may be paid to certain eligible physicians or dental professionals who enter into service agreements. These allowances are paid only to categories of physicians and dentists for which the agency is experiencing recruitment and retention problems and are fixed at the minimum amounts necessary to deal with such problems. BOP retention incentive expenditures generally increased from $10.7 million in fiscal year 2012 to $14.0 million in fiscal year 2016. Additionally, as illustrated in table 1, the number of employees who received retention incentives increased each year from 2,024 employees in fiscal year 2012 to 2,460 employees in fiscal year 2016. In general, BOP employees who received retention incentives received the incentive for more than one year. For example, from fiscal year 2012 through fiscal year 2016, a total of 3,382 BOP employees received retention incentive payments. Of those, 82 percent (2,766 of 3,382) received retention incentive payments for at least 2 years and 39 percent received retention incentives all 5 years, as shown in figure 1. From fiscal years 2012 through 2016, BOP spent more than 97 percent of its total retention incentive expenditures on employees at four California institutions and for medical professionals nationwide. BOP’s total retention incentive expenditures for the four California institutions and medical professionals nationwide in fiscal year 2016 are provided in figure 2. Four California Institutions. The California institutions—United States Penitentiary (USP) Atwater, Federal Correctional Institution (FCI) Herlong, FCI Mendota, and Federal Correctional Complex (FCC) Victorville—constituted the largest portion of BOP’s total retention incentive expenditures, and the level of their expenditures remained relatively steady from fiscal year 2012 through 2016. 
BOP provides group retention incentives for staff at the General Schedule (GS) grades level 12 and below and those in the Federal Wage System at three institutions—USP Atwater, FCI Herlong, and FCC Victorville. BOP also provides individual retention incentives to its employees at GS grades level 12 and below and in the Federal Wage System at FCI Mendota. As shown in figure 3, our analysis of BOP data found that from fiscal years 2012 through 2016, these four California institutions had the largest percentage of retention incentive expenditures across institutions as well as the largest percentage of employees who received retention incentives. Additionally, the four California institutions’ retention incentive expenditures remained relatively steady—around $8.1 to $8.2 million during the 5-year period—even though the overall number of employees who received the incentives generally increased. BOP officials told us that these California institutions’ retention incentive expenditures remained relatively steady in spite of an overall increase in the number of employees receiving incentives, in part, because in fiscal year 2013 BOP reduced the retention incentive rate—the percentage of an employee’s basic pay that determines the employee’s retention incentive— by 3 percent at the four California institutions. BOP officials reported using retention incentives primarily at these four institutions to supplement correctional officers’ salaries and compensate for the gap between BOP’s and other institutions’ salaries. Specifically, officials told us that these four California institutions were consistently understaffed as a result of their lower salaries in comparison to salaries offered at California state and local prisons and at other BOP institutions in California metropolitan areas. The Department of Labor’s Bureau of Labor Statistics reports that the average salary for correctional officers in California in 2016 was $70,020. For the same year, the annual average salary for BOP correctional officers at these four institutions was $50,859. To bring these four California institutions’ salaries in line with those offered by state, local, and other BOP institutions in California metropolitan areas, BOP officials told us that they first use recruitment incentives to attract and hire staff and then provide retention incentives to employees with a performance rating of at least “successful.” Medical Professionals. From fiscal years 2012 through 2016, BOP retention incentive expenditures for medical professionals increased by an average of approximately 21 percent per year. Our analysis showed that most recently—for fiscal years 2015 and 2016—BOP retention incentive expenditures for medical professionals accounted for the largest portion of BOP’s total retention incentive expenditures across the various occupation groups and was primarily responsible for the overall increase in BOP’s total retention incentive expenditures from fiscal year 2012 through fiscal year 2016. For example, in fiscal year 2016, BOP spent approximately 42 percent of total retention incentives expenditures for medical professionals ($5.8 million), 27 percent on correctional officers ($3.8 million), and the remaining 31 percent on employees in other occupations. In total, BOP retention incentive expenditures for medical professionals increased from approximately $2.7 million in fiscal year 2012 to $5.8 million in fiscal year 2016, as shown in figure 4. 
The increase accounted for 92 percent of BOP’s total increase in retention incentive expenditures during the five-year period. In comparison, BOP’s retention incentive expenditures for correctional officers and all other occupations remained relatively steady from fiscal year 2012 through fiscal year 2016, increasing by an average of approximately 1 percent per year. According to our analysis, the increase in retention incentive expenditures for medical professionals during the five years is partially explained by the increase in the number of institutions providing retention incentives to medical professionals. Specifically, from fiscal years 2012 through 2016, the number of institutions providing retention incentives to medical professionals increased from 53 institutions with 341 employees in medical occupations receiving retention incentives to 84 institutions providing retention incentives to a total of 646 employees in medical occupations. According to BOP officials, BOP primarily uses retention incentives for medical professionals in an effort to retain these employees by supplementing BOP salaries which are generally lower than salaries offered to medical professionals in the private sector. Officials told us that BOP has designated medical professional positions as hard-to-fill and, therefore, BOP retaining these professionals in a correctional setting requires the use of a variety of incentives, including retention incentives, in order to increase pay. BOP has a number of internal controls in place to ensure that retention incentive applications meet BOP and other requirements. BOP officials told us that these controls are part of a multilayered application and review process that begins at the institution and culminates at BOP’s Central Office. Our review of a random sample of 40 application packet case files for retention incentives awarded from fiscal year 2014 through fiscal year 2016 found that they all generally incorporated the internal controls described by officials. The key controls in this process include: Application review at the institution and regional levels. According to BOP officials, the retention incentive application process begins with an institution’s human resources office, whose staff complete a retention incentive application on behalf of an employee. The institution’s human resources office verifies that the information in the application justifies a retention incentive and that funds are available to pay the incentive. Although it is not required, BOP officials said that they use a retention incentive application checklist to help institutions ensure that retention incentive applications are complete. The institution’s human resources office then submits the completed application packet, which includes supporting documentation, to the warden for review. Next, the application packet is forwarded to the respective BOP regional director who also reviews it for accuracy and completeness. The regional director then adds an approval statement and forwards the packet to the Central Office for final review and approval. Of the 40 randomly selected application packet case files that we reviewed, 36 included a retention incentive checklist used by the institutions and all contained information to justify the retention incentive as well as a statement of the regional director’s approval. Central Office’s final application approval. 
BOP policy requires that all retention incentive applications undergo two levels of review in BOP’s Central Office: first by the Human Resource Management Division’s (HRMD) Staffing and Employee Relations Section (SERS) and next by HRMD’s Personnel Director, for final review and approval. According to BOP officials, during the review process there is ongoing communication between the various entities to ensure that applications are complete and accurate; for example, if SERS finds an error in the application or requests additional information, SERS returns the application to the regional or institutional level for correction and re-review. All of the 40 BOP application packet case files that we reviewed included approvals by HRMD’s Personnel Director or an authorized official, as required by BOP policy. Annual review and re-certification to continue retention incentives. According to BOP policy, on an annual basis, institutions’ human resources offices are required to review employees’ retention incentives to determine whether the incentive is still warranted. Payment of a retention incentive may be recertified and continued as long as the conditions giving rise to the original determination to pay the incentive still exist and funds are available. For each retention incentive, an institution’s human resources office must determine whether to continue, adjust, or terminate the incentive within one year of the initial or most recent approval. If the human resources office decides to continue the retention incentive, the institution’s warden must again submit a retention incentive application. Applications to continue the retention incentive proceed through the same review and approval process as initial applications. Of the 40 application files that we reviewed, 29 were continuations and 8 were initial requests for a retention incentive. According to BOP officials, after the initial approval of a retention incentive, an institution’s human resources office has primary responsibility for the monitoring of retention incentive payments. According to officials, institutions use a variety of internal controls to monitor the expiration, continuation, or termination of retention incentives, for example: Monitoring expiration dates. BOP officials stated that institutions’ human resources offices monitor retention incentives in order to identify incentives that are approaching their expiration date and need to be terminated or renewed. For example, according to BOP officials from USP Atwater, FCC Butner and FCI Phoenix, staff from their institutions’ human resources offices may generate a retention incentive activity report and cross reference this report with their locally generated tracking sheets. This process helps identify retention incentives approaching their expiration dates so that the human resources offices can submit a request for continuation before the incentive expires. Using automated reminders to prompt file review. BOP officials stated that institutions use automated reminders to alert human resources staff to check the records of retention incentive recipients for human resources-related events such as promotions or relocations that could affect the continuation of a retention incentive. Following a checklist of steps for relocation processes. BOP officials told us that in April 2016 they instituted a checklist that outlines steps that an institution’s human resources staff must take when employees relocate to a different institution. 
Based on our review of this checklist, one step on the sheet prompts human resources staff to review the employee’s retention incentive. According to BOP policy, when an employee receiving a retention incentive transfers to another location, the human resources office where the employee was receiving the retention incentive is responsible for submitting a request to terminate the incentive. The termination must be effective the last day of the pay period that the employee occupies the position. Submitting forgiveness waivers. BOP officials told us that institutions submit forgiveness waivers if a request to continue a retention incentive is not submitted and approved prior to the retention incentive expiring. BOP officials said that a forgiveness waiver is considered an acknowledgement of an administrative error and is a late submission of a retention incentive renewal that was still warranted. The waiver is not a request to forgive an overpayment since the employee was still considered to be eligible for the retention incentive. Of the 40 retention incentive applications that we reviewed, 5 applications included forgiveness waivers to excuse the tardiness of the filing and request continuations of the retention incentive. According to BOP officials, BOP conducts periodic audits and reviews of its human capital activities and related internal controls, to ensure that retention incentives are being used appropriately. The following offices conduct various audits and reviews involving BOP’s retention incentives: BOP’s Program Review Division (PRD) audits regional and institutional human resources functions. PRD audits BOP’s regional and institutional human resources offices to ensure that they are in compliance with BOP policies and procedures. According to BOP officials, as part of the audit process, PRD audits retention incentives to ensure that they have the proper approvals and are justified. PRD audits each institution’s human resources office at least every three years. During these audits, PRD generates retention incentive activity reports (the same reports that institutions run when monitoring for expiration dates), to check the accuracy of retention incentive programs under review. Following each audit, PRD issues a final report with findings to the institution and to the staff operating the program area under audit. Institutions respond to the report with corrective actions that the institution will take to address the findings. When the institution has resolved all corrective actions from the audit, the audit is closed. Additionally, each quarter, PRD provides HRMD with a report that summarizes its quarterly audit findings. According to BOP officials, HRMD uses these reports to identify any agency-wide trends that need to be addressed. Our review of BOP data showed that between fiscal years 2012 and 2016, PRD conducted nearly 200 audits. For example, in the fourth quarter of fiscal year 2016, PRD audited five institutions’ and regional offices’ human resource management functions. During these audits, PRD identified nine deficiencies, one of which pertained to retention incentives. Specifically, it found that one audited institution did not terminate an employee’s retention incentive after the employee had relocated to another institution. To correct the deficiency, the institution cancelled the retention incentive which discontinued future disbursements. According to BOP officials, a bill was generated to recoup the overpayment from the employee. 
BOP institutions conduct annual operation reviews of internal functions, such as human resources. BOP officials told us that each institution conducts annual operational reviews of various internal functions, such as human resources. According to BOP’s Program Review Guidelines for Human Resource Servicing Offices, during these reviews, institutions are required to review supporting documentation for staff currently receiving an incentive to determine if the incentives are still warranted. If the initial request for the retention incentive was made over the preceding 12 months, institutions are also required to ensure that it was approved. According to BOP officials, the results of these reviews are reported to PRD through the Central Office. DOJ’s Justice Management Division (JMD) audits BOP’s human resources programs. According to BOP officials, JMD conducts audits of component-level human resources programs to determine whether BOP’s systems are compliant with DOJ policy and aligned with DOJ’s Human Capital Strategic Plan. JMD’s most recent audit of BOP’s human resources programs that included a review of BOP’s retention incentives occurred in September 2010 at BOP’s Human Resource Service Center in Grand Prairie, Texas. JMD found that in some cases BOP granted retention incentives prior to the signing of service agreements. JMD also found that BOP lacked documentation to authorize a group retention incentive for employees at its Victorville, California institution. BOP’s written response to the findings stated that JMD incorrectly applied the service agreement requirement, as service agreements were not warranted in the specific case that it identified. Additionally, BOP stated that the documents JMD identified as missing from the case files in question were kept in separate files and not required to be part of the retention incentive application. JMD agreed with BOP’s responses and in January 2013, JMD closed out the audit’s findings noting that these responses satisfied all required corrective actions. While BOP takes a number of steps to determine current workforce needs and how to fill those needs, BOP does not strategically plan for how retention incentives can be used to meet long-term human capital goals. BOP officials stated that planning for human capital needs is conducted at institutions during quarterly workforce utilization meetings or manpower salary meetings. During these meetings, executive staff at the institution discuss the current state of the institution’s workforce. According to the BOP officials, while considering attrition, hiring, and turnover rates, the executive staff decide strategies they will employ to attract and retain employees for their current needs. While officials we spoke with at four institutions have discussed retention incentives at their workforce utilization meetings, details about the content of these discussions ranged. According to these officials and our review of meeting minutes from the four institutions, discussions about retention incentives respond to each institution’s short-term staffing situation rather than address future staffing needs based on an overall strategic human capital plan. For example: USP Atwater officials told us that they review the current turnover rate, budget, projected vacancies, and use of retention incentives at annual budget development meetings. 
Meeting minutes reflected the following on retention incentives: “retention … still necessary to retain staff and hard-to-fill positions.” FCC Butner is a medical facility that offers retention incentives to all medical officers (all types of doctors) and nurses (practitioners, registered, etc.) at the institution. According to Butner officials, during workforce utilization meetings, Butner officials discuss recruitment and staffing trends for the institution and plans for how to address any staffing challenges. Meeting minutes we reviewed did not indicate specific discussions about the use of retention incentives. FCC Pollock executive staff discuss current institutional salary expenditures and projections and the status of vacant positions at workforce utilization meetings. While meeting minutes we reviewed indicated discussions about projected expenditures for incentive awards, the minutes did not differentiate between retention incentive awards, and other incentive awards such as recruitment or relocation incentive awards. FCI Phoenix officials stated that in their workforce utilization meetings, executive staff discuss salary projections and vacancy statuses. Meeting minutes we reviewed did not indicate specific discussions about the use of retention incentives. BOP decisions about retention incentives are currently not tied to any strategic human capital plan for how to use human capital flexibilities— such as retention incentives—to address their ongoing challenge of retaining staff in hard-to-fill positions. According to officials, retention incentives are awarded on an as-needed basis, determined by an institution’s warden, if funds are available. According to key principles for effective strategic human capital planning, such planning is an important component of an agency’s effort to develop long-term strategies for acquiring, developing, and retaining staff needed for an agency to achieve its goals. Specifically, senior leaders should be involved in developing, communicating, and implementing strategic human capital plans. Within an agency’s strategic human capital plan, the human capital policies, practices, and programs—for example, an agency’s retention incentive program—should clearly link to the human capital and program goals of the organization. By not having a strategic human capital plan that clearly establishes strategies that will be used to achieve specific human capital goals, BOP cannot ensure that its institutions are strategically managing their workforces in a manner that meets the agency’s human capital needs. In August 2017, BOP officials told us that they began drafting a strategic human capital operating plan that will include strategic objectives, action plans, performance objectives and measures, and evaluation/reporting requirements. Officials stated that the plan will also include planning regarding the use of human capital flexibilities, such as retention incentives. BOP officials told us that they anticipate that the strategic human capital operating plan will be a supplement to their workforce utilization meetings and that an agency-wide plan will provide a set of strategies for all institutions to consider. However, BOP could not provide documentation of the project beginning or whether it would include a strategic approach specific to retention incentives. 
Including retention incentives in BOP’s strategic human capital operating plan would create a roadmap for the agency and the institutions to use to move from being reactive in their current workforce needs—for example, awarding retention incentives on an ad hoc basis when funds are available—to being strategic in how retention incentives are used and to ensure that these and other flexibilities help the agency achieve its long-term workforce goals. From fiscal year 2012 through fiscal year 2016, BOP spent more than $59 million on retention incentives but has not established any measures to evaluate their effectiveness. According to officials, BOP has not evaluated the effectiveness of its use of retention incentives because BOP officials consider a retention incentive successful if an employee does not leave the agency. However, BOP also uses other human capital flexibilities along with retention incentives to help retain staff. For example, BOP uses physician and dental comparability allowances—additional pay to a physician or dentist who enters into an agreement for a specified period of service—to help retain these medical personnel. According to officials, it would otherwise be difficult to compete with private sector salaries without the use of all available incentives. However, BOP has not studied whether or how retention incentives have contributed to employees’ retention in relation to other incentives such as physician and dental comparability allowances. According to our work on strategic human capital management and OPM’s guidance, it is crucial for organizations to evaluate the success of their human capital strategies, such as the use of retention incentives. In measuring the performance of these strategies and their contribution to key programmatic results, agencies can make adjustments, if necessary. For example, agencies can use evaluation results to make targeted investments in certain human capital strategies—such as the use of retention incentives—creating a cycle of strategic workforce management, where evaluation informs planning, planning dictates strategies, and strategies are evaluated for effectiveness. While BOP uses retention incentives to address critical skills gaps—such as with medical professionals—evaluating the effectiveness of retention incentives would help BOP determine whether and how retention incentives, as well as other human capital flexibilities, contribute to an employee’s continued employment at BOP or if adjustments to BOP retention strategies must be made for improved results. BOP officials agreed that evaluating the effectiveness of retention incentives would help them be more strategic about their human capital needs and spending on incentives. By including and implementing such an evaluation in its upcoming strategic human capital operating plan, BOP could better determine if it is making maximum use of its funds to retain the necessary qualified personnel or if changes must be made to most effectively retain its staff. As the largest employer within DOJ with some staff working in remote locations and undesirable conditions, BOP relies on a number of available flexibilities, including retention incentives, to help retain its employees. However, BOP currently lacks a strategic approach for using and evaluating retention incentives to address human capital goals. 
Given BOP’s ongoing staffing challenges, for example, retaining staff in hard-to- fill medical positions, developing a plan that includes a thoughtful blueprint for using retention incentives could help BOP better anticipate and address staffing needs. Moreover, evaluating its use of retention incentives could help BOP determine whether these incentives are effective or whether adjustments are needed to better retain its employees. By using evaluation results to inform planning, and planning to inform how retention incentives are used, BOP would be better positioned to achieve its long-term human capital goals and address its critical staffing needs. We are making two recommendations to BOP: 1. The Director of BOP should include in the forthcoming strategic human capital operating plan, 1) human capital goals and 2) strategies on how human capital flexibilities—including retention incentives—will be used to meet these goals. (Recommendation 1) 2. The Director of BOP should evaluate the effectiveness of BOP’s use of retention incentives to help determine whether the incentives have helped BOP achieve its human capital goals or if adjustments in retention incentives are needed. (Recommendation 2) We requested comments on a draft of this report from DOJ. In an email received November 15, 2017, the DOJ liaison stated that DOJ concurred with our recommendations. The Department did not provide official written comments to include in our report, but did provide written technical comments, which we incorporated as appropriate. As agreed with your office, unless you publicly announce the contents of this report earlier, we plan no further distribution until 30 days from the report date. At that time, we will send copies of this report to the Attorney General and the Director of BOP. In addition, the report is available at no charge on the GAO website at http://www.gao.gov. If you or your staff have any questions about this report, please contact me at (202) 512-9627 or maurerd@gao.gov. Contact points for our Offices of Congressional Relations and Public Affairs may be found on the last page of this report. GAO staff who made key contributions to this report are listed in Appendix III. This report examines (1) how BOP has used its authority to pay retention incentives; (2) what internal controls are in place for the use of retention incentives; and (3) the extent to which BOP plans for and evaluates the use of retention incentives. To determine how BOP has used its authority to pay retention incentives, we reviewed BOP’s July 2012 report on its use of recruitment, relocation, and retention (3R) incentives. We then obtained underlying retention incentive expenditure data from DOJ’s Justice Management Division because it serves as the focal point for performance and financial information for all Department of Justice components and employees, including BOP. In particular, we obtained employee-level retention incentive payroll data for fiscal years 2012 through 2016. We selected this time period because it includes the most recent five complete fiscal years for which data were available and because we believe five years is sufficient time to identify trends in BOP’s retention incentive expenditures. We analyzed and aggregated the employee-level data by institution, occupation, and employee grade level. To identify trends, we compared per fiscal year expenditures across the various categories of occupations and locations across the five years. 
Additionally, we categorized institutions by BOP region, institutions that use group retention incentives, and institutions that use individual retention incentives. We also categorized occupations as medical professionals, correctional officers, and all other occupations and compared aggregate retention incentive expenditures for the different groups. Using information from BOP’s website and testimonial evidence from BOP officials on its health care system, for the purposes of this report, we defined medical professionals as BOP employees in occupations that provide medical, dental, and mental health care services and who do not solely provide these services in an administrative function. For the purposes of our analyses, medical professionals are dentists, dental assistants and hygienists, diagnostic radiological technologists, health aid and technicians, medical doctors (including psychiatrists), medical technologists, nurses, pharmacists, pharmacy technicians, physician assistants, and practical nurses and psychologists. To assess the employee-level retention incentive payroll data’s reliability, we obtained and analyzed documentation on systems’ capabilities and data control, interviewed data users and managers responsible for maintaining data, conducted checks for completeness and logical consistency, and compared the employee-level data to aggregated institution-level retention incentive expenditure data from BOP’s Financial Management Information System. We found the employee-level data to be sufficiently reliable for the purpose of this report. Additionally for this objective, we reviewed documents such as the DOJ’s Financial Management Information System Sub-Object Classification Code Guide and the Office of Personnel Management (OPM) Handbook of Occupational Groups and Families to respectively identify the system codes used to track retention incentives expenditures and to identify the names for each occupational series code in the datasets. We also interviewed BOP Human Resource Management headquarters officials to obtain information on the primary purposes for BOP’s use of retention incentives and their views on identified retention incentive expenditures trends. We also interviewed U.S. Department of Health and Human Services’ (HHS) Public Health Service (PHS) officials to better understand how BOP and PHS manage costs, including retention incentive expenditures, for PHS staff assigned to BOP. BOP partners with PHS to acquire medical staff to provide medical care for BOP’s inmate population. BOP reimburses PHS for the costs of compensation and benefits—including retention incentive payments, if applicable—for PHS staff assigned to BOP. PHS has final approval authority for retention incentives paid to PHS staff assigned to BOP facilities. Furthermore, we obtained aggregated retention incentive expenditure data from PHS on the total amount of funds BOP reimbursed PHS for fiscal years 2012 through 2016. For the reliability of PHS’s data, we reviewed the system’s data fields to check that the appropriate fields were used to provide data and interviewed data users and managers to discuss how expenditures are recorded and maintained. We found the PHS data to be sufficiently reliable for the purpose of this report. To identify and describe the internal controls that BOP has in place related to retention incentives, we obtained and analyzed documentation regarding BOP requirements and guidance for the use of retention incentives. 
We also interviewed officials from BOP’s Central Office who are responsible for the administration, management, and oversight of BOP’s human capital management systems, including retention incentives. We focused on the management and administrative controls used by BOP to review, approve, re-certify, and monitor retention incentives. Additionally, we interviewed the warden and human capital officers at 4 of the 122 institutions to obtain illustrative examples regarding the internal controls in place at these institutions to ensure the proper disbursement of retention incentives. We interviewed BOP officials at Federal Correctional Complex Pollock in Pollock, LA; Federal Correctional Complex Butner in Butner, NC; United States Penitentiary, Atwater in Atwater, CA and Federal Correctional Institution Phoenix, in Phoenix, AZ. These institutions were selected to ensure variation in the number and types of employees receiving retention incentives, BOP region, and security-level. Although the information we obtained from the interviews with officials at these four institutions cannot be generalized to other BOP institutions, these interviews provided important insights and perspectives about the use of retention incentives at BOP institutions. We also reviewed a non-generalizable random sample of 40 retention incentive application packet case files to determine the extent to which these files contained documentation on the internal control activities in place to monitor the application, approval, and funds disbursement processes of BOP’s retention incentive program. To identify our sample, we used employee-level expenditure data to randomly select 40 application files from the universe of BOP employees who received retention incentives from fiscal years 2014 through 2016. Each application file was reviewed by two GAO analysts who each assessed the extent to which each application contained the appropriate justification, approval signatures, and other documentation such as an application checklist and whether the application was an initial or continuation application. To determine the extent to which BOP plans for and evaluates the use of retention incentives, we interviewed BOP officials regarding their experiences with retention incentives, how they use retention incentives to strategically manage their workforce needs, how the agency evaluates the effectiveness of retention incentives, and how retention incentives contribute to BOP’s broader human capital goals. We then compared these efforts to our work on strategic human capital planning, specifically in terms of planning for and evaluating the use of human capital flexibilities. Additionally, we interviewed the warden and human capital officers at four BOP institutions mentioned above to obtain illustrative examples of how workforce planning occurs at these institutions. We also reviewed the DOJ’s Office of Inspector General Report 16-02 “Review of the Federal Bureau of Prisons’ Medical Staffing Challenges” (March 2016) and our past work to better understand the challenges that BOP faces in retaining medical professionals and other staff. Table 2 provides the Bureau of Prisons’ (BOP) fiscal year 2016 retention incentive expenditures by various occupations and groups of occupations, such as medical professionals, correctional officers, and other occupations. 
A range of occupations are reflected in the table primarily as a result of four California institutions—United States Penitentiary (USP) Atwater, Federal Correctional Institution (FCI) Herlong, FCI Mendota, and Federal Correctional Complex Victorville—providing retention incentives to all employees at General Schedule grades level 12 and below and those in the Federal Wage System. In addition to the contact named above, Dawn Locke (Assistant Director) and Meghan Squires (Analyst-in-Charge) managed the work. Also, David Alexander, Renee Caputo, Willie Commons III, Jamarla Edwards, Robert Goldenkoff, Chelsa Gurkin, Eric Hauswirth, Janice Latimer, Lerone Reid, Rachel Stoiko, and Adam Vogt made significant contributions to this report. + +Now, write a one-page summary of the report. + +Summary: \ No newline at end of file diff --git a/python/llm/dev/benchmark/all-in-one/prompt/continuation/longbench_4k.txt b/python/llm/dev/benchmark/all-in-one/prompt/continuation/longbench_4k.txt new file mode 100644 index 00000000000..64e1923bc3f --- /dev/null +++ b/python/llm/dev/benchmark/all-in-one/prompt/continuation/longbench_4k.txt @@ -0,0 +1,760 @@ +Please complete the code given below. +app/src/main/java/com/matejdro/pebbledialer/callactions/EndCallAction.java +public class EndCallAction extends CallAction +{ + public static final int END_CALL_ACTION_ID = 1; + + private PendingIntent notificationEndCallIntent; + private static Method getITelephonyMethod; + + public EndCallAction(CallModule callModule) + { + super(callModule); + + try { + getITelephonyMethod = TelephonyManager.class.getDeclaredMethod("getITelephony", (Class[]) null); + getITelephonyMethod.setAccessible(true); + } catch (NoSuchMethodException e) { + Timber.e(e, "iTelephony end not supported on your phone!"); + } catch (Exception e) { + Timber.e(e, "Error while acquiring iTelephony"); + Crashlytics.logException(e); + } + + } + + public void registerNotificationEndCallIntent(PendingIntent notificationAnswerIntent) + { + this.notificationEndCallIntent = notificationAnswerIntent; + } + + @Override + public void executeAction() + { + getCallModule().setCloseAutomaticallyAfterThisCall(true); + + if (getCallModule().getService().getGlobalSettings().getBoolean("rootMode", false)) + { + Timber.d("Ending call using root method..."); + try { + Runtime.getRuntime().exec(new String[] {"su", "-c", "input keyevent 6"}); + return; + } catch (IOException e) { + e.printStackTrace(); + } + } + + if (getCallModule().getCallState() == CallModule.CallState.RINGING && notificationEndCallIntent != null) + { + Timber.d("Ending call using notification method..."); + + try { + notificationEndCallIntent.send(); + return; + } catch (PendingIntent.CanceledException e) { + } + } + + if (getITelephonyMethod != null) + { + Timber.d("Ending call using generic iTelephony method..."); + try + { + ITelephony iTelephony = (ITelephony) getITelephonyMethod.invoke(getCallModule().getService().getSystemService(Context.TELEPHONY_SERVICE), (Object[]) null); + iTelephony.endCall(); + return; + } + catch (SecurityException e) + { + Timber.e("Cannot decline call, no CALL_PHONE permission."); + } + catch (Exception e) { + Timber.e(e, "Error while invoking iTelephony.endCall()"); + Crashlytics.logException(e); + } + } + + Timber.e("All end call options failed! 
Nothing is supported."); + } + + @Override + public void onCallEnd() + { + notificationEndCallIntent = null; //Reset intent (there will be new intent at next call) + } + + @Override + public int getIcon() + { + return CallAction.ICON_BUTTON_END_CALL; + } + + public static EndCallAction get(CallModule callModule) + { + return (EndCallAction) callModule.getCallAction(END_CALL_ACTION_ID); + } +} +app/src/main/java/com/matejdro/pebbledialer/callactions/AnswerCallAction.java +public class AnswerCallAction extends CallAction +{ + public static final int ANSWER_ACTION_ID = 0; + + private PendingIntent notificationAnswerIntent; + + public AnswerCallAction(CallModule callModule) + { + super(callModule); + } + + public void registerNotificationAnswerIntent(PendingIntent notificationAnswerIntent) + { + this.notificationAnswerIntent = notificationAnswerIntent; + } + + @Override + public void executeAction() + { + if (getCallModule().getCallState() != CallModule.CallState.RINGING) + return; + + if (getCallModule().getService().getGlobalSettings().getBoolean("rootMode", false)) + { + Timber.d("Answering using root method..."); + try { + Runtime.getRuntime().exec(new String[] {"su", "-c", "input keyevent 5"}); + return; + } catch (IOException e) { + e.printStackTrace(); + } + + } + + if (notificationAnswerIntent != null) + { + Timber.d("Answering using notification method..."); + + try { + notificationAnswerIntent.send(); + return; + } catch (PendingIntent.CanceledException e) { + } + } + + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) + { + answerNativelyOreo(); + } + else if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.LOLLIPOP) + { + answerUsingMediaServer(); + } + else + { + Timber.d("Answering using generic headset hook method..."); + Intent buttonUp = new Intent(Intent.ACTION_MEDIA_BUTTON); + buttonUp.putExtra(Intent.EXTRA_KEY_EVENT, new KeyEvent(KeyEvent.ACTION_UP, KeyEvent.KEYCODE_HEADSETHOOK)); + getCallModule().getService().sendOrderedBroadcast(buttonUp, "android.permission.CALL_PRIVILEGED"); + } + + } + + @TargetApi(Build.VERSION_CODES.O) + private void answerNativelyOreo() { + TelecomManager telecomManager + = (TelecomManager) getCallModule().getService().getSystemService(Context.TELECOM_SERVICE); + + Timber.d("Answering natively with Oreo."); + + try { + telecomManager.acceptRingingCall(); + } catch (SecurityException e) { + Timber.e("No accept call permission!"); + } + } + + @TargetApi(Build.VERSION_CODES.LOLLIPOP) + private void answerUsingMediaServer() + { + Timber.d("Answering using media server method..."); + + MediaSessionManager mediaSessionManager = (MediaSessionManager) getCallModule().getService().getSystemService(Context.MEDIA_SESSION_SERVICE); + + try { + List mediaControllerList = mediaSessionManager.getActiveSessions + (new ComponentName(getCallModule().getService(), JellybeanNotificationListener.class)); + + for (MediaController m : mediaControllerList) { + if ("com.android.server.telecom".equals(m.getPackageName())) { + Timber.d("Found telephony media controller!"); + m.dispatchMediaButtonEvent(new KeyEvent(KeyEvent.ACTION_UP, KeyEvent.KEYCODE_HEADSETHOOK)); + break; + } + } + } catch (SecurityException e) { + Timber.e("Notification service not running!"); + } + } + + @Override + public void onCallEnd() + { + notificationAnswerIntent = null; //Reset intent (there will be new intent at next call) + } + + @Override + public int getIcon() + { + return CallAction.ICON_BUTTON_ANSWER; + } + + public static AnswerCallAction get(CallModule callModule) + { + return 
(AnswerCallAction) callModule.getCallAction(ANSWER_ACTION_ID); + } +} +app/src/main/java/com/matejdro/pebbledialer/callactions/SMSReplyAction.java +public class SMSReplyAction extends CallAction +{ + public static final int SMS_REPLY_ACTION_ID = 6; + + public SMSReplyAction(CallModule callModule) + { + super(callModule); + } + + @Override + public void executeAction() + { + ToggleRingerAction toggleRingerAction = ToggleRingerAction.get(getCallModule()); + toggleRingerAction.mute(); + + SMSReplyModule smsReplyModule = SMSReplyModule.get(getCallModule().getService()); + smsReplyModule.startSMSProcess(getCallModule().getNumber()); + + getCallModule().setCloseAutomaticallyAfterThisCall(false); + } + + @Override + public void onCallEnd() + { + } + + @Override + public int getIcon() + { + return CallAction.ICON_BUTTON_END_CALL; + } + + public static SMSReplyAction get(CallModule callModule) + { + return (SMSReplyAction) callModule.getCallAction(SMS_REPLY_ACTION_ID); + } +} +app/src/main/java/com/matejdro/pebbledialer/callactions/ToggleMicrophoneAction.java +public class ToggleMicrophoneAction extends CallAction +{ + public static final int TOGGLE_MICROPHONE_ACTION_ID = 3; + + private boolean microphoneMuted = false; + + public ToggleMicrophoneAction(CallModule callModule) + { + super(callModule); + } + + @Override + public void executeAction() + { + if (getCallModule().getCallState() != CallModule.CallState.ESTABLISHED) + return; + + microphoneMuted = !microphoneMuted; + + if (getCallModule().getService().getGlobalSettings().getBoolean("rootMode", false)) + { + String muteCommand; + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.LOLLIPOP) + muteCommand = "input keyevent 79"; + else + muteCommand = "input keyevent 91"; + try { + Runtime.getRuntime().exec(new String[] {"su", "-c", muteCommand}); + } catch (IOException e) { + e.printStackTrace(); + } + } + else + { + AudioManager audioManager = (AudioManager) getCallModule().getService().getSystemService(Context.AUDIO_SERVICE); + audioManager.setMicrophoneMute(microphoneMuted); + } + + getCallModule().updatePebble(); + } + + @Override + public int getIcon() + { + return microphoneMuted ? 
CallAction.ICON_BUTTON_MIC_OFF : CallAction.ICON_BUTTON_MIC_ON; + } + + public static ToggleMicrophoneAction get(CallModule callModule) + { + return (ToggleMicrophoneAction) callModule.getCallAction(TOGGLE_MICROPHONE_ACTION_ID); + } +} +app/src/main/java/com/matejdro/pebbledialer/callactions/ToggleRingerAction.java +public class ToggleRingerAction extends CallAction +{ + public static final int TOGGLE_RINGER_ACTION_ID = 2; + + private boolean isMutedViaAudioManager = false; + private int prevRingerMode = AudioManager.RINGER_MODE_NORMAL; + + public ToggleRingerAction(CallModule callModule) + { + super(callModule); + } + + @Override + public void executeAction() + { + if (getCallModule().getCallState() != CallModule.CallState.RINGING) + return; + + AudioManager audioManager = (AudioManager) getCallModule().getService().getSystemService(Context.AUDIO_SERVICE); + + getCallModule().setVibration(false); + + if (!isMutedViaAudioManager) + { + if (getCallModule().getService().getGlobalSettings().getBoolean("rootMode", false)) + { + Timber.d("Muting using root method..."); + try { + Runtime.getRuntime().exec(new String[] {"su", "-c", "input keyevent " + KeyEvent.KEYCODE_VOLUME_DOWN}); + } catch (IOException e) { + e.printStackTrace(); + } + + } + else if (canMuteRinger(getCallModule().getService())) + { + isMutedViaAudioManager = true; + prevRingerMode = audioManager.getRingerMode(); + + audioManager.setStreamSolo(AudioManager.STREAM_MUSIC, true); + audioManager.setRingerMode(AudioManager.RINGER_MODE_SILENT); + } + } + else if (canMuteRinger(getCallModule().getService())) + { + isMutedViaAudioManager = false; + audioManager.setStreamSolo(AudioManager.STREAM_MUSIC, false); + audioManager.setRingerMode(prevRingerMode); + } + + getCallModule().updatePebble(); + } + + public void mute() + { + if (!isMutedViaAudioManager) + executeAction(); + } + + public static boolean canMuteRinger(Context context) + { + if (Build.VERSION.SDK_INT < Build.VERSION_CODES.M) + return true; + + NotificationManager notificationManager = (NotificationManager) context.getSystemService(Context.NOTIFICATION_SERVICE); + return notificationManager.isNotificationPolicyAccessGranted(); + } + + @Override + public void onCallEnd() + { if (isMutedViaAudioManager && canMuteRinger(getCallModule().getService())) + { + AudioManager audioManager = (AudioManager) getCallModule().getService().getSystemService(Context.AUDIO_SERVICE); + isMutedViaAudioManager = false; + audioManager.setStreamSolo(AudioManager.STREAM_MUSIC, false); + audioManager.setRingerMode(prevRingerMode); + } + + getCallModule().setVibration(true); + } + + @Override + public int getIcon() + { + return isMutedViaAudioManager ? 
CallAction.ICON_BUTTON_SPEAKER_OFF : CallAction.ICON_BUTTON_SPEKAER_ON; + } + + public static ToggleRingerAction get(CallModule callModule) + { + return (ToggleRingerAction) callModule.getCallAction(TOGGLE_RINGER_ACTION_ID); + } +} +app/src/main/java/com/matejdro/pebbledialer/callactions/AnswerCallWithSpeakerAction.java +public class AnswerCallWithSpeakerAction extends CallAction +{ + public static final int ANSWER_WITH_SPEAKER_ACTION_ID = 5; + + private boolean enableSpeakerImmediately = false; + + public AnswerCallWithSpeakerAction(CallModule callModule) + { + super(callModule); + } + + + @Override + public void executeAction() + { + if (getCallModule().getCallState() != CallModule.CallState.RINGING) + return; + + enableSpeakerImmediately = true; + AnswerCallAction.get(getCallModule()).executeAction(); + } + + @Override + public void onCallEnd() + { + enableSpeakerImmediately = false; //Reset intent (there will be new intent at next call) + } + + @Override + public void onPhoneOffhook() + { + if (enableSpeakerImmediately) + { + ToggleSpeakerAction speakerAction = ToggleSpeakerAction.get(getCallModule()); + + if (!speakerAction.isSpeakerphoneEnabled()) + speakerAction.executeAction(); + } + } + + @Override + public int getIcon() + { + return CallAction.ICON_BUTTON_ANSWER; + } + + public static AnswerCallWithSpeakerAction get(CallModule callModule) + { + return (AnswerCallWithSpeakerAction) callModule.getCallAction(ANSWER_WITH_SPEAKER_ACTION_ID); + } +} +app/src/main/java/com/matejdro/pebbledialer/callactions/VolumeDownAction.java +public class VolumeDownAction extends CallAction +{ + public static final int VOLUME_DOWN_ACTION_ID = 7; + + public VolumeDownAction(CallModule callModule) + { + super(callModule); + } + + @Override + public void executeAction() + { + if (getCallModule().getCallState() != CallModule.CallState.ESTABLISHED) + return; + + AudioManager audioManager = (AudioManager) getCallModule().getService().getSystemService(Context.AUDIO_SERVICE); + audioManager.adjustStreamVolume(AudioManager.STREAM_VOICE_CALL, AudioManager.ADJUST_LOWER, 0); + } + + + + @Override + public int getIcon() + { + return CallAction.ICON_BUTTON_VOLUME_DOWN; + } +} +app/src/main/java/com/matejdro/pebbledialer/callactions/CallAction.java +public abstract class CallAction +{ + public static final int ICON_BUTTON_ANSWER = 0; + public static final int ICON_BUTTON_END_CALL = 1; + public static final int ICON_BUTTON_MIC_ON = 2; + public static final int ICON_BUTTON_MIC_OFF = 3; + public static final int ICON_BUTTON_SPEKAER_ON = 4; + public static final int ICON_BUTTON_SPEAKER_OFF = 5; + public static final int ICON_BUTTON_VOLUME_DOWN = 6; + public static final int ICON_BUTTON_VOLUME_UP = 7; + public static final int ICON_BLANK = 0xFF; + + private CallModule callModule; + + public CallAction(CallModule callModule) + { + this.callModule = callModule; + } + + public CallModule getCallModule() + { + return callModule; + } + + public void onPhoneOffhook() + { + + } + + public void onCallRinging() + { + + } + + public void onCallEnd() + { + + } + + public abstract void executeAction(); + public abstract int getIcon(); +} +app/src/main/java/com/matejdro/pebbledialer/notifications/JellybeanNotificationListener.java +@TargetApi(value = Build.VERSION_CODES.JELLY_BEAN_MR2) +public class JellybeanNotificationListener extends NotificationListenerService { + private static JellybeanNotificationListener instance = null; + + @Override + public void onDestroy() { + Timber.d("Notification Listener stopped..."); + 
super.onDestroy(); + + instance = null; + } + + @Override + public void onCreate() { + Timber.d("Creating Notification Listener..."); + super.onCreate(); + + instance = this; + } + + public static boolean isActive() + { + return instance != null; + } + + @TargetApi(value = Build.VERSION_CODES.LOLLIPOP) + public static boolean isPhoneInDoNotInterrupt() + { + if (instance == null) + return false; + + int interruptionFilter = instance.getCurrentInterruptionFilter(); + Timber.d("Interrupt filter: %d", interruptionFilter); + return interruptionFilter != NotificationListenerService.INTERRUPTION_FILTER_ALL && interruptionFilter != 0; + } + + @Override + public void onNotificationPosted(final StatusBarNotification sbn) { + Timber.d("Got new jellybean notification"); + NotificationHandler.newNotification(JellybeanNotificationListener.this, sbn.getPackageName(), sbn.getNotification()); + + + } + + @Override + public void onNotificationRemoved(StatusBarNotification sbn) { + } +} +app/src/main/java/com/matejdro/pebbledialer/callactions/ToggleSpeakerAction.java +public class ToggleSpeakerAction extends CallAction +{ + public static final int TOGGLE_SPEAKER_ACTION_ID = 4; + + private boolean speakerphoneEnabled = false; + + public ToggleSpeakerAction(CallModule callModule) + { + super(callModule); + } + + @Override + public void executeAction() + { + if (getCallModule().getCallState() != CallModule.CallState.ESTABLISHED) + return; + + AudioManager audioManager = (AudioManager) getCallModule().getService().getSystemService(Context.AUDIO_SERVICE); + + speakerphoneEnabled = !speakerphoneEnabled; + audioManager.setSpeakerphoneOn(speakerphoneEnabled); + + getCallModule().updatePebble(); + } + + public boolean isSpeakerphoneEnabled() + { + return speakerphoneEnabled; + } + + private void updateSpeakerphoneEnabled() + { + AudioManager audioManager = (AudioManager) getCallModule().getService().getSystemService(Context.AUDIO_SERVICE); + speakerphoneEnabled = audioManager.isSpeakerphoneOn(); + } + + @Override + public void onPhoneOffhook() + { + updateSpeakerphoneEnabled(); + } + + @Override + public int getIcon() + { + return speakerphoneEnabled ? 
ICON_BUTTON_SPEKAER_ON : ICON_BUTTON_SPEAKER_OFF; + } + + public static ToggleSpeakerAction get(CallModule callModule) + { + return (ToggleSpeakerAction) callModule.getCallAction(TOGGLE_SPEAKER_ACTION_ID); + } +} +app/src/main/java/com/matejdro/pebbledialer/callactions/VolumeUpAction.java +public class VolumeUpAction extends CallAction +{ + public static final int VOLUME_UP_ACTION_ID = 8; + + public VolumeUpAction(CallModule callModule) + { + super(callModule); + } + + @Override + public void executeAction() + { + if (getCallModule().getCallState() != CallModule.CallState.ESTABLISHED) + return; + + AudioManager audioManager = (AudioManager) getCallModule().getService().getSystemService(Context.AUDIO_SERVICE); + audioManager.adjustStreamVolume(AudioManager.STREAM_VOICE_CALL, AudioManager.ADJUST_RAISE, 0); + } + + + + @Override + public int getIcon() + { + return CallAction.ICON_BUTTON_VOLUME_UP; + } +} +app/src/main/java/com/matejdro/pebbledialer/callactions/DummyAction.java +public class DummyAction extends CallAction +{ + public static final int DUMMY_ACTION_ID = 999; + + + public DummyAction(CallModule callModule) + { + super(callModule); + + } + + @Override + public void executeAction() + { + } + + @Override + public int getIcon() + { + return CallAction.ICON_BLANK; + } + + public static DummyAction get(CallModule callModule) + { + return (DummyAction) callModule.getCallAction(DUMMY_ACTION_ID); + } +} +package com.matejdro.pebbledialer.modules; +import android.app.PendingIntent; +import android.content.Intent; +import android.content.SharedPreferences; +import android.database.Cursor; +import android.graphics.Bitmap; +import android.net.Uri; +import android.os.Build; +import android.provider.ContactsContract; +import android.provider.MediaStore; +import android.service.notification.NotificationListenerService; +import android.telephony.TelephonyManager; +import android.util.SparseArray; +import com.getpebble.android.kit.util.PebbleDictionary; +import com.matejdro.pebblecommons.pebble.CommModule; +import com.matejdro.pebblecommons.pebble.PebbleCommunication; +import com.matejdro.pebblecommons.pebble.PebbleImageToolkit; +import com.matejdro.pebblecommons.pebble.PebbleTalkerService; +import com.matejdro.pebblecommons.pebble.PebbleUtil; +import com.matejdro.pebblecommons.util.ContactUtils; +import com.matejdro.pebblecommons.util.Size; +import com.matejdro.pebblecommons.util.TextUtil; +import com.matejdro.pebblecommons.vibration.PebbleVibrationPattern; +import com.matejdro.pebbledialer.callactions.AnswerCallAction; +import com.matejdro.pebbledialer.callactions.AnswerCallWithSpeakerAction; +import com.matejdro.pebbledialer.callactions.CallAction; +import com.matejdro.pebbledialer.callactions.DummyAction; +import com.matejdro.pebbledialer.callactions.EndCallAction; +import com.matejdro.pebbledialer.callactions.SMSReplyAction; +import com.matejdro.pebbledialer.callactions.ToggleMicrophoneAction; +import com.matejdro.pebbledialer.callactions.ToggleRingerAction; +import com.matejdro.pebbledialer.callactions.ToggleSpeakerAction; +import com.matejdro.pebbledialer.callactions.VolumeDownAction; +import com.matejdro.pebbledialer.callactions.VolumeUpAction; +import com.matejdro.pebbledialer.notifications.JellybeanNotificationListener; +import java.io.IOException; +import java.util.Calendar; +import java.util.List; +import timber.log.Timber; + + + + + +public class CallModule extends CommModule +{ + public static final String INTENT_CALL_STATUS = "CallStatus"; + public static final String 
INTENT_ACTION_FROM_NOTIFICATION = "ActionFromNotification"; + + public static int MODULE_CALL = 1; + + private SparseArray actions = new SparseArray(); + + private boolean updateRequired; + private boolean identityUpdateRequired; + private boolean callerNameUpdateRequired; + private int callerImageNextByte = -1; + + private String number = "Outgoing Call"; + private String name = null; + private String type = null; + private Bitmap callerImage = null; + private byte[] callerImageBytes; + + private CallState callState = CallState.NO_CALL; + + private boolean vibrating; + private boolean closeAutomaticallyAfterThisCall = true; + + long callStartTime; + + public CallModule(PebbleTalkerService service) + { + super(service); + + service.registerIntent(INTENT_CALL_STATUS, this); + service.registerIntent(INTENT_ACTION_FROM_NOTIFICATION, this); + + registerCallAction(new AnswerCallAction(this), AnswerCallAction.ANSWER_ACTION_ID); + registerCallAction(new EndCallAction(this), EndCallAction.END_CALL_ACTION_ID);Next line of code: diff --git a/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_39.txt b/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_39.txt new file mode 100644 index 00000000000..7ab63eb828d --- /dev/null +++ b/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_39.txt @@ -0,0 +1 @@ +(CNN)Mark Ronson's "Uptown Funk!," featuring Bruno Mars, is the longest-leading Billboard Hot 100 of the 2010s, ruling the chart for a 13th week. It's also just the 10th single in the Hot 100's entire history to spend at least 13 weeks at No. 1. Plus, newcomer Natalie La Rose reaches the top 10 with her debut hit "Somebody," featuring Jeremih. As we do each Wednesday, let's run down all the songs in the top 10, and a bit beyond, on the sales/airplay/streaming-based Hot 100 (dated April 11). "Funk," released on RCA Records, passes Robin Thicke's "Blurred Lines," featuring T.I. and Pharrell to take sole possession of the Hot 100's longest command this decade. Here's an updated look at the hits to lead for the most weeks since the beginning of 2010: . Weeks at No. 1, Title, Artist, Date Reached No. 1 . 13 (to date), "Uptown Funk!," Ronson feat. Mars, Jan. 17, 2015 . 12, "Blurred Lines," Robin Thicke feat. T.I. + Pharrell, June 22, 2013 . 10, "Happy," Pharrell Williams, March 8, 2014 . 10, "We Found Love," Rihanna feat. Calvin Harris, Nov. 12, 2011 . "Funk" also becomes one of an elite 10 singles ever to top the Hot 100 for at least 13 weeks, dating to the chart's Aug. 4, 1958 launch: . Weeks at No. 1, Title, Artist, Date Reached No. 1 . 16, "One Sweet Day," Mariah Carey & Boyz II Men, Dec. 2, 1995 . 14, "I Gotta Feeling," The Black Eyed Peas, July 11, 2009 . 14, "We Belong Together," Mariah Carey, June 4, 2005 . 14, "Candle in the Wind 1997"/"Something About the Way You Look Tonight," Elton John, Oct. 11, 1997 . 14, "Macarena (Bayside Boys Mix)," Los Del Rio, Aug. 3, 1996 . 14, "I'll Make Love to You," Boyz II Men, Aug. 27, 1994 . 14, "I Will Always Love You," Whitney Houston, Nov. 28, 1992 . 13 (to date), "Uptown Funk!," Ronson feat. Mars, Jan. 17, 2015 . 13, "The Boy Is Mine," Brandy & Monica, June 6, 1998 . 13, "End of the Road," Boyz II Men, Aug. 15, 1992 . Ask Billboard: Will 'Uptown Funk!' be the Hot 100's No. 1 Song of 2015? With "Funk" now just three weeks from potentially tying "One Sweet Day" for the record, and four weeks from possibly claiming it all to itself, can it rewrite Hot 100 history? 
It's too early to forecast charts a month away, but "Funk" still sports strong leads in all main Hot 100 metrics. "Funk" logs a 13th week atop the Digital Songs chart with 165,000 downloads sold (down 12 percent) in the week ending March 29, according to Nielsen Music. That's a record-tying feat: "Funk!" matches Flo Rida's 2007-08 hit "Low," featuring T-Pain, for the most weeks a title has spent at No. 1 on Digital Songs. "Funk" also leads Streaming Songs (16.2 million U.S. streams, down 15 percent) for an 11th week. On Radio Songs, "Funk" reigns for a 10th week with 166 million in all-format audience (down 4 percent). It's the first song to reach double-digit weeks at No. 1 on Radio Songs since "Blurred Lines" led for 11. Ask Billboard: Will 'Uptown Funk!' Be the Hot 100's No. 1 Song of 2015? "Funk," thus, leads the Hot 100 and its three main component charts (Digital Songs, Radio Songs and Streaming Songs) simultaneously for a record-extending ninth week (nonconsecutively). Perhaps helping the chances that "Funk" can remain at No. 1 on the Hot 100, at least for another week: while it's down by 11 percent in overall activity, the No. 2 song (for a third week), Maroon 5's "Sugar," decreases by 3 percent, while Ed Sheeran's "Thinking Out Loud," at No. 3 (for a third week, after peaking at No. 2 for eight weeks), is off by 2 percent. And, the lead of "Funk" over those songs is still significant: they each boast approximately two-thirds of the Hot 100 points of "Funk" this week. Could either "Sugar" or "Loud" rebound to challenge "Funk" further on the Hot 100? Could another song in the top 10 topple it? Or, is it a song just building, or not even yet released, that will take over? Again, it's too soon to tell. We know only that a song will eventually dethrone the uncommonly overarching smash that "Funk" has become. (At least we think one will ...) Chart Highlights: Taylor Swift's 'Style' hits No. 1 on adult pop songs . Meanwhile, "Sugar" takes over at No. 1 on the subscription services-based On-Demand Songs chart, despite a 10 percent drop to 4.1 million streams. ("Funk" falls to No. 3 on the list after 11 weeks at No. 1.) "Sugar" holds at No. 2 on Digital Songs (143,000, down 8 percent); rises 4-2 on Radio Songs (133 million, up 3 percent); and keeps at No. 4 on Streaming Songs (9.5 million, down 4 percent). Below Sheeran, Ellie Goulding's "Love Me Like You Do" holds at No. 4 on the Hot 100 after reaching No. 3. The Fifty Shades of Grey soundtrack single dips 3-4 on Digital Songs (114,000, down 14 percent) and stays at No. 5 on Radio Songs (118 million, up 8 percent) and Streaming Songs (9 million, up 9 percent). From the same hit movie, The Weeknd's "Earned It (Fifty Shades of Grey)" reaches the Hot 100's top five (6-5), adding top Airplay Gainer honors for a second week. On Radio Songs, it pushes 9-6 with a 23 percent gain to 86 million. "Earned" (a possible contender for No. 1 on the Hot 100 ...) holds at No. 6 on both Streaming Songs (8.9 million, up 24 percent) and Digital Songs (107,000, up 4 percent). The sultry track also takes over at No. 1 on Billboard's Hot R&B/Hip-Hop Songs chart. Chart Highlights: Taylor Swift's 'Style' Hits No. 1 on Adult Pop Songs . Fetty Wap's "Trap Queen" rises 8-6 on the Hot 100, while spending a second week at No. 1 on Hot Rap Songs; Taylor Swift's "Style" ranks at No. 7 on the Hot 100 for a third week after reaching No. 6 (and, as previously reported, reaches No. 
1 on the Adult Pop Songs airplay chart); Rihanna, Kanye West and Paul McCartney's "FourFiveSeconds" drops 5-8 on the Hot 100 after climbing to No. 4 (and departs the Hot R&B/Hip-Hop Songs summit after seven weeks); and Flo Rida climbs 10-9 with "G.D.F.R.," featuring Sage the Gemini and Lookas. The rapper's new EP, My House, arrives Tuesday (April 7). One song is new to the Hot 100's top 10: La Rose's "Somebody," featuring Jeremih (13-10). The Dutch singer's debut hit lifts 10-8 on Radio Songs (73 million, up 9 percent); backtracks 13-14 on Digital Songs, but with a 7 percent gain to 68,000; and zooms 31-19 on Streaming Songs (4.3 million, up 5 percent). The track tops the Rhythmic Songs airplay chart for a second week. (Jeremih scores his fourth Hot 100 top 10, and first in a featured role.) La Rose is adjacent to her friend, and mentor, Flo Rida, on the Hot 100. After she had introduced herself to him at a party, they soon began working together, and she started touring with him. They created "Somebody," based on Whitney Houston's 1987 Hot 100 No. 1 "I Wanna Dance With Somebody (Who Loves Me)," as La Rose is a "huge fan of '80s music," as she told Billboard. She's currently recording her debut EP. Just beyond the Hot 100's top 10, Walk the Moon's "Shut Up and Dance" pushes 15-12, and is the new No. 1 on the Hot Rock Songs chart, while Jason Derulo's "Want to Want Me" bounds 27-17. And, Rihanna roars in at No. 23 with "B**** Better Have My Money," the chart's highest debut, powered largely by its No. 5 debut on Digital Songs (108,000 sold since its digital retail arrival on March 26). More details on action below the top 10 in the weekly "Hot 100 Chart Moves" column to post on Friday (April 3). See the original story at Billboard.com. ©2015 Billboard. All Rights Reserved. 
\ No newline at end of file diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py index bb5acf261bf..0919d1349c7 100644 --- a/python/llm/dev/benchmark/all-in-one/run.py +++ b/python/llm/dev/benchmark/all-in-one/run.py @@ -22,6 +22,7 @@ import traceback import threading import csv +import warnings import numpy as np from datetime import date @@ -29,6 +30,8 @@ import os current_dir = os.path.dirname(os.path.realpath(__file__)) import sys +sys.stdout.reconfigure(encoding='utf-8') + from ipex_llm.utils import BenchmarkWrapper from ipex_llm.utils.common.log4Error import invalidInputError from ipex_llm.utils.common import invalidInputError @@ -46,6 +49,8 @@ QWENVL_IDS = ['Qwen/Qwen-VL-Chat'] +MINICPM_V_IDS = ['openbmb/MiniCPM-V-2_6', 'openbmb/MiniCPM-Llama3-V-2_5'] + results = [] excludes = [] @@ -53,8 +58,8 @@ def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, in for i in range(num_trials + warm_up): st = time.perf_counter() if lookahead: - output_ids = model.generate(input_ids, lookahead=3, do_sample=False, max_matching_ngram_size=2, max_new_tokens=out_len, - min_new_tokens=out_len, num_beams=num_beams) + output_ids = model.generate(input_ids, lookahead=2, do_sample=False, max_matching_ngram_size=2, max_new_tokens=out_len, + min_new_tokens=out_len, num_beams=num_beams) else: output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len, min_new_tokens=out_len, num_beams=num_beams) @@ -67,38 +72,46 @@ def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, in torch.xpu.empty_cache() actual_out_len = output_ids.shape[1] - actual_in_len if i >= warm_up: - if lookahead: - result[in_out].append([model.first_token_time, (end - st - model.first_token_time)/model.n_token_generated, 0, + if lookahead or os.environ.get("IPEX_LLM_PERFORMANCE_MODE", None) == "1": + result[in_out].append([model.first_token_time, (end - st - model.first_token_time)/(model.n_token_generated - 1), 0, actual_in_len, actual_out_len, load_time, 0]) else: result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time, actual_in_len, actual_out_len, load_time, model.peak_memory]) -def get_continuation_input_str(in_len): - # in_len.txt maybe shorter than we need, - # use much longer context to make sure input length - test_length = min(in_len*2, 8192) - while test_length not in [32, 256, 1024, 2048, 8192] and test_length < 8192: - test_length = test_length * 2 - # Force the test_length to be 8192, such that we can use 8192.txt - if test_length > 8192: - test_length = 8192 - return open(f"prompt/continuation/{test_length}.txt", 'r').read() +def get_continuation_input_str(in_len, tokenizer=None): + # keep 'utf-8' as character encoding mode + if tokenizer is not None: + if in_len > 128 and in_len <= 4096: + if in_len <= 2048: + input_str = open("prompt/continuation/longbench_2k.txt", 'r', encoding='utf-8').read() + else: + input_str = open("prompt/continuation/longbench_4k.txt", 'r', encoding='utf-8').read() + + input_ids = tokenizer.encode(input_str, return_tensors="pt") + if input_ids.shape[1] < in_len: + return open(f"prompt/continuation/8192.txt", 'r', encoding='utf-8').read() + else: + half_idx = in_len // 2 + input_ids_truncated = torch.cat((input_ids[:, :half_idx], input_ids[:, -(in_len-half_idx):]), dim=1) + return tokenizer.batch_decode(input_ids_truncated)[0] + + return open(f"prompt/continuation/8192.txt", 'r', encoding='utf-8').read() def preprocess_prompt(tokenizer, in_len, task): if task == 
'summarize': if in_len == 512: - input_str = open(f"prompt/summarize/cnn_239.txt", 'r').read() + input_str = open(f"prompt/summarize/cnn_239.txt", 'r', encoding='utf-8').read() elif in_len == 1024: - input_str = open(f"prompt/summarize/cnn_615.txt", 'r').read() + input_str = open(f"prompt/summarize/cnn_615.txt", 'r', encoding='utf-8').read() elif in_len == 2048: - input_str = open(f"prompt/summarize/cnn_824.txt", 'r').read() + input_str = open(f"prompt/summarize/cnn_824.txt", 'r', encoding='utf-8').read() elif in_len <= 256: - input_str = open(f"prompt/summarize/cnn_64.txt", 'r').read() + input_str = open(f"prompt/summarize/cnn_64.txt", 'r', encoding='utf-8').read() else: - input_str = open(f"prompt/summarize/cnn_5618.txt", 'r').read() + input_str = open(f"prompt/summarize/cnn_5618.txt", 'r', encoding='utf-8').read() question = "Can you please summarize this article?" prompt_format = "[INST] Article:```{}``` \n\n Question: {} \n\n [/INST]" special_tokens_len = len(tokenizer.encode(prompt_format.format("", question), add_special_tokens=False)) @@ -111,19 +124,19 @@ def preprocess_prompt(tokenizer, in_len, task): input_ids = tokenizer.encode(final_prompt, return_tensors="pt", truncation=True, max_length=in_len) elif task == 'QA': if in_len == 512: - input_str = open(f"prompt/QA/orca_776.txt", 'r').read() + input_str = open(f"prompt/QA/orca_776.txt", 'r', encoding='utf-8').read() elif in_len == 1024: - input_str = open(f"prompt/QA/orca_99.txt", 'r').read() + input_str = open(f"prompt/QA/orca_99.txt", 'r', encoding='utf-8').read() elif in_len == 2048: - input_str = open(f"prompt/QA/orca_401.txt", 'r').read() + input_str = open(f"prompt/QA/orca_401.txt", 'r', encoding='utf-8').read() elif in_len == 4096: - input_str = open(f"prompt/QA/orca_497.txt", 'r').read() + input_str = open(f"prompt/QA/orca_497.txt", 'r', encoding='utf-8').read() else: raise ValueError("No corresponding prompt available now, will be added later.") input_ids = tokenizer.encode(input_str, return_tensors="pt") return input_ids -def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False, batch_size=1, streaming=False, use_fp16_torch_dtype=False, lookahead=False, task='continuation'): +def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False, batch_size=1, streaming=False, use_fp16_torch_dtype=False, lookahead=False, task='continuation', optimize_model=False): # TODO: make a parameter result= {} if test_api == 'transformer_int4': @@ -175,7 +188,7 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, elif test_api == 'pipeline_parallel_gpu': result = run_pipeline_parallel_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size, cpu_embedding, fp16=use_fp16_torch_dtype) elif test_api == 'transformers_int4_npu_win': - result = transformers_int4_npu_win(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size) + result = transformers_int4_npu_win(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size, optimize_model) else: invalidInputError(False, "Unknown test_api " + test_api + ", please check your config.yaml.") @@ -287,7 +300,7 @@ def run_transformer_int4(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = 
get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -346,7 +359,7 @@ def run_pytorch_autocast_bf16(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -411,7 +424,7 @@ def run_optimize_model(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -468,13 +481,15 @@ def run_transformer_int4_gpu(repo_id, else: model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True, trust_remote_code=True, use_cache=True, - torch_dtype=torch_dtype).eval() + cpu_embedding=cpu_embedding, torch_dtype=torch_dtype).eval() tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, cpu_embedding=cpu_embedding) + model = model.to('xpu') elif origin_repo_id in LLAMA_IDS: model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding, torch_dtype=torch_dtype).eval() tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True) + model = model.to('xpu') elif origin_repo_id in PHI3VISION_IDS: model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit, _attn_implementation="eager", @@ -482,6 +497,15 @@ def run_transformer_int4_gpu(repo_id, trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding, torch_dtype=torch_dtype).eval() tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + model = model.to('xpu') + elif origin_repo_id in MINICPM_V_IDS: + model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True, + modules_to_not_convert=["vpm", "resampler"], + trust_remote_code=True, use_cache=True, + cpu_embedding=cpu_embedding, torch_dtype=torch_dtype).eval() + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + model = model.to('xpu') + model = model.llm else: if "4bit" in repo_id: model = AutoModelForCausalLM.load_low_bit(model_path, optimize_model=True, @@ -503,14 +527,13 @@ def run_transformer_int4_gpu(repo_id, cpu_embedding=cpu_embedding, torch_dtype=torch_dtype).eval() tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - - model = model.to('xpu') + model = model.to('xpu') end = time.perf_counter() load_time = end - st print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3))) - if not lookahead: + if not lookahead and os.environ.get("IPEX_LLM_PERFORMANCE_MODE", None) != "1": model = BenchmarkWrapper(model) result = {} @@ -520,7 +543,7 @@ def run_transformer_int4_gpu(repo_id, in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) if task == 'continuation': - input_str = get_continuation_input_str(in_len) + input_str 
= get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -532,6 +555,15 @@ def run_transformer_int4_gpu(repo_id, input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu') actual_in_len = input_ids.shape[1] result[in_out] = [] + if not lookahead and os.environ.get("IPEX_LLM_PERFORMANCE_MODE", None) == "1": + from ipex_llm.transformers.lookup import PERFORMANCE_MODE_LOOKUP_INPUT_THRESHOLD + if actual_in_len < PERFORMANCE_MODE_LOOKUP_INPUT_THRESHOLD: + warnings.warn( + "All-in-one benchmark currently does not support IPEX_LLM_PERFORMANCE_MODE " + f"with actual input token length < {PERFORMANCE_MODE_LOOKUP_INPUT_THRESHOLD}. " + f"Skip benchmarking in-out pair {in_out} for model {repo_id}." + ) + continue thread = threading.Thread(target=run_model_in_thread, args=(model, in_out, tokenizer, result, warm_up, num_beams, input_ids, out_len, actual_in_len, num_trials, load_time, lookahead)) thread.start() thread.join() @@ -546,7 +578,7 @@ def run_transformer_int4_gpu(repo_id, peak_mem = result[in_out][-1][6] streaming = 'N/A' use_fp16_torch_dtype = 'N/A' - with open(csv_name, mode='a', newline='') as file: + with open(csv_name, mode='a', newline='', encoding='utf-8') as file: csv_writer = csv.writer(file) file.seek(0, os.SEEK_END) global line_counter @@ -571,24 +603,31 @@ def transformers_int4_npu_win(repo_id, num_trials, num_beams, low_bit, - batch_size): + batch_size, + optimize_model): from ipex_llm.transformers.npu_model import AutoModel, AutoModelForCausalLM from transformers import AutoTokenizer, LlamaTokenizer model_path = get_model_path(repo_id, local_model_hub) + in_out_len = in_out_pairs[0].split("-") + max_output_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024) # Load model in 4 bit, # which convert the relevant layers in the model into INT4 format st = time.perf_counter() if repo_id in CHATGLM_IDS: - model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype='auto').eval() + model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, + optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=True, + torch_dtype=torch.float16, attn_implementation="eager").eval() tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) elif repo_id in LLAMA_IDS: - model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, - use_cache=True).eval() + model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16, + optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=True, + use_cache=True, attn_implementation="eager").eval() tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True) else: - model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, - use_cache=True).eval() + model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16, + optimize_model=optimize_model, max_output_len=max_output_len, max_prompt_len=int(in_out_len[0]), transpose_value_cache=True, + use_cache=True, attn_implementation="eager").eval() tokenizer = 
AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) end = time.perf_counter() load_time = end - st @@ -602,7 +641,7 @@ def transformers_int4_npu_win(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -610,6 +649,7 @@ def transformers_int4_npu_win(repo_id, true_str = tokenizer.batch_decode(input_ids)[0] input_list = [true_str] * batch_size input_ids = tokenizer(input_list, return_tensors="pt").input_ids + input_ids = input_ids[:, :in_len] actual_in_len = input_ids.shape[1] result[in_out] = [] for i in range(num_trials + warm_up): @@ -673,7 +713,7 @@ def run_optimize_model_gpu(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -738,7 +778,7 @@ def run_ipex_fp16_gpu(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -809,7 +849,7 @@ def run_bigdl_fp16_gpu(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -895,7 +935,7 @@ def run_deepspeed_transformer_int4_cpu(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. 
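# For reference: when a tokenizer is supplied, get_continuation_input_str() above builds the
# prompt for mid-sized in_len values by keeping the head and tail of the tokenized source text,
# roughly:
#     half = in_len // 2
#     ids = torch.cat((input_ids[:, :half], input_ids[:, -(in_len - half):]), dim=1)
#     input_str = tokenizer.batch_decode(ids)[0]
# Decoding and re-encoding is not guaranteed to round-trip to exactly in_len tokens, which is
# why this function additionally clamps with input_ids[:, :in_len] a few lines below.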
input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -969,6 +1009,13 @@ def run_transformer_int4_gpu_win(repo_id, trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval() tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) model = model.to('xpu') + elif repo_id in MINICPM_V_IDS: + model = AutoModel.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit, + modules_to_not_convert=["vpm", "resampler"], + trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval() + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + model = model.to('xpu') + model = model.llm else: model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit, trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval() @@ -988,7 +1035,7 @@ def run_transformer_int4_gpu_win(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1085,6 +1132,14 @@ def run_transformer_int4_fp16_gpu_win(repo_id, torch_dtype=torch.float16).eval() tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) model = model.to('xpu') + elif repo_id in MINICPM_V_IDS: + model = AutoModel.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit, + modules_to_not_convert=["vpm", "resampler"], + trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding, + torch_dtype=torch.float16).eval() + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + model = model.to('xpu') + model = model.llm else: model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit, trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding, @@ -1105,7 +1160,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1215,7 +1270,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1325,7 +1380,7 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. 
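# Note on the MINICPM_V_IDS branch added above: modules_to_not_convert=["vpm", "resampler"]
# keeps the vision module and the resampler in their original precision (only the remaining
# language-model layers get the low-bit conversion), and `model = model.llm` then points the
# benchmark at the underlying LLM, since these text-only in/out pairs never exercise the vision path.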
input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1408,7 +1463,7 @@ def run_transformer_autocast_bf16( repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1473,7 +1528,7 @@ def run_bigdl_ipex_bf16(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1537,7 +1592,7 @@ def run_bigdl_ipex_int4(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1601,7 +1656,7 @@ def run_bigdl_ipex_int8(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1705,7 +1760,7 @@ def get_int_from_env(env_keys, default): in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1776,7 +1831,7 @@ def run_speculative_cpu(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1848,7 +1903,7 @@ def run_speculative_gpu(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1929,7 +1984,7 @@ def run_pipeline_parallel_gpu(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = get_continuation_input_str(in_len) + input_str = get_continuation_input_str(in_len, tokenizer) # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. 
input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1968,18 +2023,22 @@ def run_pipeline_parallel_gpu(repo_id, streaming = False use_fp16_torch_dtype = False task = 'continuation' + optimize_model = False # only for transformers_int4_npu_win if 'streaming' in conf: streaming = conf['streaming'] if 'use_fp16_torch_dtype' in conf: use_fp16_torch_dtype = conf['use_fp16_torch_dtype'] if 'task' in conf: task = conf['task'] + if 'optimize_model' in conf: + optimize_model = conf['optimize_model'] lookahead = False import pandas as pd for api in conf.test_api: global csv_name csv_name = f'{current_dir}/{api}-results-{today}.csv' + try: line_counter = len(open(csv_name).readlines()) except: @@ -1991,6 +2050,9 @@ def run_pipeline_parallel_gpu(repo_id, for batch_size in batch_list: for model in conf.repo_id: in_out_pairs = conf['in_out_pairs'].copy() + print("-------------------- Start running batch_size: {} --------------------".format(batch_size)) + print("-------------------- Start running model: {} --------------------".format(model)) + print("--------------------in_out_pairs: {}--------------------".format(in_out_pairs)) if excludes: for in_out in conf['in_out_pairs']: model_id_input = model + ':' + in_out.split('-')[0] @@ -2000,24 +2062,31 @@ def run_pipeline_parallel_gpu(repo_id, if task in ['QA', 'summarize'] and conf['num_beams'] == 1 and batch_size == 1: lookahead = True run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'], - conf['low_bit'], conf['cpu_embedding'], batch_size, streaming, use_fp16_torch_dtype, lookahead, task) + conf['low_bit'], conf['cpu_embedding'], batch_size, streaming, use_fp16_torch_dtype, lookahead, task, optimize_model) + print("-------------------- Finish running model: {} --------------------".format(model)) df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)', 'input/output tokens', 'batch_size', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding', 'model loading time (s)', 'peak mem (GB)', 'streaming', 'use_fp16_torch_dtype']) + print("-------------------- Results df:--------------------") + print(df) + print("-------------------- Results: {} --------------------".format(results)) + print("-------------------- csv_name: {} --------------------".format(csv_name)) + print(conf) if "pipeline" in api or "deepspeed" in api: if torch.distributed.get_rank() == 0: df.index += max(line_counter - 1, 0) if line_counter == 0: - df.to_csv(csv_name, mode='a') + df.to_csv(csv_name, mode='a', encoding='utf-8') else: - df.to_csv(csv_name, mode='a', header=None) + df.to_csv(csv_name, mode='a', header=None, encoding='utf-8') line_counter += len(df.index) else: df.index += max(line_counter - 1, 0) if api not in ["transformer_int4_gpu", "transformer_int4_fp16_gpu"]: if line_counter == 0: - df.to_csv(csv_name, mode='a') + df.to_csv(csv_name, mode='a', encoding='utf-8') else: - df.to_csv(csv_name, mode='a', header=None) + df.to_csv(csv_name, mode='a', header=None, encoding='utf-8') line_counter += len(df.index) results = [] + diff --git a/python/llm/dev/benchmark/ceval/README.md b/python/llm/dev/benchmark/ceval/README.md index 03e835674d1..97771ba5ed3 100644 --- a/python/llm/dev/benchmark/ceval/README.md +++ b/python/llm/dev/benchmark/ceval/README.md @@ -18,6 +18,7 @@ bash run.sh ``` + `run.sh` ```shell +export IPEX_LLM_LAST_LM_HEAD=0 python eval.py \ --model_path "path to model" \ --eval_type validation \ diff --git 
a/python/llm/dev/benchmark/ceval/run.sh b/python/llm/dev/benchmark/ceval/run.sh index 1a4b92ef934..19a4b457fe6 100644 --- a/python/llm/dev/benchmark/ceval/run.sh +++ b/python/llm/dev/benchmark/ceval/run.sh @@ -1,3 +1,5 @@ +export IPEX_LLM_LAST_LM_HEAD=0 + python eval.py \ --model_path "path to model" \ --eval_type validation \ diff --git a/python/llm/dev/benchmark/harness/README.md b/python/llm/dev/benchmark/harness/README.md index 50ec4b86f30..8f0d775cd85 100644 --- a/python/llm/dev/benchmark/harness/README.md +++ b/python/llm/dev/benchmark/harness/README.md @@ -15,15 +15,21 @@ pip install -e . run `python run_llb.py`. `run_llb.py` combines some arguments in `main.py` to make evaluations easier. The mapping of arguments is defined as a dict in [`llb.py`](llb.py). ### Evaluation on CPU -```python +```bash +export IPEX_LLM_LAST_LM_HEAD=0 + python run_llb.py --model ipex-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device cpu --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache ``` ### Evaluation on Intel GPU -```python +```bash +export IPEX_LLM_LAST_LM_HEAD=0 + python run_llb.py --model ipex-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache ``` ### Evaluation using multiple Intel GPU -```python +```bash +export IPEX_LLM_LAST_LM_HEAD=0 + python run_multi_llb.py --model ipex-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device xpu:0,2,3 --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache ``` Taking example above, the script will fork 3 processes, each for one xpu, to execute the tasks. diff --git a/python/llm/dev/benchmark/perplexity/README.md b/python/llm/dev/benchmark/perplexity/README.md index 870c8dd8639..638bdab335b 100644 --- a/python/llm/dev/benchmark/perplexity/README.md +++ b/python/llm/dev/benchmark/perplexity/README.md @@ -1,27 +1,36 @@ # Perplexity Perplexity (PPL) is one of the most common metrics for evaluating language models. This benchmark implementation is adapted from [transformers/perplexity](https://huggingface.co/docs/transformers/perplexity#perplexity-of-fixed-length-models) and [benchmark_patch_llm.py](https://github.com/insuhan/hyper-attn/blob/main/benchmark_patch_llm.py) -## Run on Wikitext - -Download the dataset from [here](https://paperswithcode.com/dataset/wikitext-2), unzip it and we will use the test dataset `wiki.test.raw` for evaluation. - +## Environment Preparation ```bash -python run_wikitext.py --model_path meta-llama/Meta-Llama-3-8B/ --data_path wikitext-2-raw-v1/wikitext-2-raw/wiki.test.raw --precision sym_int4 --use-cache --device xpu +pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +pip install datasets +``` +This is a required step on Linux for APT or offline installed oneAPI. Skip this step for PIP-installed oneAPI. -# Run with stride -python run_wikitext.py --model_path meta-llama/Meta-Llama-3-8B/ --data_path wikitext-2-raw-v1/wikitext-2-raw/wiki.test.raw --precision fp16 --device xpu --stride 512 +```bash +source /opt/intel/oneapi/setvars.sh ``` -## Run on [THUDM/LongBench](https://github.com/THUDM/LongBench) dataset +Please set IPEX_LLM_LAST_LM_HEAD=0 to disable the last_lm_head optimization. +```bash +export IPEX_LLM_LAST_LM_HEAD=0 +``` +## PPL Evaluation +### 1. 
Run on Wikitext +An example to run perplexity on [wikitext](https://paperswithcode.com/dataset/wikitext-2): ```bash -python run.py --model_path --precisions sym_int4 fp8 --device xpu --datasets dataset_names --dataset_path --language en +python run_wikitext.py --model_path meta-llama/Meta-Llama-3-8B --dataset path=wikitext,name=wikitext-2-raw-v1 --precision sym_int4 --device xpu --stride 512 --max_length 4096 ``` -A more specific example to run perplexity on Llama2-7B using the default English datasets: +### 2. Run on [THUDM/LongBench](https://github.com/THUDM/LongBench) dataset + +An example to run perplexity on chatglm3-6b using the default Chinese datasets("multifieldqa_zh", "dureader", "vcsum", "lsht", "passage_retrieval_zh") ```bash -python run.py --model_path meta-llama/Llama-2-7b-chat-hf --precisions float16 sym_int4 --device xpu --language en +python run_longbench.py --model_path THUDM/chatglm3-6b --precisions float16 sym_int4 --device xpu --language zh ``` + Notes: - If you want to test model perplexity on a few selected datasets from the `LongBench` dataset, please use the format below. ```bash diff --git a/python/llm/dev/benchmark/perplexity/run.py b/python/llm/dev/benchmark/perplexity/run_longbench.py similarity index 94% rename from python/llm/dev/benchmark/perplexity/run.py rename to python/llm/dev/benchmark/perplexity/run_longbench.py index 92b4999a8b4..c250d35f8d1 100644 --- a/python/llm/dev/benchmark/perplexity/run.py +++ b/python/llm/dev/benchmark/perplexity/run_longbench.py @@ -37,6 +37,7 @@ def get_arguments(): parser.add_argument("--dataset_path", required=False, type=str, default=None) parser.add_argument("--language", required=False, type=str, default="en", choices=['en', 'zh', 'all']) parser.add_argument("--precisions", required=False, type=str, default=None, nargs='+') + parser.add_argument("--mixed_precision", action="store_true") parser.add_argument("--device", type=str, default="xpu") parser.add_argument("--output_path", default=None) return parser.parse_args() @@ -95,11 +96,11 @@ def main(): log_dir = f"{output_path}/{model_name}/{args.device}/{precision}/{args.language}" os.makedirs(log_dir, exist_ok=True) results = {} - ppl_evaluator = BigDLPPL(model_path=args.model_path, device=args.device, **model_kwargs) + ppl_evaluator = BigDLPPL(model_path=args.model_path, device=args.device, mixed_precision=args.mixed_precision, **model_kwargs) ppl = ppl_evaluator.perplexity_hf(encoded_texts) summary[precision] = ppl results['results'] = ppl - results['config'] = {"model": model_name, "precision": precision, "device": args.device, "seq_len": args.seq_len, "language": args.language} + results['config'] = {"model": model_name, "precision": precision, "mixed_precision": args.mixed_precision, "device": args.device, "seq_len": args.seq_len, "language": args.language } dumped = json.dumps(results, indent=2) print(dumped) diff --git a/python/llm/dev/benchmark/perplexity/run_wikitext.py b/python/llm/dev/benchmark/perplexity/run_wikitext.py index 190d5114a86..061c87babb6 100644 --- a/python/llm/dev/benchmark/perplexity/run_wikitext.py +++ b/python/llm/dev/benchmark/perplexity/run_wikitext.py @@ -20,8 +20,7 @@ import argparse import torch from tqdm import tqdm -from datasets import concatenate_datasets, load_dataset -from ipex_llm.utils.common import invalidInputError +from datasets import load_dataset parser = argparse.ArgumentParser() @@ -34,18 +33,29 @@ parser.add_argument("--precision", type=str, default="sym_int4") parser.add_argument("--use-cache", action="store_true") 
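# The --stride / --max_length pair shown in the README command above (e.g. --stride 512
# --max_length 4096) corresponds to the standard sliding-window perplexity setup from the
# transformers/perplexity guide cited in the README: each scored window holds at most
# max_length tokens and the window start advances by stride tokens.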
parser.add_argument("--max_length", type=int, default=None) +parser.add_argument("--mixed_precision", action="store_true") args = parser.parse_args() if args.precision == "fp16": # ipex fp16 from transformers import AutoModelForCausalLM - if "xpu" in args.device: - import intel_extension_for_pytorch as ipex - model = AutoModelForCausalLM.from_pretrained(args.model_path, use_cache=args.use_cache, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(args.model_path, + use_cache=args.use_cache, + trust_remote_code=True) model = model.half() +elif 'gptq' in args.model_path.lower(): # ipex-llm gptq + from ipex_llm.transformers import AutoModelForCausalLM + model = AutoModelForCausalLM.from_pretrained(args.model_path, + load_in_4bit=True, + torch_dtype=torch.float, + use_cache=args.use_cache, + trust_remote_code=True) else: # ipex-llm from ipex_llm.transformers import AutoModelForCausalLM - model = AutoModelForCausalLM.from_pretrained(args.model_path, load_in_low_bit=args.precision, - use_cache=args.use_cache, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(args.model_path, + load_in_low_bit=args.precision, + use_cache=args.use_cache, + trust_remote_code=True, + mixed_precision=args.mixed_precision) model = model.half() model = model.to(args.device) model = model.eval() @@ -64,6 +74,7 @@ def parse_kwargs(kwstr): data = f.read() encodings = tokenizer(data.decode("utf-8").strip("\n"), return_tensors="pt") else: + from ipex_llm.utils.common import invalidInputError raise invalidInputError(False, "Must specify either dataset or datapath.") if not args.max_length: diff --git a/python/llm/dev/benchmark/whisper/README.md b/python/llm/dev/benchmark/whisper/README.md index 189435db9c2..d2e6ed84f75 100644 --- a/python/llm/dev/benchmark/whisper/README.md +++ b/python/llm/dev/benchmark/whisper/README.md @@ -10,6 +10,7 @@ pip install datasets evaluate soundfile librosa jiwer ## Run ```bash +export IPEX_LLM_LAST_LM_HEAD=0 python run_whisper.py --model_path /path/to/model --data_type other --device cpu ``` diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm-v-2_6/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm-v-2_6/README.md new file mode 100644 index 00000000000..4e0955b82d2 --- /dev/null +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm-v-2_6/README.md @@ -0,0 +1,101 @@ +# MiniCPM-V-2_6 +In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on MiniCPM-V-2_6 models. For illustration purposes, we utilize the [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) as a reference MiniCPM-V-2_6 model. + +## 0. Requirements +To run these examples with IPEX-LLM, we have some recommended requirements for your machine, please refer to [here](../README.md#recommended-requirements) for more information. + +## Example: Predict Tokens using `chat()` API +In the example [chat.py](./chat.py), we show a basic use case for a MiniCPM-V-2_6 model to predict the next N tokens using `chat()` API, with IPEX-LLM INT4 optimizations. +### 1. 
Install +We suggest using conda to manage the environment: + +On Linux: + +```bash +conda create -n llm python=3.11 +conda activate llm + +# install ipex-llm with 'all' option +pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu +pip install torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cpu +pip install transformers==4.40.0 trl +``` +On Windows: + +```cmd +conda create -n llm python=3.11 +conda activate llm + +pip install --pre --upgrade ipex-llm[all] +pip install torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cpu +pip install transformers==4.40.0 trl +``` + +### 2. Run + +- chat without streaming mode: + ``` + python ./chat.py --prompt 'What is in the image?' + ``` +- chat in streaming mode: + ``` + python ./chat.py --prompt 'What is in the image?' --stream + ``` + +> [!TIP] +> For chatting in streaming mode, it is recommended to set the environment variable `PYTHONUNBUFFERED=1`. + + +Arguments info: +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the MiniCPM-V-2_6 model (e.g. `openbmb/MiniCPM-V-2_6`) to be downloaded, or the path to the huggingface checkpoint folder. It defaults to `'openbmb/MiniCPM-V-2_6'`. +- `--image-url-or-path IMAGE_URL_OR_PATH`: argument defining the image used for inference. It defaults to `'http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg'`. +- `--prompt PROMPT`: argument defining the prompt used for inference (with the integrated prompt format for chat). It defaults to `'What is in the image?'`. +- `--stream`: flag to chat in streaming mode. + +> **Note**: When loading the model in 4-bit, IPEX-LLM converts linear layers in the model into INT4 format. In theory, an *X*B model saved in 16-bit requires approximately 2*X* GB of memory for loading, and ~0.5*X* GB of memory for further inference. For example, the roughly 8B MiniCPM-V-2_6 needs about 16 GB to load in 16-bit and ~4 GB for INT4 inference. +> +> Please select the appropriate size of the MiniCPM model based on the capabilities of your machine. + +#### 2.1 Client +On a client Windows machine, it is recommended to run directly with full utilization of all cores: +```cmd +python ./chat.py +``` + +#### 2.2 Server +For optimal performance on a server, it is recommended to set several environment variables (refer to [here](../README.md#best-known-configuration-on-linux) for more information) and to run the example with all the physical cores of a single socket. + +E.g., on Linux: +```bash +# set IPEX-LLM env variables +source ipex-llm-init + +# e.g. for a server with 48 cores per socket +export OMP_NUM_THREADS=48 +numactl -C 0-47 -m 0 python ./chat.py +``` + +#### 2.3 Sample Output +#### [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) +```log +Inference time: xxxx s +-------------------- Input Image -------------------- +http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg +-------------------- Input Prompt -------------------- +What is in the image? +-------------------- Chat Output -------------------- +The image features a young child holding a white teddy bear dressed in pink. The background includes some red flowers and what appears to be a stone wall. +``` + +```log +-------------------- Input Image -------------------- +http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg +-------------------- Input Prompt -------------------- +图片里有什么?
+-------------------- Stream Chat Output -------------------- +图片中有一个小女孩,她手里拿着一个穿着粉色裙子的白色小熊玩偶。背景中有红色花朵和石头结构,可能是一个花园或庭院。 +``` + +The sample input image is (which is fetched from [COCO dataset](https://cocodataset.org/#explore?id=264959)): + + diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm-v-2_6/chat.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm-v-2_6/chat.py new file mode 100644 index 00000000000..a6f44bd0ed3 --- /dev/null +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm-v-2_6/chat.py @@ -0,0 +1,100 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +import os +import time +import argparse +import requests +import torch +from PIL import Image +from ipex_llm.transformers import AutoModel +from transformers import AutoTokenizer + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Predict Tokens using `chat()` API for MiniCPM-V-2_6 model') + parser.add_argument('--repo-id-or-model-path', type=str, default="openbmb/MiniCPM-V-2_6", + help='The huggingface repo id for the MiniCPM-V-2_6 model to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--image-url-or-path', type=str, + default='http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg', + help='The URL or path to the image to infer') + parser.add_argument('--prompt', type=str, default="What is in the image?", + help='Prompt to infer') + parser.add_argument('--stream', action='store_true', + help='Whether to chat in streaming mode') + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + image_path = args.image_url_or_path + + # Load model in 4 bit, + # which convert the relevant layers in the model into INT4 format + model = AutoModel.from_pretrained(model_path, + load_in_low_bit="sym_int4", + optimize_model=True, + trust_remote_code=True, + use_cache=True, + torch_dtype=torch.float32, + modules_to_not_convert=["vpm", "resampler"]) + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + model.eval() + + query = args.prompt + if os.path.exists(image_path): + image = Image.open(image_path).convert('RGB') + else: + image = Image.open(requests.get(image_path, stream=True).raw).convert('RGB') + + # Generate predicted tokens + # here the prompt tuning refers to https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/README.md + msgs = [{'role': 'user', 'content': [image, args.prompt]}] + + if args.stream: + res = model.chat( + image=None, + msgs=msgs, + tokenizer=tokenizer, + stream=True + ) + + print('-'*20, 'Input Image', '-'*20) + print(image_path) + print('-'*20, 'Input Prompt', '-'*20) + print(args.prompt) + print('-'*20, 'Stream Chat Output', '-'*20) + for new_text in res: + print(new_text, flush=True, end='') + else: + st = time.time() + res = model.chat( + image=None, + msgs=msgs, + tokenizer=tokenizer, + ) + end = time.time() + + print(f'Inference time: {end-st} s') + 
print('-'*20, 'Input Image', '-'*20) + print(image_path) + print('-'*20, 'Input Prompt', '-'*20) + print(args.prompt) + print('-'*20, 'Chat Output', '-'*20) + print(res) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/README.md index 7dc3dedc5cb..7f5061eccd6 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/README.md @@ -20,6 +20,7 @@ conda activate llm # install the latest ipex-llm nightly build with 'all' option pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu +pip install "transformers<4.37.0" pip install accelerate tiktoken einops transformers_stream_generator==0.0.4 scipy torchvision pillow tensorboard matplotlib # additional package required for Qwen-VL-Chat to conduct generation ``` @@ -32,6 +33,7 @@ conda activate llm pip install --pre --upgrade ipex-llm[all] +pip install "transformers<4.37.0" pip install accelerate tiktoken einops transformers_stream_generator==0.0.4 scipy torchvision pillow tensorboard matplotlib ``` diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/README.md index cee06098d2d..992ea9ee10e 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/README.md @@ -22,6 +22,8 @@ conda activate llm # install the latest ipex-llm nightly build with 'all' option pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu + +pip install "transformers<4.37.0" pip install tiktoken einops transformers_stream_generator # additional package required for Qwen-7B-Chat to conduct generation ``` @@ -32,6 +34,8 @@ conda create -n llm python=3.11 conda activate llm pip install --pre --upgrade ipex-llm[all] + +pip install "transformers<4.37.0" pip install tiktoken einops transformers_stream_generator ``` diff --git a/python/llm/example/CPU/LlamaIndex/README.md b/python/llm/example/CPU/LlamaIndex/README.md index be50d92d02a..85c02d6433c 100644 --- a/python/llm/example/CPU/LlamaIndex/README.md +++ b/python/llm/example/CPU/LlamaIndex/README.md @@ -14,12 +14,16 @@ The RAG example ([rag.py](./rag.py)) is adapted from the [Official llama index R * **Install LlamaIndex Packages** ```bash - pip install llama-index-readers-file llama-index-vector-stores-postgres llama-index-embeddings-huggingface + pip install llama-index-llms-ipex-llm==0.1.8 + pip install llama-index-embeddings-ipex-llm==0.1.5 + pip install llama-index-readers-file==0.1.33 + pip install llama-index-vector-stores-postgres==0.1.14 + pip install pymupdf ``` - -* **Install IPEX-LLM** -Ensure `ipex-llm` is installed by following the [IPEX-LLM Installation Guide](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install.html) before proceeding with the examples provided here. - +> [!NOTE] +> - You could refer [llama-index-llms-ipex-llm](https://docs.llamaindex.ai/en/stable/examples/llm/ipex_llm/) and [llama-index-embeddings-ipex-llm](https://docs.llamaindex.ai/en/stable/examples/embeddings/ipex_llm/) for more information. +> - The installation of `llama-index-llms-ipex-llm` or `llama-index-embeddings-ipex-llm` will also install `IPEX-LLM` and its dependencies. 
+> - `IpexLLMEmbedding` currently only provides optimization for Hugging Face Bge models. * **Database Setup (using PostgreSQL)**: * Installation: diff --git a/python/llm/example/CPU/LlamaIndex/rag.py b/python/llm/example/CPU/LlamaIndex/rag.py index 5759c624558..617fca8aa94 100644 --- a/python/llm/example/CPU/LlamaIndex/rag.py +++ b/python/llm/example/CPU/LlamaIndex/rag.py @@ -16,7 +16,6 @@ import torch -from llama_index.embeddings.huggingface import HuggingFaceEmbedding from sqlalchemy import make_url from llama_index.vector_stores.postgres import PGVectorStore # from llama_index.llms.llama_cpp import LlamaCPP @@ -161,10 +160,11 @@ def messages_to_prompt(messages): return prompt def main(args): - embed_model = HuggingFaceEmbedding(model_name=args.embedding_model_path) + from llama_index.embeddings.ipex_llm import IpexLLMEmbedding + embed_model = IpexLLMEmbedding(model_name=args.embedding_model_path) # Use custom LLM in BigDL - from ipex_llm.llamaindex.llms import IpexLLM + from llama_index.llms.ipex_llm import IpexLLM llm = IpexLLM.from_model_id( model_name=args.model_path, tokenizer_name=args.tokenizer_path, diff --git a/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/README.md b/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/README.md index 25744465c26..f6f5f1ffe8e 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/README.md @@ -19,6 +19,8 @@ conda activate llm # install the latest ipex-llm nightly build with 'all' option pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu + +pip install "transformers<4.37.0" pip install accelerate tiktoken einops transformers_stream_generator==0.0.4 scipy torchvision pillow tensorboard matplotlib # additional package required for Qwen-VL-Chat to conduct generation ``` @@ -29,6 +31,8 @@ conda create -n llm python=3.11 conda activate llm pip install --pre --upgrade ipex-llm[all] + +pip install "transformers<4.37.0" pip install accelerate tiktoken einops transformers_stream_generator==0.0.4 scipy torchvision pillow tensorboard matplotlib ``` diff --git a/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/serving.py b/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/serving.py index 23de5fa1acc..3d8d4ca9ea3 100644 --- a/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/serving.py +++ b/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/serving.py @@ -116,11 +116,13 @@ def load_model(model_path, low_bit): # Use IPEX-LLM `optimize_model` to convert the model into optimized low bit format # Convert the rest of the model into float16 to reduce allreduce traffic model = optimize_model(model.module.to(f"cpu"), low_bit=low_bit).to(torch.float16) - + # Next, use XPU as accelerator to speed up inference current_accel = XPU_Accelerator() set_accelerator(current_accel) + model=model.eval() + # Move model back to xpu model = model.to(f"xpu:{local_rank}") model = BenchmarkWrapper(model) diff --git a/python/llm/example/GPU/HuggingFace/Advanced-Quantizations/GPTQ/generate.py b/python/llm/example/GPU/HuggingFace/Advanced-Quantizations/GPTQ/generate.py index c45963f59e7..50041d4173c 100644 --- a/python/llm/example/GPU/HuggingFace/Advanced-Quantizations/GPTQ/generate.py +++ b/python/llm/example/GPU/HuggingFace/Advanced-Quantizations/GPTQ/generate.py @@ -47,13 +47,10 @@ load_in_4bit=True, torch_dtype=torch.float, trust_remote_code=True,).to("xpu") - + # Load tokenizer - if "qwen" in model_path.lower(): - tokenizer = 
AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    else:
-        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    
     # Generate predicted tokens
     with torch.inference_mode():
         prompt = LLAMA2_PROMPT_FORMAT.format(prompt=args.prompt)
diff --git a/python/llm/example/GPU/HuggingFace/LLM/codegeex2/README.md b/python/llm/example/GPU/HuggingFace/LLM/codegeex2/README.md
index 37f801a28bf..1bcdef0f391 100644
--- a/python/llm/example/GPU/HuggingFace/LLM/codegeex2/README.md
+++ b/python/llm/example/GPU/HuggingFace/LLM/codegeex2/README.md
@@ -16,7 +16,6 @@ conda create -n llm python=3.11
 conda activate llm
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-pip install transformers==4.31.0
 ```
 
 #### 1.2 Installation on Windows
@@ -27,10 +26,20 @@ conda activate llm
 # below command will install intel_extension_for_pytorch==2.1.10+xpu as default
 pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-pip install transformers==4.31.0
 ```
 
-### 2. Configures OneAPI environment variables for Linux
+### 2. Download Model and Replace File
+If you select the codegeex2-6b model ([THUDM/codegeex2-6b](https://huggingface.co/THUDM/codegeex2-6b)), please note that its `tokenization_chatglm.py` initializes the tokenizer after calling `__init__` of its parent class, which may result in an error while loading the tokenizer. To address this issue, we provide an updated file ([tokenization_chatglm.py](./codegeex2-6b/tokenization_chatglm.py)) in which `SPTokenizer` is created before the parent `__init__` is called:
+
+```python
+def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
+    self.tokenizer = SPTokenizer(vocab_file)
+    super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
+```
+
+You could download the model from [THUDM/codegeex2-6b](https://huggingface.co/THUDM/codegeex2-6b) and replace its `tokenization_chatglm.py` with the provided [tokenization_chatglm.py](./codegeex2-6b/tokenization_chatglm.py).
+
+### 3. Configures OneAPI environment variables for Linux
 > [!NOTE]
 > Skip this step if you are running on Windows.
@@ -41,7 +50,7 @@ This is a required step on Linux for APT or offline installed oneAPI. Skip this
 source /opt/intel/oneapi/setvars.sh
 ```
 
-### 3. Runtime Configurations
+### 4. Runtime Configurations
 For optimal performance, it is recommended to set several environment variables. Please check out the suggestions based on your device.
 #### 3.1 Configurations for Linux
@@ -105,7 +114,7 @@ set SYCL_CACHE_PERSISTENT=1 > [!NOTE] > For the first time that each model runs on Intel iGPU/Intel Arc™ A300-Series or Pro A60, it may take several minutes to compile. -### 4. Running examples +### 5. Running examples ``` python ./generate.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --prompt PROMPT --n-predict N_PREDICT ``` diff --git a/python/llm/example/GPU/HuggingFace/LLM/codegeex2/codegeex2-6b/tokenization_chatglm.py b/python/llm/example/GPU/HuggingFace/LLM/codegeex2/codegeex2-6b/tokenization_chatglm.py new file mode 100644 index 00000000000..3759619ae80 --- /dev/null +++ b/python/llm/example/GPU/HuggingFace/LLM/codegeex2/codegeex2-6b/tokenization_chatglm.py @@ -0,0 +1,289 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# =========================================================================== +# +# This file is adapted from +# https://huggingface.co/THUDM/codegeex2-6b/blob/ee1e7db429e587645bd3f0f4c3f5d8e6e843f2f6/tokenization_chatglm.py +# +# Apache 2.0 license +# https://huggingface.co/THUDM/codegeex2-6b/blob/main/LICENSE + +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + return self.sp_model.decode(t) + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens or index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.tokenizer = SPTokenizer(vocab_file) + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_prompt(self, query, history=None): + if history is None: + history = [] + prompt = "" + for i, (old_query, response) in enumerate(history): + prompt += "[Round {}]\n\n问:{}\n\n答:{}\n\n".format(i + 1, old_query, response) + prompt += "[Round {}]\n\n问:{}\n\n答:".format(len(history) + 1, query) + return prompt + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). 
+ return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + # assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if self.padding_side == "left": + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + else: + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = encoded_inputs["position_ids"] + [0] * difference + encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference + + return encoded_inputs diff --git a/python/llm/example/GPU/HuggingFace/LLM/codellama/generate.py b/python/llm/example/GPU/HuggingFace/LLM/codellama/generate.py index c772d8c2bc8..05b14c2fafb 100644 --- a/python/llm/example/GPU/HuggingFace/LLM/codellama/generate.py +++ b/python/llm/example/GPU/HuggingFace/LLM/codellama/generate.py @@ -47,7 +47,7 @@ optimize_model=False, trust_remote_code=True, use_cache=True) - model = model.to('xpu') + model = model.half().to('xpu') # Load tokenizer tokenizer = CodeLlamaTokenizer.from_pretrained(model_path, diff --git a/python/llm/example/GPU/HuggingFace/LLM/codellama/readme.md b/python/llm/example/GPU/HuggingFace/LLM/codellama/readme.md index f977a09b5fe..40cf921c3fc 100644 --- a/python/llm/example/GPU/HuggingFace/LLM/codellama/readme.md +++ b/python/llm/example/GPU/HuggingFace/LLM/codellama/readme.md @@ -14,8 +14,6 @@ conda create -n llm python=3.11 conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - -pip install transformers==4.34.1 # CodeLlamaTokenizer is supported in higher version of transformers ``` #### 1.2 Installation on Windows @@ -26,8 +24,6 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - -pip install transformers==4.34.1 # CodeLlamaTokenizer is supported in higher version of transformers ``` ### 2. 
Configures OneAPI environment variables for Linux diff --git a/python/llm/example/GPU/HuggingFace/LLM/codeshell/README.md b/python/llm/example/GPU/HuggingFace/LLM/codeshell/README.md index e12e8163789..e6117152df1 100644 --- a/python/llm/example/GPU/HuggingFace/LLM/codeshell/README.md +++ b/python/llm/example/GPU/HuggingFace/LLM/codeshell/README.md @@ -21,6 +21,12 @@ Suppose you have already configured GPU environment, you will need some extra pr ## 1. How to use this server +This is a required step on Linux for APT. Skip this step for PIP-installed oneAPI or if you are running on Windows. +```bash +source /opt/intel/oneapi/setvars.sh +``` + +Then run the following command in the terminal: ``` python server.py [--option value] ``` diff --git a/python/llm/example/GPU/HuggingFace/LLM/deciLM-7b/README.md b/python/llm/example/GPU/HuggingFace/LLM/deciLM-7b/README.md index 885cf792dce..728d534b07b 100644 --- a/python/llm/example/GPU/HuggingFace/LLM/deciLM-7b/README.md +++ b/python/llm/example/GPU/HuggingFace/LLM/deciLM-7b/README.md @@ -14,8 +14,6 @@ conda create -n llm python=3.11 conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - -pip install transformers==4.35.2 # required by DeciLM-7B ``` #### 1.2 Installation on Windows @@ -26,8 +24,6 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - -pip install transformers==4.35.2 # required by DeciLM-7B ``` ### 2. Configures OneAPI environment variables for Linux diff --git a/python/llm/example/GPU/HuggingFace/LLM/internlm/generate.py b/python/llm/example/GPU/HuggingFace/LLM/internlm/generate.py index 7c3b2dacc70..d08fca0ae13 100644 --- a/python/llm/example/GPU/HuggingFace/LLM/internlm/generate.py +++ b/python/llm/example/GPU/HuggingFace/LLM/internlm/generate.py @@ -47,7 +47,7 @@ optimize_model=False, trust_remote_code=True, use_cache=True) - model = model.to('xpu') + model = model.half().to('xpu') # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path, diff --git a/python/llm/example/GPU/HuggingFace/LLM/internlm2/README.md b/python/llm/example/GPU/HuggingFace/LLM/internlm2/README.md index f8906fb2880..6d16158f10d 100644 --- a/python/llm/example/GPU/HuggingFace/LLM/internlm2/README.md +++ b/python/llm/example/GPU/HuggingFace/LLM/internlm2/README.md @@ -14,7 +14,7 @@ conda create -n llm python=3.11 conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -pip install transformers==3.36.2 +pip install einops pip install huggingface_hub ``` @@ -26,7 +26,7 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -pip install transformers==3.36.2 +pip install einops pip install huggingface_hub ``` diff --git a/python/llm/example/GPU/HuggingFace/LLM/mistral/README.md b/python/llm/example/GPU/HuggingFace/LLM/mistral/README.md index 4de40cabb0f..63542bcfb76 100644 --- a/python/llm/example/GPU/HuggingFace/LLM/mistral/README.md +++ 
b/python/llm/example/GPU/HuggingFace/LLM/mistral/README.md @@ -4,7 +4,6 @@ In this directory, you will find examples on how you could apply IPEX-LLM INT4 o ## Requirements To run these examples with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information. -**Important: According to [Mistral Troubleshooting](https://huggingface.co/mistralai/Mistral-7B-v0.1#troubleshooting), please make sure you have installed `transformers==4.34.0` to run the example.** ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Mistral model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations on Intel GPUs. @@ -16,9 +15,6 @@ conda create -n llm python=3.11 conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - -# Refer to https://huggingface.co/mistralai/Mistral-7B-v0.1#troubleshooting, please make sure you are using a stable version of Transformers, 4.34.0 or newer. -pip install transformers==4.34.0 ``` #### 1.2 Installation on Windows @@ -29,9 +25,6 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - -# Refer to https://huggingface.co/mistralai/Mistral-7B-v0.1#troubleshooting, please make sure you are using a stable version of Transformers, 4.34.0 or newer. -pip install transformers==4.34.0 ``` ### 2. Configures OneAPI environment variables for Linux diff --git a/python/llm/example/GPU/HuggingFace/LLM/qwen/README.md b/python/llm/example/GPU/HuggingFace/LLM/qwen/README.md index 500e2b0f2ad..8311f7f1369 100644 --- a/python/llm/example/GPU/HuggingFace/LLM/qwen/README.md +++ b/python/llm/example/GPU/HuggingFace/LLM/qwen/README.md @@ -15,6 +15,7 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +pip install "transformers<4.37.0" pip install tiktoken einops transformers_stream_generator # additional package required for Qwen-7B-Chat to conduct generation ``` @@ -27,6 +28,7 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +pip install "transformers<4.37.0" pip install tiktoken einops transformers_stream_generator # additional package required for Qwen-7B-Chat to conduct generation ``` diff --git a/python/llm/example/GPU/HuggingFace/LLM/qwen2/generate.py b/python/llm/example/GPU/HuggingFace/LLM/qwen2/generate.py index 25fdaeec16a..7d0d1ed072b 100644 --- a/python/llm/example/GPU/HuggingFace/LLM/qwen2/generate.py +++ b/python/llm/example/GPU/HuggingFace/LLM/qwen2/generate.py @@ -19,7 +19,6 @@ import argparse from transformers import AutoTokenizer -from ipex_llm import optimize_model import numpy as np @@ -36,7 +35,7 @@ args = parser.parse_args() model_path = args.repo_id_or_model_path - + from ipex_llm.transformers import AutoModelForCausalLM # Load model in 4 bit, # which convert the relevant layers in the model into INT4 format @@ 
-45,7 +44,7 @@ optimize_model=True, trust_remote_code=True, use_cache=True) - model = model.to("xpu") + model = model.half().to("xpu") # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path, diff --git a/python/llm/example/GPU/HuggingFace/LLM/replit/README.md b/python/llm/example/GPU/HuggingFace/LLM/replit/README.md index 7c12b97707c..644de85aa3e 100644 --- a/python/llm/example/GPU/HuggingFace/LLM/replit/README.md +++ b/python/llm/example/GPU/HuggingFace/LLM/replit/README.md @@ -15,7 +15,7 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -pip install "transformers<4.35" +pip install transformers<=4.33.3 ``` #### 1.2 Installation on Windows @@ -26,6 +26,8 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +pip install transformers<=4.33.3 ``` ### 2. Configures OneAPI environment variables for Linux diff --git a/python/llm/example/GPU/HuggingFace/LLM/solar/README.md b/python/llm/example/GPU/HuggingFace/LLM/solar/README.md index 811d712ea6f..b86044bb854 100644 --- a/python/llm/example/GPU/HuggingFace/LLM/solar/README.md +++ b/python/llm/example/GPU/HuggingFace/LLM/solar/README.md @@ -14,8 +14,6 @@ conda create -n llm python=3.11 conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - -pip install transformers==4.35.2 # required by SOLAR ``` #### 1.2 Installation on Windows @@ -26,8 +24,6 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - -pip install transformers==4.35.2 # required by SOLAR ``` ### 2. Configures OneAPI environment variables for Linux diff --git a/python/llm/example/GPU/HuggingFace/LLM/solar/generate.py b/python/llm/example/GPU/HuggingFace/LLM/solar/generate.py index eb5e4aa42cc..25b692ca084 100644 --- a/python/llm/example/GPU/HuggingFace/LLM/solar/generate.py +++ b/python/llm/example/GPU/HuggingFace/LLM/solar/generate.py @@ -47,7 +47,7 @@ load_in_4bit=True, trust_remote_code=True, use_cache=True) - model = model.to('xpu') + model = model.half().to('xpu') # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path, diff --git a/python/llm/example/GPU/HuggingFace/LLM/vicuna/README.md b/python/llm/example/GPU/HuggingFace/LLM/vicuna/README.md index 852c29dea34..7f4b9806a6c 100644 --- a/python/llm/example/GPU/HuggingFace/LLM/vicuna/README.md +++ b/python/llm/example/GPU/HuggingFace/LLM/vicuna/README.md @@ -1,5 +1,5 @@ # Vicuna -In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on Vicuna models. For illustration purposes, we utilize the [lmsys/vicuna-13b-v1.3](https://huggingface.co/lmsys/vicuna-13b-v1.3) and [eachadea/vicuna-7b-1.1](https://huggingface.co/eachadea/vicuna-7b-1.1) as reference Vicuna models. +In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on Vicuna models. 
For illustration purposes, we utilize the [lmsys/vicuna-13b-v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5) and [lmsys/vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) as reference Vicuna models. ## 0. Requirements To run these examples with IPEX-LLM, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information. @@ -109,7 +109,7 @@ python ./generate.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --prompt PROM ``` Arguments info: -- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Vicuna model (e.g. `lmsys/vicuna-13b-v1.3` and `eachadea/vicuna-7b-1.1`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'lmsys/vicuna-13b-v1.3'`. +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Vicuna model (e.g. `lmsys/vicuna-13b-v1.5` and `eachadea/vicuna-7b-v1.5`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'lmsys/vicuna-13b-v1.5'`. - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'What is AI?'`. - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. @@ -118,7 +118,7 @@ Arguments info: > Please select the appropriate size of the Vicuna model based on the capabilities of your machine. #### Sample Output -#### [lmsys/vicuna-13b-v1.3](https://huggingface.co/lmsys/vicuna-13b-v1.3) +#### [lmsys/vicuna-13b-v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5) ```log Inference time: xxxx s -------------------- Prompt -------------------- @@ -130,10 +130,10 @@ What is AI? ### Human: What is AI? ### Assistant: -AI, or Artificial Intelligence, refers to the development of computer systems that can perform tasks that typically require human intelligence, such as visual perception, +AI stands for Artificial Intelligence. It refers to the development of computer systems that can perform tasks that typically require human intelligence, such as visual perception ``` -#### [eachadea/vicuna-7b-1.1](https://huggingface.co/eachadea/vicuna-7b-1.1) +#### [eachadea/vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) ```log Inference time: xxxx s -------------------- Prompt -------------------- @@ -145,5 +145,5 @@ What is AI? ### Human: What is AI? ### Assistant: -AI, or artificial intelligence, refers to the ability of a machine or computer program to mimic human intelligence and perform tasks that would normally require human intelligence to +AI stands for "Artificial Intelligence." It refers to the development of computer systems that can perform tasks that typically require human intelligence, such as visual per ``` diff --git a/python/llm/example/GPU/HuggingFace/LLM/vicuna/generate.py b/python/llm/example/GPU/HuggingFace/LLM/vicuna/generate.py index 1cf63a2cf74..e473a2dc63e 100644 --- a/python/llm/example/GPU/HuggingFace/LLM/vicuna/generate.py +++ b/python/llm/example/GPU/HuggingFace/LLM/vicuna/generate.py @@ -27,8 +27,8 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Vicuna model') - parser.add_argument('--repo-id-or-model-path', type=str, default="lmsys/vicuna-13b-v1.3", - help='The huggingface repo id for the Vicuna (e.g. 
`lmsys/vicuna-13b-v1.3` and `eachadea/vicuna-7b-1.1`) to be downloaded' + parser.add_argument('--repo-id-or-model-path', type=str, default="lmsys/vicuna-13b-v1.5", + help='The huggingface repo id for the Vicuna (e.g. `lmsys/vicuna-13b-v1.5` and `lmsys/vicuna-7b-v1.5`) to be downloaded' ', or the path to the huggingface checkpoint folder') parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') @@ -57,7 +57,7 @@ # enabling `use_cache=True` allows the model to utilize the previous # key/values attentions to speed up decoding; # to obtain optimal performance with IPEX-LLM INT4 optimizations, - # it is important to set use_cache=True for vicuna-v1.3 models + # it is important to set use_cache=True for vicuna-v1.5 models output = model.generate(input_ids, use_cache=True, max_new_tokens=args.n_predict) diff --git a/python/llm/example/GPU/HuggingFace/LLM/yi/README.md b/python/llm/example/GPU/HuggingFace/LLM/yi/README.md index 1cda888832e..080e2676fdc 100644 --- a/python/llm/example/GPU/HuggingFace/LLM/yi/README.md +++ b/python/llm/example/GPU/HuggingFace/LLM/yi/README.md @@ -1,5 +1,5 @@ # Yi -In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on Yi models on [Intel GPUs](../../../README.md). For illustration purposes, we utilize the [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) as a reference Yi model. +In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on Yi models on [Intel GPUs](../../../README.md). For illustration purposes, we utilize the [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) and [01-ai/Yi-6B-Chat](https://huggingface.co/01-ai/Yi-1.5-6B-Chat) as reference Yi models. ## 0. Requirements To run these examples with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information. @@ -112,7 +112,7 @@ python ./generate.py In the example, several arguments can be passed to satisfy your requirements: -- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Yi model (e.g. `01-ai/Yi-6B`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'01-ai/Yi-6B'`. +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Yi model (e.g. `01-ai/Yi-6B` and `01-ai/Yi-6B-Chat`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'01-ai/Yi-6B-Chat'`. - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'AI是什么?'`. - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. @@ -127,3 +127,13 @@ AI是什么? AI是什么? 人工智能(Artificial Intelligence),英文缩写为AI。它是研究、开发用于模拟、延伸和扩展人的智能的理论、方法、技术及 ``` + +#### [01-ai/Yi-6B-Chat](https://huggingface.co/01-ai/Yi-6B-Chat) +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +AI是什么? +-------------------- Output -------------------- +AI是什么? 
+人工智能(Artificial Intelligence, AI)是计算机科学的一个分支,它研究如何让计算机模拟人类的智能行为。人工智能可以通过模仿人类的思维过程和 +``` \ No newline at end of file diff --git a/python/llm/example/GPU/HuggingFace/LLM/yi/generate.py b/python/llm/example/GPU/HuggingFace/LLM/yi/generate.py index f9a0e544bc7..643c5f7b34d 100644 --- a/python/llm/example/GPU/HuggingFace/LLM/yi/generate.py +++ b/python/llm/example/GPU/HuggingFace/LLM/yi/generate.py @@ -21,18 +21,10 @@ from ipex_llm.transformers import AutoModelForCausalLM from transformers import AutoTokenizer -# Refer to https://huggingface.co/01-ai/Yi-6B-Chat#31-use-the-chat-model -YI_PROMPT_FORMAT = """ -<|im_start|>system -You are a helpful assistant. If you don't understand what the user means, ask the user to provide more information.<|im_end|> -<|im_start|>user -{prompt}<|im_end|> -<|im_start|>assistant -""" if __name__ == '__main__': parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Yi model') - parser.add_argument('--repo-id-or-model-path', type=str, default="01-ai/Yi-6B", + parser.add_argument('--repo-id-or-model-path', type=str, default="01-ai/Yi-6B-Chat", help='The huggingface repo id for the Yi model to be downloaded' ', or the path to the huggingface checkpoint folder') parser.add_argument('--prompt', type=str, default="AI是什么?", @@ -60,7 +52,7 @@ # Generate predicted tokens with torch.inference_mode(): - prompt = YI_PROMPT_FORMAT.format(prompt=args.prompt) + prompt = args.prompt input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') # ipex_llm model needs a warmup, then inference time can be accurate output = model.generate(input_ids, diff --git a/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-Llama3-V-2_5/README.md b/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-Llama3-V-2_5/README.md index 8d88fbb23a6..ee653b58136 100644 --- a/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-Llama3-V-2_5/README.md +++ b/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-Llama3-V-2_5/README.md @@ -5,7 +5,7 @@ In this directory, you will find examples on how you could apply IPEX-LLM INT4 o To run these examples with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information. ## Example: Predict Tokens using `chat()` API -In the example [generate.py](./generate.py), we show a basic use case for a MiniCPM-Llama3-V-2_5 model to predict the next N tokens using `chat()` API, with IPEX-LLM INT4 optimizations on Intel GPUs. +In the example [chat.py](./chat.py), we show a basic use case for a MiniCPM-Llama3-V-2_5 model to predict the next N tokens using `chat()` API, with IPEX-LLM INT4 optimizations on Intel GPUs. ### 1. Install #### 1.1 Installation on Linux We suggest using conda to manage environment: @@ -106,15 +106,20 @@ set SYCL_CACHE_PERSISTENT=1 > For the first time that each model runs on Intel iGPU/Intel Arc™ A300-Series or Pro A60, it may take several minutes to compile. ### 4. Running examples -``` -python ./generate.py --prompt 'What is in the image?' -``` +- chat without streaming mode: + ``` + python ./chat.py --prompt 'What is in the image?' + ``` +- chat in streaming mode: + ``` + python ./chat.py --prompt 'What is in the image?' --stream + ``` Arguments info: - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the MiniCPM-Llama3-V-2_5 (e.g. `openbmb/MiniCPM-Llama3-V-2_5`) to be downloaded, or the path to the huggingface checkpoint folder. 
It is default to be `'openbmb/MiniCPM-Llama3-V-2_5'`. - `--image-url-or-path IMAGE_URL_OR_PATH`: argument defining the image to be infered. It is default to be `'http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg'`. - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'What is in the image?'`. -- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. +- `--stream`: flag to chat in streaming mode #### Sample Output @@ -122,12 +127,21 @@ Arguments info: ```log Inference time: xxxx s --------------------- Input -------------------- +-------------------- Input Image -------------------- http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg --------------------- Prompt -------------------- +-------------------- Input Prompt -------------------- What is in the image? --------------------- Output -------------------- -The image features a young child holding a white teddy bear. The teddy bear is dressed in a pink outfit. The child appears to be outdoors, with a stone wall and some red flowers in the background. +-------------------- Chat Output -------------------- +The image features a young child holding a white teddy bear. The teddy bear is dressed in a pink dress with a ribbon on it. The child appears to be smiling and enjoying the moment. +``` +```log +Inference time: xxxx s +-------------------- Input Image -------------------- +http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg +-------------------- Input Prompt -------------------- +图片里有什么? +-------------------- Chat Output -------------------- +图片中有一个小孩,手里拿着一个白色的玩具熊。这个孩子看起来很开心,正在微笑并与玩具互动。背景包括红色的花朵和石墙,为这个场景增添了色彩和质感。 ``` The sample input image is (which is fetched from [COCO dataset](https://cocodataset.org/#explore?id=264959)): diff --git a/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-Llama3-V-2_5/generate.py b/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-Llama3-V-2_5/chat.py similarity index 63% rename from python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-Llama3-V-2_5/generate.py rename to python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-Llama3-V-2_5/chat.py index e1bde9ee2bc..66aa46304db 100644 --- a/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-Llama3-V-2_5/generate.py +++ b/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-Llama3-V-2_5/chat.py @@ -14,10 +14,12 @@ # limitations under the License. # + import os import time import argparse import requests +import torch from PIL import Image from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer @@ -33,8 +35,8 @@ help='The URL or path to the image to infer') parser.add_argument('--prompt', type=str, default="What is in the image?", help='Prompt to infer') - parser.add_argument('--n-predict', type=int, default=32, - help='Max tokens to predict') + parser.add_argument('--stream', action='store_true', + help='Whether to chat in streaming mode') args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -45,12 +47,12 @@ # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
model = AutoModel.from_pretrained(model_path, - load_in_4bit=True, - optimize_model=False, + load_in_low_bit="sym_int4", + optimize_model=True, trust_remote_code=True, - modules_to_not_convert=["vpm", "resampler"], - use_cache=True) - model = model.float().to(device='xpu') + use_cache=True, + modules_to_not_convert=["vpm", "resampler"]) + model = model.half().to('xpu') tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) model.eval() @@ -62,23 +64,45 @@ image = Image.open(requests.get(image_path, stream=True).raw).convert('RGB') # Generate predicted tokens - # here the prompt tuning refers to https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5/blob/main/README.md - msgs = [{'role': 'user', 'content': args.prompt}] - st = time.time() - res = model.chat( - image=image, - msgs=msgs, - context=None, - tokenizer=tokenizer, - sampling=False, - temperature=0.7 + # here the prompt tuning refers to https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/README.md + msgs = [{'role': 'user', 'content': [image, args.prompt]}] + + # ipex_llm model needs a warmup, then inference time can be accurate + model.chat( + image=None, + msgs=msgs, + tokenizer=tokenizer, ) - end = time.time() - print(f'Inference time: {end-st} s') - print('-'*20, 'Input', '-'*20) - print(image_path) - print('-'*20, 'Prompt', '-'*20) - print(args.prompt) - output_str = res - print('-'*20, 'Output', '-'*20) - print(output_str) + + if args.stream: + res = model.chat( + image=None, + msgs=msgs, + tokenizer=tokenizer, + stream=True + ) + + print('-'*20, 'Input Image', '-'*20) + print(image_path) + print('-'*20, 'Input Prompt', '-'*20) + print(args.prompt) + print('-'*20, 'Stream Chat Output', '-'*20) + for new_text in res: + print(new_text, flush=True, end='') + else: + st = time.time() + res = model.chat( + image=None, + msgs=msgs, + tokenizer=tokenizer, + ) + torch.xpu.synchronize() + end = time.time() + + print(f'Inference time: {end-st} s') + print('-'*20, 'Input Image', '-'*20) + print(image_path) + print('-'*20, 'Input Prompt', '-'*20) + print(args.prompt) + print('-'*20, 'Chat Output', '-'*20) + print(res) diff --git a/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2/README.md b/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2/README.md index cc293ab9990..aed936fb277 100644 --- a/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2/README.md +++ b/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2/README.md @@ -1,11 +1,11 @@ # MiniCPM-V-2 -In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on MiniCPM-V-2 models on [Intel GPUs](../../../README.md). For illustration purposes, we utilize the [openbmb/MiniCPM-V-2](https://huggingface.co/openbmb/MiniCPM-V-2) as a reference MiniCPM-V-2 model. +In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on MiniCPM-V-2 model on [Intel GPUs](../../../README.md). For illustration purposes, we utilize the [openbmb/MiniCPM-V-2](https://huggingface.co/openbmb/MiniCPM-V-2) as a reference MiniCPM-V-2 model. ## 0. Requirements To run these examples with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information. ## Example: Predict Tokens using `chat()` API -In the example [generate.py](./generate.py), we show a basic use case for a MiniCPM-V-2 model to predict the next N tokens using `chat()` API, with IPEX-LLM INT4 optimizations on Intel GPUs. 
+In the example [chat.py](./chat.py), we show a basic use case for a MiniCPM-V-2 model to predict the next N tokens using `chat()` API, with IPEX-LLM INT4 optimizations on Intel GPUs. ### 1. Install #### 1.1 Installation on Linux We suggest using conda to manage environment: @@ -106,15 +106,20 @@ set SYCL_CACHE_PERSISTENT=1 > For the first time that each model runs on Intel iGPU/Intel Arc™ A300-Series or Pro A60, it may take several minutes to compile. ### 4. Running examples -``` -python ./generate.py --prompt 'What is in the image?' -``` +- chat without streaming mode: + ``` + python ./chat.py --prompt 'What is in the image?' + ``` +- chat in streaming mode: + ``` + python ./chat.py --prompt 'What is in the image?' --stream + ``` Arguments info: - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the MiniCPM-V-2 (e.g. `openbmb/MiniCPM-V-2`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'openbmb/MiniCPM-V-2'`. - `--image-url-or-path IMAGE_URL_OR_PATH`: argument defining the image to be infered. It is default to be `'http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg'`. - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'What is in the image?'`. -- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. +- `--stream`: flag to chat in streaming mode #### Sample Output @@ -122,12 +127,20 @@ Arguments info: ```log Inference time: xxxx s --------------------- Input -------------------- +-------------------- Input Image -------------------- http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg --------------------- Prompt -------------------- +-------------------- Input Prompt -------------------- What is in the image? --------------------- Output -------------------- -In the image, there is a young child holding a teddy bear. The teddy bear appears to be dressed in a pink tutu. The child is also wearing a red and white striped dress. The background of the image includes a stone wall and some red flowers. +-------------------- Chat Output -------------------- +In the image, there is a young child holding a teddy bear. The teddy bear is dressed in a pink tutu. The child is also wearing a red and white striped dress. The background of the image features a stone wall and some red flowers. +``` +```log +-------------------- Input Image -------------------- +http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg +-------------------- Input Prompt -------------------- +图片里有什么? 
+-------------------- Chat Output -------------------- +图中是一个小女孩,她手里拿着一只粉白相间的泰迪熊。 ``` The sample input image is (which is fetched from [COCO dataset](https://cocodataset.org/#explore?id=264959)): diff --git a/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2/generate.py b/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2/chat.py similarity index 82% rename from python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2/generate.py rename to python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2/chat.py index 07b3021e0c2..93441c84bbb 100644 --- a/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2/generate.py +++ b/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2/chat.py @@ -15,6 +15,7 @@ # + from typing import List, Tuple, Optional, Union import math import timm @@ -110,6 +111,7 @@ def _pos_embed(self, x: torch.Tensor) -> torch.Tensor: import time import argparse import requests +import torch from PIL import Image from ipex_llm.transformers import AutoModel from transformers import AutoTokenizer @@ -125,8 +127,8 @@ def _pos_embed(self, x: torch.Tensor) -> torch.Tensor: help='The URL or path to the image to infer') parser.add_argument('--prompt', type=str, default="What is in the image?", help='Prompt to infer') - parser.add_argument('--n-predict', type=int, default=32, - help='Max tokens to predict') + parser.add_argument('--stream', action='store_true', + help='Whether to chat in streaming mode') args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -140,9 +142,9 @@ def _pos_embed(self, x: torch.Tensor) -> torch.Tensor: load_in_low_bit="asym_int4", optimize_model=True, trust_remote_code=True, - modules_to_not_convert=["vpm", "resampler", "lm_head"], - use_cache=True) - model = model.float().to(device='xpu') + use_cache=True, + modules_to_not_convert=["vpm", "resampler"]) + model = model.half().to('xpu') tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) model.eval() @@ -156,7 +158,8 @@ def _pos_embed(self, x: torch.Tensor) -> torch.Tensor: # Generate predicted tokens # here the prompt tuning refers to https://huggingface.co/openbmb/MiniCPM-V-2/blob/main/README.md msgs = [{'role': 'user', 'content': args.prompt}] - st = time.time() + + # ipex_llm model needs a warmup, then inference time can be accurate res, context, _ = model.chat( image=image, msgs=msgs, @@ -165,12 +168,40 @@ def _pos_embed(self, x: torch.Tensor) -> torch.Tensor: sampling=False, temperature=0.7 ) - end = time.time() - print(f'Inference time: {end-st} s') - print('-'*20, 'Input', '-'*20) - print(image_path) - print('-'*20, 'Prompt', '-'*20) - print(args.prompt) - output_str = res - print('-'*20, 'Output', '-'*20) - print(output_str) + if args.stream: + res, context, _ = model.chat( + image=image, + msgs=msgs, + context=None, + tokenizer=tokenizer, + sampling=False, + temperature=0.7 + ) + + print('-'*20, 'Input Image', '-'*20) + print(image_path) + print('-'*20, 'Input Prompt', '-'*20) + print(args.prompt) + print('-'*20, 'Stream Chat Output', '-'*20) + for new_text in res: + print(new_text, flush=True, end='') + else: + st = time.time() + res, context, _ = model.chat( + image=image, + msgs=msgs, + context=None, + tokenizer=tokenizer, + sampling=False, + temperature=0.7 + ) + torch.xpu.synchronize() + end = time.time() + + print(f'Inference time: {end-st} s') + print('-'*20, 'Input Image', '-'*20) + print(image_path) + print('-'*20, 'Input Prompt', '-'*20) + print(args.prompt) + print('-'*20, 'Chat Output', '-'*20) + print(res) diff --git 
a/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6/README.md b/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6/README.md new file mode 100644 index 00000000000..6063a286b4a --- /dev/null +++ b/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6/README.md @@ -0,0 +1,150 @@ +# MiniCPM-V-2_6 +In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on MiniCPM-V-2_6 model on [Intel GPUs](../../../README.md). For illustration purposes, we utilize [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) as reference MiniCPM-V-2_6 model. + +## 0. Requirements +To run these examples with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information. + +## Example: Predict Tokens using `chat()` API +In the example [chat.py](./chat.py), we show a basic use case for a MiniCPM-V-2_6 model to predict the next N tokens using `chat()` API, with IPEX-LLM INT4 optimizations on Intel GPUs. +### 1. Install +#### 1.1 Installation on Linux +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.11 +conda activate llm +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +pip install transformers==4.40.0 trl +``` + +#### 1.2 Installation on Windows +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.11 libuv +conda activate llm + +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +pip install transformers==4.40.0 trl +``` + +### 2. Configures OneAPI environment variables for Linux + +> [!NOTE] +> Skip this step if you are running on Windows. + +This is a required step on Linux for APT or offline installed oneAPI. Skip this step for PIP-installed oneAPI. + +```bash +source /opt/intel/oneapi/setvars.sh +``` + +### 3. Runtime Configurations +For optimal performance, it is recommended to set several environment variables. Please check out the suggestions based on your device. +#### 3.1 Configurations for Linux +
+ +For Intel Arc™ A-Series Graphics and Intel Data Center GPU Flex Series + +```bash +export USE_XETLA=OFF +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_CACHE_PERSISTENT=1 +``` + +
+ +
+ +For Intel Data Center GPU Max Series + +```bash +export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_CACHE_PERSISTENT=1 +export ENABLE_SDP_FUSION=1 +``` +> Note: Please note that `libtcmalloc.so` can be installed by `conda install -c conda-forge -y gperftools=2.10`. +
+ +
+ +For Intel iGPU + +```bash +export SYCL_CACHE_PERSISTENT=1 +export BIGDL_LLM_XMX_DISABLED=1 +``` + +
+ +#### 3.2 Configurations for Windows +
+ +For Intel iGPU + +```cmd +set SYCL_CACHE_PERSISTENT=1 +set BIGDL_LLM_XMX_DISABLED=1 +``` + +
+ +
+ +For Intel Arc™ A-Series Graphics + +```cmd +set SYCL_CACHE_PERSISTENT=1 +``` + +
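+Before moving on to the examples, you can optionally confirm that the XPU device is visible to PyTorch from the environment configured above. This is a minimal sanity check rather than part of the original example, and it assumes `ipex-llm[xpu]` is installed in the active conda environment:
+
+```python
+import torch
+import intel_extension_for_pytorch as ipex  # registers the `torch.xpu` backend
+
+# Both calls should succeed if the GPU driver and oneAPI runtime are set up correctly
+print(torch.xpu.is_available())   # expected: True
+print(torch.xpu.device_count())   # expected: 1 or more
+```
+
+If `torch.xpu.is_available()` returns `False`, revisit the driver installation and the environment variables above before running `chat.py`.
+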
+ +> [!NOTE] +> For the first time that each model runs on Intel iGPU/Intel Arc™ A300-Series or Pro A60, it may take several minutes to compile. +### 4. Running examples + +- chat without streaming mode: + ``` + python ./chat.py --prompt 'What is in the image?' + ``` +- chat in streaming mode: + ``` + python ./chat.py --prompt 'What is in the image?' --stream + ``` + +> [!TIP] +> For chatting in streaming mode, it is recommended to set the environment variable `PYTHONUNBUFFERED=1`. + +Arguments info: +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the MiniCPM-V-2_6 (e.g. `openbmb/MiniCPM-V-2_6`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'openbmb/MiniCPM-V-2_6'`. +- `--image-url-or-path IMAGE_URL_OR_PATH`: argument defining the image to be inferred. It is default to be `'http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg'`. +- `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). It is default to be `'What is in the image?'`. +- `--stream`: flag to chat in streaming mode + +#### Sample Output + +#### [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) + +```log +Inference time: xxxx s +-------------------- Input Image -------------------- +http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg +-------------------- Input Prompt -------------------- +What is in the image? +-------------------- Chat Output -------------------- +The image features a young child holding a white teddy bear wearing a pink dress. The background shows some red flowers and stone walls, suggesting an outdoor setting. +``` +```log +-------------------- Input Image -------------------- +http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg +-------------------- Input Prompt -------------------- +图片里有什么? +-------------------- Stream Chat Output -------------------- +图片中有一个穿着粉红色连衣裙的小孩,手里拿着一只穿着粉色芭蕾裙的白色泰迪熊。背景中有红色花朵和石头墙,表明照片可能是在户外拍摄的。 +``` +The sample input image (which is fetched from the [COCO dataset](https://cocodataset.org/#explore?id=264959)) is: + + diff --git a/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6/chat.py b/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6/chat.py new file mode 100644 index 00000000000..a698cd9d457 --- /dev/null +++ b/python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6/chat.py @@ -0,0 +1,108 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + +import os +import time +import argparse +import requests +import torch +from PIL import Image +from ipex_llm.transformers import AutoModel +from transformers import AutoTokenizer + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Predict Tokens using `chat()` API for openbmb/MiniCPM-V-2_6 model') + parser.add_argument('--repo-id-or-model-path', type=str, default="openbmb/MiniCPM-V-2_6", + help='The huggingface repo id for the openbmb/MiniCPM-V-2_6 model to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--image-url-or-path', type=str, + default='http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg', + help='The URL or path to the image to infer') + parser.add_argument('--prompt', type=str, default="What is in the image?", + help='Prompt to infer') + parser.add_argument('--stream', action='store_true', + help='Whether to chat in streaming mode') + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + image_path = args.image_url_or_path + + # Load model in 4 bit, + # which convert the relevant layers in the model into INT4 format + # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. + # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. + model = AutoModel.from_pretrained(model_path, + load_in_low_bit="sym_int4", + optimize_model=True, + trust_remote_code=True, + use_cache=True, + modules_to_not_convert=["vpm", "resampler"]) + model = model.half().to('xpu') + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + model.eval() + + query = args.prompt + if os.path.exists(image_path): + image = Image.open(image_path).convert('RGB') + else: + image = Image.open(requests.get(image_path, stream=True).raw).convert('RGB') + + # Generate predicted tokens + # here the prompt tuning refers to https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/README.md + msgs = [{'role': 'user', 'content': [image, args.prompt]}] + + # ipex_llm model needs a warmup, then inference time can be accurate + model.chat( + image=None, + msgs=msgs, + tokenizer=tokenizer, + ) + + if args.stream: + res = model.chat( + image=None, + msgs=msgs, + tokenizer=tokenizer, + stream=True + ) + + print('-'*20, 'Input Image', '-'*20) + print(image_path) + print('-'*20, 'Input Prompt', '-'*20) + print(args.prompt) + print('-'*20, 'Stream Chat Output', '-'*20) + for new_text in res: + print(new_text, flush=True, end='') + else: + st = time.time() + res = model.chat( + image=None, + msgs=msgs, + tokenizer=tokenizer, + ) + torch.xpu.synchronize() + end = time.time() + + print(f'Inference time: {end-st} s') + print('-'*20, 'Input Image', '-'*20) + print(image_path) + print('-'*20, 'Input Prompt', '-'*20) + print(args.prompt) + print('-'*20, 'Chat Output', '-'*20) + print(res) diff --git a/python/llm/example/GPU/HuggingFace/Multimodal/qwen-vl/README.md b/python/llm/example/GPU/HuggingFace/Multimodal/qwen-vl/README.md index fb02816b1f0..737232661fd 100644 --- a/python/llm/example/GPU/HuggingFace/Multimodal/qwen-vl/README.md +++ b/python/llm/example/GPU/HuggingFace/Multimodal/qwen-vl/README.md @@ -15,6 +15,7 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +pip install "transformers<4.37.0" pip install accelerate tiktoken 
einops transformers_stream_generator==0.0.4 scipy torchvision pillow tensorboard matplotlib # additional package required for Qwen-VL-Chat to conduct generation ``` @@ -27,6 +28,7 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +pip install "transformers<4.37.0" pip install accelerate tiktoken einops transformers_stream_generator==0.0.4 scipy torchvision pillow tensorboard matplotlib # additional package required for Qwen-VL-Chat to conduct generation ``` diff --git a/python/llm/example/GPU/HuggingFace/Multimodal/qwen2-audio/README.md b/python/llm/example/GPU/HuggingFace/Multimodal/qwen2-audio/README.md new file mode 100644 index 00000000000..b201467a138 --- /dev/null +++ b/python/llm/example/GPU/HuggingFace/Multimodal/qwen2-audio/README.md @@ -0,0 +1,127 @@ +# Qwen2-Audio +In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on Qwen2-Audio models on [Intel GPUs](../../../README.md). For illustration purposes, we utilize [Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct) as reference model. + +## 0. Requirements +To run these examples with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information. + + +## Example: Predict Tokens using `generate()` API +In the example [generate.py](./generate.py), we show a basic use case for a Qwen2-Audio model to conduct transcription using `processor` API, then use the recognized text as the input for the Qwen2-Audio model to perform an English-Chinese translation using `generate()` API, with IPEX-LLM INT4 optimizations on Intel GPUs. +### 1. Install + +> [!NOTE] +> Qwen2-Audio requires a minimum `transformers` version of 4.35.0, which is not yet released. Currently, you can install the latest version of `transformers` from GitHub. When such a version is released, you can install it using `pip install transformers==4.35.0`. + +#### 1.1 Installation on Linux +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.11 +conda activate llm +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +pip install librosa +pip install git+https://github.com/huggingface/transformers +``` + +#### 1.2 Installation on Windows +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.11 libuv +conda activate llm + +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +pip install librosa +pip install git+https://github.com/huggingface/transformers +``` + +### 2. Configures OneAPI environment variables for Linux + +> [!NOTE] +> Skip this step if you are running on Windows. + +This is a required step on Linux for APT or offline installed oneAPI. Skip this step for PIP-installed oneAPI. + +```bash +source /opt/intel/oneapi/setvars.sh +``` + +### 3. Runtime Configurations +For optimal performance, it is recommended to set several environment variables. Please check out the suggestions based on your device. +#### 3.1 Configurations for Linux +
+ +For Intel Arc™ A-Series Graphics and Intel Data Center GPU Flex Series + +```bash +export USE_XETLA=OFF +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_CACHE_PERSISTENT=1 +``` + +
+ +
+ +For Intel Data Center GPU Max Series + +```bash +export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_CACHE_PERSISTENT=1 +export ENABLE_SDP_FUSION=1 +``` +> Note: Please note that `libtcmalloc.so` can be installed by `conda install -c conda-forge -y gperftools=2.10`. +
+ +
+ +For Intel iGPU + +```bash +export SYCL_CACHE_PERSISTENT=1 +export BIGDL_LLM_XMX_DISABLED=1 +``` + +
+ +#### 3.2 Configurations for Windows +
+ +For Intel iGPU + +```cmd +set SYCL_CACHE_PERSISTENT=1 +set BIGDL_LLM_XMX_DISABLED=1 +``` + +
+ +
+ +For Intel Arc™ A-Series Graphics + +```cmd +set SYCL_CACHE_PERSISTENT=1 +``` + +
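+The example below fetches its input audio from a URL. If you prefer to test with a local recording, the audio can be loaded the same way `generate.py` does; the following is a minimal sketch, where the path `sample.wav` is a hypothetical placeholder rather than part of the original example:
+
+```python
+import librosa
+from transformers import AutoProcessor
+
+# Resample the local file to the rate expected by the Qwen2-Audio feature extractor
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
+audio, _ = librosa.load("sample.wav", sr=processor.feature_extractor.sampling_rate)
+# `audio` can then be passed to the processor in place of the downloaded clip
+```
+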
+ +> [!NOTE] +> For the first time that each model runs on Intel iGPU/Intel Arc™ A300-Series or Pro A60, it may take several minutes to compile. +### 4. Running examples + +``` +python ./generate.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH +``` + +Arguments info: +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Qwen2-Audio model (e.g. `Qwen/Qwen2-Audio-7B-Instruct`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'Qwen/Qwen2-Audio-7B-Instruct'`. + +#### Sample Output +In `generate.py`, [an audio clip](https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav) is used as the input, which asks the model to translate an English sentence into Chinese. The response from the model is expected to be similar to: +```bash +['每个人都希望被赏识,所以如果你欣赏某人,不要保密。'] +``` diff --git a/python/llm/example/GPU/HuggingFace/Multimodal/qwen2-audio/generate.py b/python/llm/example/GPU/HuggingFace/Multimodal/qwen2-audio/generate.py new file mode 100644 index 00000000000..fd186f3e563 --- /dev/null +++ b/python/llm/example/GPU/HuggingFace/Multimodal/qwen2-audio/generate.py @@ -0,0 +1,75 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import argparse +from io import BytesIO +from urllib.request import urlopen +import librosa +import torch +from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor +from ipex_llm import optimize_model + +def main(args): + model_path = args.repo_id_or_model_path + max_length = args.max_length + audio_url = args.audio_url + + processor = AutoProcessor.from_pretrained(model_path) + model = Qwen2AudioForConditionalGeneration.from_pretrained(model_path) + model = optimize_model(model, low_bit='sym_int4', optimize_llm=True) + model = model.half().to('xpu') + + conversation = [ + {"role": "user", "content": [ + {"type": "audio", "audio_url": audio_url}, + ]}, + ] + text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) + audios = [] + for message in conversation: + if isinstance(message["content"], list): + for ele in message["content"]: + if ele["type"] == "audio": + audios.append(librosa.load( + BytesIO(urlopen(ele['audio_url']).read()), + sr=processor.feature_extractor.sampling_rate)[0] + ) + + inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True) + inputs = inputs.to('xpu') + + with torch.inference_mode(): + generate_ids = model.generate(**inputs, max_length=max_length) # warmup + import time + st = time.time() + generate_ids = model.generate(**inputs, max_length=max_length) + generate_ids = generate_ids[:, inputs.input_ids.size(1):] + et = time.time() + print(f'Inference time: {et-st} s') + + response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) + print(response) + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="Qwen2-Audio") + parser.add_argument('--repo-id-or-model-path', type=str, default="Qwen/Qwen2-Audio-7B-Instruct", + help='The huggingface repo id for the Qwen2-Audio model checkpoint') + parser.add_argument('--max-length', type=int, default=256, + help='The max length of input text') + parser.add_argument('--audio-url', type=str, default="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav", + help='The URL to the input audio file') + args = parser.parse_args() + main(args) diff --git a/python/llm/example/GPU/HuggingFace/Multimodal/voiceassistant/README.md b/python/llm/example/GPU/HuggingFace/Multimodal/voiceassistant/README.md index 67c0fb26249..7dea109b078 100644 --- a/python/llm/example/GPU/HuggingFace/Multimodal/voiceassistant/README.md +++ b/python/llm/example/GPU/HuggingFace/Multimodal/voiceassistant/README.md @@ -17,6 +17,7 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +pip install transformers==4.36.2 pip install librosa soundfile datasets pip install accelerate pip install SpeechRecognition sentencepiece colorama @@ -33,6 +34,7 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +pip install transformers==4.36.2 pip install librosa soundfile datasets pip install accelerate pip install SpeechRecognition sentencepiece colorama diff --git a/python/llm/example/GPU/HuggingFace/Multimodal/whisper/readme.md b/python/llm/example/GPU/HuggingFace/Multimodal/whisper/readme.md index 29a4dc4619c..ac664fb0a36 100644 --- 
a/python/llm/example/GPU/HuggingFace/Multimodal/whisper/readme.md +++ b/python/llm/example/GPU/HuggingFace/Multimodal/whisper/readme.md @@ -16,6 +16,7 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +pip install transformers==4.36.2 pip install datasets soundfile librosa # required by audio processing ``` @@ -28,6 +29,7 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +pip install transformers==4.36.2 pip install datasets soundfile librosa # required by audio processing ``` diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py index 61916fff9b7..c1df15db3be 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py @@ -144,6 +144,14 @@ def train( prompter = Prompter(prompt_template_name) + if deepspeed is not None and "zero3" in deepspeed: + from ipex_llm.transformers.utils \ + import _constant_buffered_norm2 + from ipex_llm.llm_patching import replace_attr + import deepspeed as ds + replace_attr(ds.runtime.zero.stage3.DeepSpeedZeroOptimizer_Stage3, + "_constant_buffered_norm2", _constant_buffered_norm2) + device_map = "auto" world_size = int(os.environ.get("WORLD_SIZE", 1)) ddp = world_size != 1 @@ -161,7 +169,7 @@ def train( optimize_model=False, torch_dtype=torch.bfloat16, modules_to_not_convert=["lm_head"], - trust_remote_code=True, + trust_remote_code=True ) else: # According to the QLoRA paper, using "nf4" could yield better model quality than "int4" @@ -186,9 +194,10 @@ def train( # # device_map=device_map, # modules_to_not_convert=["lm_head"], # ) - print(f"Model loaded on rank {os.environ.get('LOCAL_RANK')}") - model = model.to(f'xpu:{os.environ.get("LOCAL_RANK", 0)}') - print(f"Model moved to rank {os.environ.get('LOCAL_RANK')}") + if deepspeed is not None and not "zero3" in deepspeed: + print(f"Model loaded on rank {os.environ.get('LOCAL_RANK')}") + model = model.to(f'xpu:{os.environ.get("LOCAL_RANK", 0)}') + print(f"Model moved to rank {os.environ.get('LOCAL_RANK')}") tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True) print(f"Tokenizer loaded on rank {os.environ.get('LOCAL_RANK')}") diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/deepspeed_zero3.json b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/deepspeed_zero3.json new file mode 100644 index 00000000000..7ee8a787c0b --- /dev/null +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/deepspeed_zero3.json @@ -0,0 +1,15 @@ +{ + "zero_optimization": { + "stage": 3, + "contiguous_gradients": true, + "overlap_comm": true, + "offload_optimizer": {"device": "cpu"} + }, + "bf16": { + "enabled": true + }, + "world_size": 2, + "train_batch_size": 32, + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 8 +} diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arc_2_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arc_2_card.sh new file mode 100644 index 
00000000000..ba5a11b03b0 --- /dev/null +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arc_2_card.sh @@ -0,0 +1,41 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +export MASTER_ADDR=127.0.0.1 +export MASTER_PORT=29503 +export FI_PROVIDER=tcp +export CCL_ATL_TRANSPORT=ofi +export CCL_ZE_IPC_EXCHANGE=sockets +export UR_L0_IN_ORDER_BARRIER_BY_SIGNAL=0 +basekit_root=/opt/intel/oneapi +source $basekit_root/setvars.sh --force +source $basekit_root/ccl/latest/env/vars.sh --force + +NUM_GPUS=2 # number of used GPU +export USE_XETLA=OFF +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2 +export TORCH_LLM_ALLREDUCE=0 # Different from PVC +export DS_SKIP_CUDA_CHECK=1 + +mpirun -n $NUM_GPUS \ + python -u ./alpaca_qlora_finetuning.py \ + --base_model "meta-llama/Llama-2-13b-hf" \ + --data_path "yahma/alpaca-cleaned" \ + --output_dir "./ipex-llm-qlora-alpaca" \ + --gradient_checkpointing True \ + --micro_batch_size 2 \ + --batch_size 32 \ + --deepspeed ./deepspeed_zero3.json diff --git a/python/llm/example/GPU/Lightweight-Serving/README.md b/python/llm/example/GPU/Lightweight-Serving/README.md index 4cb29db1efc..c21aa880bfd 100644 --- a/python/llm/example/GPU/Lightweight-Serving/README.md +++ b/python/llm/example/GPU/Lightweight-Serving/README.md @@ -22,6 +22,10 @@ conda install -c conda-forge -y gperftools=2.10 # to enable tcmalloc # for internlm-xcomposer2-vl-7b pip install transformers==4.31.0 pip install accelerate timm==0.4.12 sentencepiece==0.1.99 gradio==3.44.4 markdown2==2.4.10 xlsxwriter==3.1.2 einops + +# for whisper-large-v3 +pip install transformers==4.36.2 +pip install datasets soundfile librosa # required by audio processing ``` #### 1.2 Installation on Windows @@ -35,6 +39,14 @@ pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-exte pip install fastapi uvicorn openai pip install gradio # for gradio web UI conda install -c conda-forge -y gperftools=2.10 # to enable tcmalloc + +# for internlm-xcomposer2-vl-7b +pip install transformers==4.31.0 +pip install accelerate timm==0.4.12 sentencepiece==0.1.99 gradio==3.44.4 markdown2==2.4.10 xlsxwriter==3.1.2 einops + +# for whisper-large-v3 +pip install transformers==4.36.2 +pip install datasets soundfile librosa # required by audio processing ``` ### 2. Configures OneAPI environment variables for Linux @@ -180,7 +192,7 @@ curl http://localhost:8000/v1/chat/completions \ image input only supports [internlm-xcomposer2-vl-7b](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) now, and it must install transformers==4.31.0 to run. 
```bash -wget -O ./test.jpg http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg +wget -O /llm/lightweight_serving/test.jpg http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg curl http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ @@ -219,6 +231,17 @@ curl http://localhost:8000/v1/completions \ }' ``` +#### v1/audio/transcriptions + +ASR only supports [whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) now, and `whisper-large-v3` can only be used for audio transcription. The audio file type should be supported by `librosa.load`. +```bash +curl http://localhost:8000/v1/audio/transcriptions \ + -H "Content-Type: multipart/form-data" \ + -F file="@/llm/test.mp3" \ + -F model="whisper-large-v3" \ + -F language="zh" +``` + ### 6. Benchmark with wrk Please refer to [here](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Pipeline-Parallel-Serving#4-benchmark-with-wrk) for more details diff --git a/python/llm/example/GPU/Lightweight-Serving/lightweight_serving.py b/python/llm/example/GPU/Lightweight-Serving/lightweight_serving.py index 003307a198f..ce579213553 100644 --- a/python/llm/example/GPU/Lightweight-Serving/lightweight_serving.py +++ b/python/llm/example/GPU/Lightweight-Serving/lightweight_serving.py @@ -39,12 +39,19 @@ async def main(): model_path = args.repo_id_or_model_path low_bit = args.low_bit - local_model = ModelWorker(model_path, low_bit) - # Load tokenizer - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, padding_side='left') - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - myapp = FastApp(local_model, tokenizer) + processor = None + if "whisper" not in model_path.lower(): + local_model = ModelWorker(model_path, low_bit) + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, padding_side='left') + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + else: + local_model = ModelWorker(model_path, low_bit, "audio", torch_dtype=torch.float32) + from transformers import WhisperProcessor + processor = WhisperProcessor.from_pretrained(model_path) + tokenizer = processor.tokenizer + myapp = FastApp(local_model, tokenizer, processor) config = uvicorn.Config(app=myapp.app, host="0.0.0.0", port=args.port) server = uvicorn.Server(config) await server.serve() diff --git a/python/llm/example/GPU/LlamaIndex/README.md b/python/llm/example/GPU/LlamaIndex/README.md index 53d5d7ddb69..a56ed793bba 100644 --- a/python/llm/example/GPU/LlamaIndex/README.md +++ b/python/llm/example/GPU/LlamaIndex/README.md @@ -8,17 +8,31 @@ This folder contains examples showcasing how to use [**LlamaIndex**](https://git ## Retrieval-Augmented Generation (RAG) Example The RAG example ([rag.py](./rag.py)) is adapted from the [Official llama index RAG example](https://docs.llamaindex.ai/en/stable/examples/low_level/oss_ingestion_retrieval.html). This example builds a pipeline to ingest data (e.g. llama2 paper in pdf format) into a vector database (e.g. PostgreSQL), and then build a retrieval pipeline from that vector database. +### 1. Install Prerequisites +To benefit from IPEX-LLM on Intel GPUs, there are several prerequisite steps for tools installation and environment preparation. -### 1. 
Setting up Dependencies +If you are a Windows user, visit the [Install IPEX-LLM on Windows with Intel GPU Guide](../../../../../docs/mddocs/Quickstart/install_windows_gpu.md), and follow [Install Prerequisites](../../../../../docs/mddocs/Quickstart/install_windows_gpu.md#install-prerequisites) to update GPU driver (optional) and install Conda. + +If you are a Linux user, visit the [Install IPEX-LLM on Linux with Intel GPU](../../../../../docs/mddocs/Quickstart/install_linux_gpu.md), and follow [Install Prerequisites](../../../../../docs/mddocs/Quickstart/install_linux_gpu.md#install-prerequisites) to install GPU driver, Intel® oneAPI Base Toolkit 2024.0, and Conda. + + +### 2. Setting up Dependencies +* **Install LlamaIndex Packages** + ```bash + conda activate + pip install llama-index-llms-ipex-llm[xpu]==0.1.8 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + pip install llama-index-embeddings-ipex-llm[xpu]==0.1.5 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + pip install llama-index-readers-file==0.1.33 + pip install llama-index-vector-stores-postgres==0.1.14 + pip install pymupdf ``` -* **Install IPEX-LLM** - - Follow the instructions in [GPU Install Guide](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install.html) to install ipex-llm. +> [!NOTE] +> - You could refer to [llama-index-llms-ipex-llm](https://docs.llamaindex.ai/en/stable/examples/llm/ipex_llm_gpu/) and [llama-index-embeddings-ipex-llm](https://docs.llamaindex.ai/en/stable/examples/embeddings/ipex_llm_gpu/) for more information. +> - The installation of `llama-index-llms-ipex-llm` or `llama-index-embeddings-ipex-llm` will also install `IPEX-LLM` and its dependencies. +> - You can also use `https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/` as the `extra-index-url`. +> - `IpexLLMEmbedding` currently only provides optimization for Hugging Face BGE models. * **Database Setup (using PostgreSQL)**: * Linux @@ -71,7 +85,7 @@ The RAG example ([rag.py](./rag.py)) is adapted from the [Official llama index R wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf" ``` -### 2. Configures OneAPI environment variables for Linux +### 3. Configures OneAPI environment variables for Linux > [!NOTE] > Skip this step if you are running on Windows. @@ -82,9 +96,9 @@ This is a required step on Linux for APT or offline installed oneAPI. Skip this source /opt/intel/oneapi/setvars.sh ``` -### 3. Runtime Configurations +### 4. Runtime Configurations For optimal performance, it is recommended to set several environment variables. Please check out the suggestions based on your device. -#### 3.1 Configurations for Linux +#### 4.1 Configurations for Linux
For Intel Arc™ A-Series Graphics and Intel Data Center GPU Flex Series @@ -121,7 +135,7 @@ export BIGDL_LLM_XMX_DISABLED=1
-#### 3.2 Configurations for Windows +#### 4.2 Configurations for Windows
For Intel iGPU @@ -147,7 +161,7 @@ set SYCL_CACHE_PERSISTENT=1 > For the first time that each model runs on Intel iGPU/Intel Arc™ A300-Series or Pro A60, it may take several minutes to compile. -### 4. Running the RAG example +### 5. Running the RAG example In the current directory, run the example with command: @@ -164,7 +178,7 @@ python rag.py -m -t - `-n N_PREDICT`: max predict tokens - `-t TOKENIZER_PATH`: **Required**, path to the tokenizer model -### 5. Example Output +### 6. Example Output A query such as **"How does Llama 2 compare to other open-source models?"** with the Llama2 paper as the data source, using the `Llama-2-7b-chat-hf` model, will produce the output like below: @@ -178,6 +192,6 @@ However, it's important to note that the performance of Llama 2 can vary dependi In conclusion, while Llama 2 performs well on most benchmarks compared to other open-source models, its performance ``` -### 6. Trouble shooting -#### 6.1 Core dump +### 7. Trouble shooting +#### 7.1 Core dump If you encounter a core dump error in your Python code, it is crucial to verify that the `import torch` statement is placed at the top of your Python file, just as what we did in `rag.py`. \ No newline at end of file diff --git a/python/llm/example/GPU/LlamaIndex/rag.py b/python/llm/example/GPU/LlamaIndex/rag.py index fef3204702e..37f2d0c2e31 100644 --- a/python/llm/example/GPU/LlamaIndex/rag.py +++ b/python/llm/example/GPU/LlamaIndex/rag.py @@ -15,7 +15,6 @@ # import torch -from llama_index.embeddings.huggingface import HuggingFaceEmbedding from sqlalchemy import make_url from llama_index.vector_stores.postgres import PGVectorStore # from llama_index.llms.llama_cpp import LlamaCPP @@ -160,10 +159,11 @@ def messages_to_prompt(messages): return prompt def main(args): - embed_model = HuggingFaceEmbedding(model_name=args.embedding_model_path) + from llama_index.embeddings.ipex_llm import IpexLLMEmbedding + embed_model = IpexLLMEmbedding(model_name=args.embedding_model_path, device="xpu") # Use custom LLM in BigDL - from ipex_llm.llamaindex.llms import IpexLLM + from llama_index.llms.ipex_llm import IpexLLM llm = IpexLLM.from_model_id( model_name=args.model_path, tokenizer_name=args.tokenizer_path, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/codegeex2/README.md b/python/llm/example/GPU/PyTorch-Models/Model/codegeex2/README.md index 37f801a28bf..6dd5e0799b5 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/codegeex2/README.md +++ b/python/llm/example/GPU/PyTorch-Models/Model/codegeex2/README.md @@ -16,7 +16,6 @@ conda create -n llm python=3.11 conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -pip install transformers==4.31.0 ``` #### 1.2 Installation on Windows @@ -27,10 +26,20 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -pip install transformers==4.31.0 ``` -### 2. Configures OneAPI environment variables for Linux +### 2. 
Download Model and Replace File (optional) +If you select the codegeex2-6b model ([THUDM/codegeex2-6b](https://huggingface.co/THUDM/codegeex2-6b)), please note that its code (`tokenization_chatglm.py`) initializes the tokenizer after calling `__init__` of its parent class, which may result in an error when loading the tokenizer. To address this issue, we have provided an updated file ([tokenization_chatglm.py](./codegeex2-6b/tokenization_chatglm.py)): + +```python +def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.tokenizer = SPTokenizer(vocab_file) + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) +``` + +You could download the model from [THUDM/codegeex2-6b](https://huggingface.co/THUDM/codegeex2-6b), and replace the file `tokenization_chatglm.py` with [tokenization_chatglm.py](./codegeex2-6b/tokenization_chatglm.py). + +### 3. Configures OneAPI environment variables for Linux > [!NOTE] > Skip this step if you are running on Windows. @@ -41,7 +50,7 @@ This is a required step on Linux for APT or offline installed oneAPI. Skip this source /opt/intel/oneapi/setvars.sh ``` -### 3. Runtime Configurations +### 4. Runtime Configurations For optimal performance, it is recommended to set several environment variables. Please check out the suggestions based on your device. #### 3.1 Configurations for Linux
@@ -105,7 +114,7 @@ set SYCL_CACHE_PERSISTENT=1 > [!NOTE] > For the first time that each model runs on Intel iGPU/Intel Arc™ A300-Series or Pro A60, it may take several minutes to compile. -### 4. Running examples +### 5. Running examples ``` python ./generate.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --prompt PROMPT --n-predict N_PREDICT ``` diff --git a/python/llm/example/GPU/PyTorch-Models/Model/codegeex2/codegeex2-6b/tokenization_chatglm.py b/python/llm/example/GPU/PyTorch-Models/Model/codegeex2/codegeex2-6b/tokenization_chatglm.py new file mode 100644 index 00000000000..3759619ae80 --- /dev/null +++ b/python/llm/example/GPU/PyTorch-Models/Model/codegeex2/codegeex2-6b/tokenization_chatglm.py @@ -0,0 +1,289 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# =========================================================================== +# +# This file is adapted from +# https://huggingface.co/THUDM/codegeex2-6b/blob/ee1e7db429e587645bd3f0f4c3f5d8e6e843f2f6/tokenization_chatglm.py +# +# Apache 2.0 license +# https://huggingface.co/THUDM/codegeex2-6b/blob/main/LICENSE + +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + return self.sp_model.decode(t) + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens or index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.tokenizer = SPTokenizer(vocab_file) + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_prompt(self, query, history=None): + if history is None: + history = [] + prompt = "" + for i, (old_query, response) in enumerate(history): + prompt += "[Round {}]\n\n问:{}\n\n答:{}\n\n".format(i + 1, old_query, response) + prompt += "[Round {}]\n\n问:{}\n\n答:".format(len(history) + 1, query) + return prompt + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). 
+ return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + # assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if self.padding_side == "left": + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + else: + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = encoded_inputs["position_ids"] + [0] * difference + encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference + + return encoded_inputs diff --git a/python/llm/example/GPU/PyTorch-Models/Model/codellama/README.md b/python/llm/example/GPU/PyTorch-Models/Model/codellama/README.md index 497a6828b24..ff68817eca4 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/codellama/README.md +++ b/python/llm/example/GPU/PyTorch-Models/Model/codellama/README.md @@ -14,8 +14,6 @@ conda create -n llm python=3.11 conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - -pip install transformers==4.34.1 # CodeLlamaTokenizer is supported in higher version of transformers ``` #### 1.2 Installation on Windows @@ -26,8 +24,6 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - -pip install transformers==4.34.1 # CodeLlamaTokenizer is supported in higher version of transformers ``` ### 2. Configures OneAPI environment variables for Linux diff --git a/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py index d676666b827..04af3a0221e 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py @@ -50,7 +50,7 @@ # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
model = optimize_model(model) - model = model.to('xpu') + model = model.half().to('xpu') # Load tokenizer tokenizer = CodeLlamaTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/README.md b/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/README.md index ff8eab5ae09..a9e66f54732 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/README.md +++ b/python/llm/example/GPU/PyTorch-Models/Model/deciLM-7b/README.md @@ -14,8 +14,6 @@ conda create -n llm python=3.11 conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - -pip install transformers==4.35.2 # required by DeciLM-7B ``` #### 1.2 Installation on Windows diff --git a/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py index 516903171b1..e8098e49559 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py @@ -46,7 +46,7 @@ use_cache=True) model = optimize_model(model) - model = model.to('xpu') + model = model.half().to('xpu') # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path, diff --git a/python/llm/example/GPU/PyTorch-Models/Model/llava/README.md b/python/llm/example/GPU/PyTorch-Models/Model/llava/README.md index 461ae53a8dd..77e0f1cfd9c 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/llava/README.md +++ b/python/llm/example/GPU/PyTorch-Models/Model/llava/README.md @@ -16,7 +16,6 @@ conda activate llm pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ pip install einops # install dependencies required by llava -pip install transformers==4.36.2 git clone https://github.com/haotian-liu/LLaVA.git # clone the llava libary cp generate.py ./LLaVA/ # copy our example to the LLaVA folder @@ -34,7 +33,6 @@ conda activate llm pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ pip install einops # install dependencies required by llava -pip install transformers==4.36.2 git clone https://github.com/haotian-liu/LLaVA.git # clone the llava libary copy generate.py .\LLaVA\ # copy our example to the LLaVA folder diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mistral/README.md b/python/llm/example/GPU/PyTorch-Models/Model/mistral/README.md index 4fc017e1ba7..4f3e58b045c 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mistral/README.md +++ b/python/llm/example/GPU/PyTorch-Models/Model/mistral/README.md @@ -4,7 +4,6 @@ In this directory, you will find examples on how you could use IPEX-LLM `optimiz ## Requirements To run these examples with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information. -**Important: According to [Mistral Troubleshooting](https://huggingface.co/mistralai/Mistral-7B-v0.1#troubleshooting), please make sure you have installed `transformers==4.34.0` to run the example.** ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Mistral model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations on Intel GPUs. 
@@ -16,9 +15,6 @@ conda create -n llm python=3.11 conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - -# Refer to https://huggingface.co/mistralai/Mistral-7B-v0.1#troubleshooting, please make sure you are using a stable version of Transformers, 4.34.0 or newer. -pip install transformers==4.34.0 ``` #### 1.2 Installation on Windows @@ -29,9 +25,6 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - -# Refer to https://huggingface.co/mistralai/Mistral-7B-v0.1#troubleshooting, please make sure you are using a stable version of Transformers, 4.34.0 or newer. -pip install transformers==4.34.0 ``` ### 2. Configures OneAPI environment variables for Linux diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py index 0e717730edc..377e31569a0 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py @@ -49,7 +49,7 @@ # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) - model = model.to('xpu') + model = model.half().to('xpu') # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/openai-whisper/README.md b/python/llm/example/GPU/PyTorch-Models/Model/openai-whisper/README.md new file mode 100644 index 00000000000..5fdd8969b81 --- /dev/null +++ b/python/llm/example/GPU/PyTorch-Models/Model/openai-whisper/README.md @@ -0,0 +1,142 @@ +# Whisper + +In this directory, you will find examples of how to use IPEX-LLM to optimize OpenAI Whisper models within the `openai-whisper` Python library. For illustration purposes, we utilize the [whisper-tiny](https://github.com/openai/whisper/blob/main/model-card.md) as a reference Whisper model. + +## Requirements +To run these examples with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information. + +## Example: Recognize Tokens using `transcribe()` API +In the example [recognize.py](./recognize.py), we show a basic use case for a Whisper model to conduct transcription using `transcribe()` API, with IPEX-LLM INT4 optimizations on Intel GPUs. +### 1. 
Install +#### 1.1 Installation on Linux +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.11 +conda activate llm +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +pip install -U openai-whisper +pip install librosa # required by audio processing +``` + +#### 1.2 Installation on Windows +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.11 libuv +conda activate llm + +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +pip install -U openai-whisper +pip install librosa +``` + +### 2. Configures OneAPI environment variables for Linux + +> [!NOTE] +> Skip this step if you are running on Windows. + +This is a required step on Linux for APT or offline installed oneAPI. Skip this step for PIP-installed oneAPI. + +```bash +source /opt/intel/oneapi/setvars.sh +``` + +### 3. Runtime Configurations +For optimal performance, it is recommended to set several environment variables. Please check out the suggestions based on your device. +#### 3.1 Configurations for Linux +
+ +For Intel Arc™ A-Series Graphics and Intel Data Center GPU Flex Series + +```bash +export USE_XETLA=OFF +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_CACHE_PERSISTENT=1 +``` + +
+ +
+ +For Intel Data Center GPU Max Series + +```bash +export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_CACHE_PERSISTENT=1 +export ENABLE_SDP_FUSION=1 +``` +> Note: Please note that `libtcmalloc.so` can be installed by `conda install -c conda-forge -y gperftools=2.10`. +
+ +
+ +For Intel iGPU + +```bash +export SYCL_CACHE_PERSISTENT=1 +export BIGDL_LLM_XMX_DISABLED=1 +``` + +
+ +#### 3.2 Configurations for Windows +
+ +For Intel iGPU + +```cmd +set SYCL_CACHE_PERSISTENT=1 +set BIGDL_LLM_XMX_DISABLED=1 +``` + +
+ +
+ +For Intel Arc™ A-Series Graphics + +```cmd +set SYCL_CACHE_PERSISTENT=1 +``` + +
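+The `--model-name` argument described in the next section accepts any of the official Whisper model names. If you are unsure which names are valid, they can be listed with the `openai-whisper` package itself; this is an optional check and not part of the original example:
+
+```python
+import whisper
+
+# Prints the official model names accepted by `--model-name` (e.g. 'tiny', 'base', 'small', 'medium')
+print(whisper.available_models())
+```
+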
+ +> [!NOTE] +> For the first time that each model runs on Intel iGPU/Intel Arc™ A300-Series or Pro A60, it may take several minutes to compile. +### 4. Running examples + +```bash +python ./recognize.py --audio-file AUDIO_FILE +``` + +Arguments info: +- `--model-name MODEL_NAME`: argument defining the model name (tiny, medium, base, etc.) for the Whisper model to be downloaded. It is one of the official model names listed by `whisper.available_models()`, or path to a model checkpoint containing the model dimensions and the model state_dict. It is default to be `'tiny'`. +- `--audio-file AUDIO_FILE`: argument defining the path of the audio file to be recognized. +- `--language LANGUAGE`: argument defining the language to be transcribed. It is default to be `english`. + +> **Note**: When loading the model in 4-bit, IPEX-LLM converts linear layers in the model into INT4 format. In theory, a *X*B model saved in 16-bit will require approximately 2*X* GB of memory for loading, and ~0.5*X* GB memory for further inference. +> +> Please select the appropriate size of the Whisper model based on the capabilities of your machine. + +#### Sample Output +#### [whisper-tiny](https://github.com/openai/whisper/blob/main/model-card.md) + +For an audio file (.wav) downloaded from https://www.youtube.com/watch?v=-LIIf7E-qFI, the transcription should be similar to: +```log +[00:00.000 --> 00:10.000] I don't know who you are. +[00:10.000 --> 00:15.000] I don't know what you want. +[00:15.000 --> 00:21.000] If you're looking for ransom, I can tell you I don't know money, but what I do have. +[00:21.000 --> 00:24.000] I'm a very particular set of skills. +[00:24.000 --> 00:27.000] The skills I have acquired are very long career. +[00:27.000 --> 00:31.000] The skills that make me a nightmare for people like you. +[00:31.000 --> 00:35.000] If you let my daughter go now, that'll be the end of it. +[00:35.000 --> 00:39.000] I will not look for you. I will not pursue you. +[00:39.000 --> 00:45.000] But if you don't, I will look for you. I will find you. +[00:45.000 --> 00:48.000] And I will kill you. +[00:48.000 --> 00:53.000] Good luck. +Inference time: xxxx s +-------------------- Output -------------------- + I don't know who you are. I don't know what you want. If you're looking for ransom, I can tell you I don't know money, but what I do have. I'm a very particular set of skills. The skills I have acquired are very long career. The skills that make me a nightmare for people like you. If you let my daughter go now, that'll be the end of it. I will not look for you. I will not pursue you. But if you don't, I will look for you. I will find you. And I will kill you. Good luck. +``` diff --git a/python/llm/example/GPU/PyTorch-Models/Model/openai-whisper/recognize.py b/python/llm/example/GPU/PyTorch-Models/Model/openai-whisper/recognize.py new file mode 100644 index 00000000000..18c1b2e99e2 --- /dev/null +++ b/python/llm/example/GPU/PyTorch-Models/Model/openai-whisper/recognize.py @@ -0,0 +1,59 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + + +import whisper +import time +import librosa +import argparse +from ipex_llm import optimize_model + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Recognize Tokens using `transcribe()` API for Openai Whisper model') + parser.add_argument('--model-name', type=str, default="tiny", + help="The model name(tiny, medium, base, etc.) for the Whisper model to be downloaded." + "It is one of the official model names listed by `whisper.available_models()`, or" + "path to a model checkpoint containing the model dimensions and the model state_dict.") + parser.add_argument('--audio-file', type=str, required=True, + help='The path of the audio file to be recognized.') + parser.add_argument('--language', type=str, default="English", + help='language to be transcribed') + args = parser.parse_args() + + # Load the input audio + y, sr = librosa.load(args.audio_file) + + # Downsample the audio to 16kHz + target_sr = 16000 + audio = librosa.resample(y, + orig_sr=sr, + target_sr=target_sr) + + # Load whisper model under pytorch framework + model = whisper.load_model(args.model_name) + + # With only one line to enable IPEX-LLM optimize on a pytorch model + model = optimize_model(model) + + model = model.to('xpu') + + st = time.time() + result = model.transcribe(audio, verbose=True, language=args.language) + end = time.time() + print(f'Inference time: {end-st} s') + + print('-'*20, 'Output', '-'*20) + print(result["text"]) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/README.md b/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/README.md index 5f9a617aaa3..c480c545366 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/README.md +++ b/python/llm/example/GPU/PyTorch-Models/Model/qwen-vl/README.md @@ -15,6 +15,7 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +pip install "transformers<4.37.0" pip install accelerate tiktoken einops transformers_stream_generator==0.0.4 scipy torchvision pillow tensorboard matplotlib # additional package required for Qwen-VL-Chat to conduct generation ``` @@ -27,6 +28,7 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +pip install "transformers<4.37.0" pip install accelerate tiktoken einops transformers_stream_generator==0.0.4 scipy torchvision pillow tensorboard matplotlib # additional package required for Qwen-VL-Chat to conduct generation ``` diff --git a/python/llm/example/GPU/PyTorch-Models/Model/replit/README.md b/python/llm/example/GPU/PyTorch-Models/Model/replit/README.md index 4938682aea2..3bfbf245655 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/replit/README.md +++ b/python/llm/example/GPU/PyTorch-Models/Model/replit/README.md @@ -15,7 +15,7 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -pip install "transformers<4.35" +pip install transformers<=4.33.3 ``` #### 1.2 Installation on Windows @@ -26,6 +26,8 @@ conda activate llm # below command will install 
intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +pip install transformers<=4.33.3 ``` ### 2. Configures OneAPI environment variables for Linux diff --git a/python/llm/example/GPU/PyTorch-Models/Model/solar/README.md b/python/llm/example/GPU/PyTorch-Models/Model/solar/README.md index 2b718cd4a6a..4d157d19bf3 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/solar/README.md +++ b/python/llm/example/GPU/PyTorch-Models/Model/solar/README.md @@ -14,8 +14,6 @@ conda create -n llm python=3.11 conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - -pip install transformers==4.35.2 # required by SOLAR ``` #### 1.2 Installation on Windows @@ -26,8 +24,6 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - -pip install transformers==4.35.2 # required by SOLAR ``` ### 2. Configures OneAPI environment variables for Linux diff --git a/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py index b612bc2fc7f..b328b51e133 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py @@ -49,7 +49,7 @@ # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. model = optimize_model(model) - model = model.to('xpu') + model = model.half().to('xpu') # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/README.md b/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/README.md index 171ff392422..fd487a38dae 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/README.md +++ b/python/llm/example/GPU/PyTorch-Models/Model/speech-t5/README.md @@ -15,7 +15,7 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -pip install "datasets<2.18" soundfile # additional package required for SpeechT5 to conduct generation +pip install "datasets==2.16.1" soundfile # additional package required for SpeechT5 to conduct generation ``` #### 1.2 Installation on Windows @@ -27,7 +27,7 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -pip install "datasets<2.18" soundfile # additional package required for SpeechT5 to conduct generation +pip install "datasets==2.16.1" soundfile # additional package required for SpeechT5 to conduct generation ``` ### 2. 
Configures OneAPI environment variables for Linux
diff --git a/python/llm/example/GPU/PyTorch-Models/Model/yi/README.md b/python/llm/example/GPU/PyTorch-Models/Model/yi/README.md
index b48b95325c3..2b500175575 100644
--- a/python/llm/example/GPU/PyTorch-Models/Model/yi/README.md
+++ b/python/llm/example/GPU/PyTorch-Models/Model/yi/README.md
@@ -1,5 +1,5 @@
# Yi
-In this directory, you will find examples on how you could use IPEX-LLM `optimize_model` API on Yi models on [Intel GPUs](../../../README.md). For illustration purposes, we utilize the [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) as a reference Yi model.
+In this directory, you will find examples on how you could use IPEX-LLM `optimize_model` API on Yi models on [Intel GPUs](../../../README.md). For illustration purposes, we utilize the [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) and [01-ai/Yi-6B-Chat](https://huggingface.co/01-ai/Yi-6B-Chat) as reference Yi models.
## 0. Requirements
To run these examples with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information.
@@ -112,7 +112,7 @@ python ./generate.py
In the example, several arguments can be passed to satisfy your requirements:
-- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Yi model (e.g. `01-ai/Yi-6B`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'01-ai/Yi-6B'`.
+- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Yi model (e.g. `01-ai/Yi-6B` and `01-ai/Yi-6B-Chat`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'01-ai/Yi-6B-Chat'`.
- `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'AI是什么?'`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
@@ -127,3 +127,13 @@ AI是什么?
AI是什么?
人工智能(Artificial Intelligence),英文缩写为AI。它是研究、开发用于模拟、延伸和扩展人的智能的理论、方法、技术及
```
+
+#### [01-ai/Yi-6B-Chat](https://huggingface.co/01-ai/Yi-6B-Chat)
+```log
+Inference time: xxxx s
+-------------------- Prompt --------------------
+AI是什么?
+-------------------- Output --------------------
+AI是什么?
+人工智能(Artificial Intelligence, AI)是计算机科学的一个分支,它研究如何让计算机模拟人类的智能行为。人工智能可以通过模仿人类的思维过程和 +``` \ No newline at end of file diff --git a/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py index 31256cda112..871f5f4fbd1 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/yi/generate.py @@ -26,7 +26,7 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Yi model') - parser.add_argument('--repo-id-or-model-path', type=str, default="01-ai/Yi-6B", + parser.add_argument('--repo-id-or-model-path', type=str, default="01-ai/Yi-6B-Chat", help='The huggingface repo id for the Yi model to be downloaded' ', or the path to the huggingface checkpoint folder') parser.add_argument('--prompt', type=str, default="AI是什么?", diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md index 9e75a374d08..efc5aaf2468 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md @@ -1,5 +1,5 @@ -# Run Large Language Model on Intel NPU -In this directory, you will find examples on how you could apply IPEX-LLM INT4 or INT8 optimizations on LLM models on [Intel NPUs](../../../README.md). See the table blow for verified models. +# Run HuggingFace `transformers` Models on Intel NPU +In this directory, you will find examples on how to directly run HuggingFace `transformers` models on Intel NPUs (leveraging *Intel NPU Acceleration Library*). See the table blow for verified models. ## Verified Models @@ -9,7 +9,7 @@ In this directory, you will find examples on how you could apply IPEX-LLM INT4 o | Llama3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | | Chatglm3 | [THUDM/chatglm3-6b](https://huggingface.co/THUDM/chatglm3-6b) | | Chatglm2 | [THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b) | -| Qwen2 | [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | +| Qwen2 | [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct), [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) | | MiniCPM | [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) | | Phi-3 | [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) | | Stablelm | [stabilityai/stablelm-zephyr-3b](https://huggingface.co/stabilityai/stablelm-zephyr-3b) | @@ -23,27 +23,20 @@ Go to https://www.intel.com/content/www/us/en/download/794734/intel-npu-driver-w Then go to **Device Manager**, find **Neural Processors** -> **Intel(R) AI Boost**. Right click and select **Update Driver**. And then manually select the folder unzipped from the driver. -## Example: Predict Tokens using `generate()` API -In the example [generate.py](./generate.py), we show a basic use case for a Llama2 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations on Intel NPUs. -### 1. Install -#### 1.1 Installation on Windows +## 1. 
Install +### 1.1 Installation on Windows We suggest using conda to manage environment: ```bash -conda create -n llm python=3.10 libuv +conda create -n llm python=3.10 conda activate llm -# below command will install intel_extension_for_pytorch==2.1.10+xpu as default -pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - -# below command will install intel_npu_acceleration_library -pip install intel-npu-acceleration-library==1.3 - -pip install transformers==4.40 +# install ipex-llm with 'npu' option +pip install --pre --upgrade ipex-llm[npu] ``` -### 2. Runtime Configurations +## 2. Runtime Configurations For optimal performance, it is recommended to set several environment variables. Please check out the suggestions based on your device. -#### 2.1 Configurations for Windows +### 2.1 Configurations for Windows > [!NOTE] > For optimal performance, we recommend running code in `conhost` rather than Windows Terminal: @@ -59,19 +52,20 @@ For optimal performance, it is recommended to set several environment variables. set BIGDL_USE_NPU=1 ``` -### 3. Running examples +## 3. Run Models +In the example [generate.py](./generate.py), we show a basic use case for a Llama2 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations on Intel NPUs. ``` python ./generate.py ``` Arguments info: -- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Llama2 model (e.g. `meta-llama/Llama-2-7b-chat-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'meta-llama/Llama-2-7b-chat-hf'`, and more verified models please see the list in [Verified Models](#verified-models). +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Llama2 model (e.g. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'meta-llama/Llama-2-7b-chat-hf'`, and more verified models please see the list in [Verified Models](#verified-models). - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun'`. - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. - `--load_in_low_bit`: argument defining the `load_in_low_bit` format used. It is default to be `sym_int8`, `sym_int4` can also be used. -#### Sample Output +### Sample Output #### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) ```log @@ -81,3 +75,95 @@ Inference time: xxxx s -------------------------------------------------------------------------------- done ``` + +## 4. 
Run Optimized Models (Experimental)
+The examples below show how to run the **_optimized HuggingFace model implementations_** on Intel NPU, including:
+- [Llama2-7B](./llama.py)
+- [Llama3-8B](./llama.py)
+- [Qwen2-1.5B](./qwen2.py)
+- [Qwen2-7B](./qwen2.py)
+- [MiniCPM-1B](./minicpm.py)
+- [MiniCPM-2B](./minicpm.py)
+- [Baichuan2-7B](./baichuan2.py)
+
+### Recommended NPU Driver Version for LNL Users
+#### 32.0.100.2625
+Supported models: Llama2-7B, Qwen2-1.5B, Qwen2-7B, MiniCPM-1B, Baichuan2-7B
+#### 32.0.101.2715
+Supported models: Llama3-8B, MiniCPM-2B
+
+### Run
+```bash
+# to run Llama-2-7b-chat-hf
+python llama.py
+
+# to run Meta-Llama-3-8B-Instruct (LNL driver version: 32.0.101.2715)
+python llama.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct
+
+# to run Qwen2-1.5B-Instruct
+python qwen2.py
+
+# to run Qwen2-7B-Instruct
+python qwen2.py --repo-id-or-model-path Qwen/Qwen2-7B-Instruct
+
+# to run MiniCPM-1B-sft-bf16
+python minicpm.py
+
+# to run MiniCPM-2B-sft-bf16 (LNL driver version: 32.0.101.2715)
+python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16
+
+# to run Baichuan2-7B-Chat
+python baichuan2.py
+```
+
+Arguments info:
+- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Llama2 model (e.g. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder. It defaults to `'meta-llama/Llama-2-7b-chat-hf'`.
+- `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). It defaults to `What is AI?`.
+- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`.
+- `--max-output-len MAX_OUTPUT_LEN`: Defines the maximum sequence length for both input and output tokens. It defaults to `1024`.
+- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It defaults to `512`.
+- `--disable-transpose-value-cache`: Disables the optimization of transposing the value cache.
+
+### Troubleshooting
+
+#### Output Problem
+If you encounter output problems, please try disabling the optimization of transposing the value cache with the following commands:
+```bash
+# to run Llama-2-7b-chat-hf
+python llama.py --disable-transpose-value-cache
+
+# to run Meta-Llama-3-8B-Instruct (LNL driver version: 32.0.101.2715)
+python llama.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --disable-transpose-value-cache
+
+# to run Qwen2-1.5B-Instruct
+python qwen2.py --disable-transpose-value-cache
+
+# to run MiniCPM-1B-sft-bf16
+python minicpm.py --disable-transpose-value-cache
+
+# to run MiniCPM-2B-sft-bf16 (LNL driver version: 32.0.101.2715)
+python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --disable-transpose-value-cache
+```
+
+#### Better Performance with High CPU Utilization
+You could obtain better performance by setting the environment variable `IPEX_LLM_CPU_LM_HEAD=1` (e.g. with `set IPEX_LLM_CPU_LM_HEAD=1`), but this will cause high CPU utilization.
+
+
+### Sample Output
+#### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
+
+```log
+Inference time: xxxx s
+-------------------- Input --------------------
+ [INST] <>
+
+<>
+
+What is AI? [/INST]
+-------------------- Output --------------------
+ [INST] <>
+
+<>
+
+What is AI?
[/INST] AI (Artificial Intelligence) is a field of computer science and engineering that focuses on the development of intelligent machines that can perform tasks +``` diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py new file mode 100644 index 00000000000..f3f4cb109f6 --- /dev/null +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py @@ -0,0 +1,107 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import torch +import time +import argparse + +from ipex_llm.transformers.npu_model import AutoModelForCausalLM +from transformers import AutoTokenizer + +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +def get_prompt(message: str, chat_history: list[tuple[str, str]], + system_prompt: str) -> str: + texts = [f'[INST] <>\n{system_prompt}\n<>\n\n'] + # The first user input is _not_ stripped + do_strip = False + for user_input, response in chat_history: + user_input = user_input.strip() if do_strip else user_input + do_strip = True + texts.append(f'{user_input} [/INST] {response.strip()} [INST] ') + message = message.strip() if do_strip else message + texts.append(f'{message} [/INST]') + return ''.join(texts) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Predict Tokens using `generate()` API for npu model" + ) + parser.add_argument( + "--repo-id-or-model-path", + type=str, + default="baichuan-inc/Baichuan2-7B-Chat", + help="The huggingface repo id for the Baichuan2 model to be downloaded" + ", or the path to the huggingface checkpoint folder", + ) + parser.add_argument('--prompt', type=str, default="What is AI?", + help='Prompt to infer') + parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") + parser.add_argument("--max-output-len", type=int, default=1024) + parser.add_argument("--max-prompt-len", type=int, default=512) + parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument("--intra-pp", type=int, default=2) + parser.add_argument("--inter-pp", type=int, default=2) + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + model = AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + attn_implementation="eager", + load_in_low_bit="sym_int4", + optimize_model=True, + max_output_len=args.max_output_len, + max_prompt_len=args.max_prompt_len, + intra_pp=args.intra_pp, + inter_pp=args.inter_pp, + transpose_value_cache=not args.disable_transpose_value_cache, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + DEFAULT_SYSTEM_PROMPT = """\ + """ + + print("-" * 80) + print("done") + with torch.inference_mode(): + print("finish to load") + for i in range(5): + prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT) + _input_ids = 
tokenizer.encode(prompt, return_tensors="pt") + print("input length:", len(_input_ids[0])) + st = time.time() + output = model.generate( + _input_ids, num_beams=1, do_sample=False, max_new_tokens=args.n_predict + ) + end = time.time() + print(f"Inference time: {end-st} s") + input_str = tokenizer.decode(_input_ids[0], skip_special_tokens=False) + print("-" * 20, "Input", "-" * 20) + print(input_str) + output_str = tokenizer.decode(output[0], skip_special_tokens=False) + print("-" * 20, "Output", "-" * 20) + print(output_str) + + print("-" * 80) + print("done") + print("success shut down") diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py index 4a9a25ef92d..a3536ccc5e6 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py @@ -24,7 +24,7 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for npu model') - parser.add_argument('--repo-id-or-model-path', type=str, default="D:\llm-models\Llama-2-7b-chat-hf", + parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf", help='The huggingface repo id for the Llama2 model to be downloaded' ', or the path to the huggingface checkpoint folder') parser.add_argument('--prompt', type=str, default="Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun", @@ -40,7 +40,8 @@ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, - load_in_low_bit=args.load_in_low_bit) + load_in_low_bit=args.load_in_low_bit, + attn_implementation="eager") print(model) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama.py new file mode 100644 index 00000000000..a808a551d5b --- /dev/null +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama.py @@ -0,0 +1,107 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os +import torch +import time +import argparse + +from ipex_llm.transformers.npu_model import AutoModelForCausalLM +from transformers import AutoTokenizer + +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +def get_prompt(message: str, chat_history: list[tuple[str, str]], + system_prompt: str) -> str: + texts = [f'[INST] <>\n{system_prompt}\n<>\n\n'] + # The first user input is _not_ stripped + do_strip = False + for user_input, response in chat_history: + user_input = user_input.strip() if do_strip else user_input + do_strip = True + texts.append(f'{user_input} [/INST] {response.strip()} [INST] ') + message = message.strip() if do_strip else message + texts.append(f'{message} [/INST]') + return ''.join(texts) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Predict Tokens using `generate()` API for npu model" + ) + parser.add_argument( + "--repo-id-or-model-path", + type=str, + default="meta-llama/Llama-2-7b-chat-hf", + help="The huggingface repo id for the Llama2 model to be downloaded" + ", or the path to the huggingface checkpoint folder", + ) + parser.add_argument('--prompt', type=str, default="What is AI?", + help='Prompt to infer') + parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") + parser.add_argument("--max-output-len", type=int, default=1024) + parser.add_argument("--max-prompt-len", type=int, default=512) + parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument("--intra-pp", type=int, default=2) + parser.add_argument("--inter-pp", type=int, default=2) + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + model = AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=torch.float16, + trust_remote_code=True, + attn_implementation="eager", + load_in_low_bit="sym_int4", + optimize_model=True, + max_output_len=args.max_output_len, + max_prompt_len=args.max_prompt_len, + intra_pp=args.intra_pp, + inter_pp=args.inter_pp, + transpose_value_cache=not args.disable_transpose_value_cache, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + DEFAULT_SYSTEM_PROMPT = """\ + """ + + print("-" * 80) + print("done") + with torch.inference_mode(): + print("finish to load") + for i in range(5): + prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT) + _input_ids = tokenizer.encode(prompt, return_tensors="pt") + print("input length:", len(_input_ids[0])) + st = time.time() + output = model.generate( + _input_ids, num_beams=1, do_sample=False, max_new_tokens=args.n_predict + ) + end = time.time() + print(f"Inference time: {end-st} s") + input_str = tokenizer.decode(_input_ids[0], skip_special_tokens=False) + print("-" * 20, "Input", "-" * 20) + print(input_str) + output_str = tokenizer.decode(output[0], skip_special_tokens=False) + print("-" * 20, "Output", "-" * 20) + print(output_str) + + print("-" * 80) + print("done") + print("success shut down") diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py new file mode 100644 index 00000000000..a0d92a38611 --- /dev/null +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py @@ -0,0 +1,91 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import torch +import time +import argparse + +from ipex_llm.transformers.npu_model import AutoModelForCausalLM +from transformers import AutoTokenizer + +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Predict Tokens using `generate()` API for npu model" + ) + parser.add_argument( + "--repo-id-or-model-path", + type=str, + default="openbmb/MiniCPM-1B-sft-bf16", + help="The huggingface repo id for the Llama2 model to be downloaded" + ", or the path to the huggingface checkpoint folder", + ) + parser.add_argument('--prompt', type=str, default="What is AI?", + help='Prompt to infer') + parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") + parser.add_argument("--max-output-len", type=int, default=1024) + parser.add_argument("--max-prompt-len", type=int, default=512) + parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument("--intra-pp", type=int, default=2) + parser.add_argument("--inter-pp", type=int, default=2) + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + model = AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=torch.float16, + trust_remote_code=True, + attn_implementation="eager", + load_in_low_bit="sym_int4", + optimize_model=True, + max_output_len=args.max_output_len, + max_prompt_len=args.max_prompt_len, + intra_pp=args.intra_pp, + inter_pp=args.inter_pp, + transpose_value_cache=not args.disable_transpose_value_cache, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + print("-" * 80) + print("done") + with torch.inference_mode(): + + print("finish to load") + for i in range(5): + _input_ids = tokenizer.encode("<用户>{}".format(args.prompt), return_tensors="pt") + print("input length:", len(_input_ids[0])) + st = time.time() + output = model.generate( + _input_ids, num_beams=1, do_sample=False, max_new_tokens=args.n_predict + ) + end = time.time() + print(f"Inference time: {end-st} s") + input_str = tokenizer.decode(_input_ids[0], skip_special_tokens=False) + print("-" * 20, "Input", "-" * 20) + print(input_str) + output_str = tokenizer.decode(output[0], skip_special_tokens=False) + print("-" * 20, "Output", "-" * 20) + print(output_str) + + print("-" * 80) + print("done") + print("success shut down") diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen2.py new file mode 100644 index 00000000000..2e4d195f80c --- /dev/null +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen2.py @@ -0,0 +1,95 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import torch +import time +import argparse + +from ipex_llm.transformers.npu_model import AutoModelForCausalLM +from transformers import AutoTokenizer + +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Predict Tokens using `generate()` API for npu model" + ) + parser.add_argument( + "--repo-id-or-model-path", + type=str, + default="Qwen/Qwen2-1.5B-Instruct", + help="The huggingface repo id for the Qwen2 model to be downloaded" + ", or the path to the huggingface checkpoint folder", + ) + parser.add_argument('--prompt', type=str, default="What is AI?", + help='Prompt to infer') + parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") + parser.add_argument("--max-output-len", type=int, default=1024) + parser.add_argument("--max-prompt-len", type=int, default=512) + parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument("--intra-pp", type=int, default=None) + parser.add_argument("--inter-pp", type=int, default=None) + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + model = AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=torch.float16, + trust_remote_code=True, + attn_implementation="eager", + load_in_low_bit="sym_int4", + optimize_model=True, + max_output_len=args.max_output_len, + max_prompt_len=args.max_prompt_len, + intra_pp=args.intra_pp, + inter_pp=args.inter_pp, + transpose_value_cache=not args.disable_transpose_value_cache, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + print("-" * 80) + print("done") + messages = [{"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": args.prompt}] + text = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + with torch.inference_mode(): + print("finish to load") + for i in range(3): + _input_ids = tokenizer([text], return_tensors="pt").input_ids + print("input length:", len(_input_ids[0])) + st = time.time() + output = model.generate( + _input_ids, num_beams=1, do_sample=False, max_new_tokens=args.n_predict + ) + end = time.time() + print(f"Inference time: {end-st} s") + input_str = tokenizer.decode(_input_ids[0], skip_special_tokens=False) + print("-" * 20, "Input", "-" * 20) + print(input_str) + output_str = tokenizer.decode(output[0], skip_special_tokens=False) + print("-" * 20, "Output", "-" * 20) + print(output_str) + + print("-" * 80) + print("done") + print("success shut down") diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md index 6d1c1884b18..faacc0ae8d8 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md @@ -6,6 +6,8 @@ In this directory, you will find examples on how you could apply IPEX-LLM INT4 o | Model | Model Link | 
|------------|----------------------------------------------------------------|
| Phi-3-Vision | [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) |
+| MiniCPM-Llama3-V-2_5 | [openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) |
+| MiniCPM-V-2_6 | [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) |
## 0. Requirements
To run these examples with IPEX-LLM on Intel NPUs, make sure to install the newest driver version of Intel NPU.
@@ -22,13 +24,12 @@ We suggest using conda to manage environment:
conda create -n llm python=3.10 libuv
conda activate llm
-# below command will install intel_extension_for_pytorch==2.1.10+xpu as default
-pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+# install ipex-llm with 'npu' option
+pip install --pre --upgrade ipex-llm[npu]
+pip install torchvision
-# below command will install intel_npu_acceleration_library
-pip install intel-npu-acceleration-library==1.3
-
-pip install transformers==4.40
+# [optional] for MiniCPM-V-2_6
+pip install timm torch==2.1.2 torchvision==0.16.2
```
### 2. Runtime Configurations
@@ -63,7 +64,7 @@ Arguments info:
- `--load_in_low_bit`: argument defining the `load_in_low_bit` format used. It is default to be `sym_int8`, `sym_int4` can also be used.
#### Sample Output
-#### [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)
+##### [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)
```log
Inference time: xxxx s
@@ -81,3 +82,38 @@ The sample input image is (which is fetched from [COCO dataset](https://cocodata
+
+## 4. Run Optimized Models (Experimental)
+The examples below show how to run the **_optimized HuggingFace model implementations_** on Intel NPU, including:
+- [MiniCPM-Llama3-V-2_5](./minicpm-llama3-v2.5.py)
+- [MiniCPM-V-2_6](./minicpm_v_2_6.py)
+
+### Run
+```bash
+# to run MiniCPM-Llama3-V-2_5
+python minicpm-llama3-v2.5.py
+
+# to run MiniCPM-V-2_6
+python minicpm_v_2_6.py
+```
+
+Arguments info:
+- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `openbmb/MiniCPM-Llama3-V-2_5`) to be downloaded, or the path to the huggingface checkpoint folder.
+- `--image-url-or-path IMAGE_URL_OR_PATH`: argument defining the image to be inferred. It defaults to 'http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg'.
+- `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). It defaults to `What is in the image?`.
+- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`.
+- `--max-output-len MAX_OUTPUT_LEN`: Defines the maximum sequence length for both input and output tokens. It defaults to `1024`.
+- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It defaults to `512`.
+- `--disable-transpose-value-cache`: Disables the optimization of transposing the value cache.
+
+#### Sample Output
+##### [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6)
+
+```log
+Inference time: xx.xx s
+-------------------- Input --------------------
+http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg
+-------------------- Prompt --------------------
+What is in this image?
+-------------------- Output -------------------- +The image features a young child holding and showing off a white teddy bear wearing a pink dress. The background includes some red flowers and a stone wall, suggesting an outdoor setting. +``` \ No newline at end of file diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py new file mode 100644 index 00000000000..86b417b2496 --- /dev/null +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py @@ -0,0 +1,104 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import torch +import time +import argparse + +from ipex_llm.transformers.npu_model import AutoModel, AutoModelForCausalLM +from transformers import AutoTokenizer +from transformers.utils import logging + +import requests +from PIL import Image + +logger = logging.get_logger(__name__) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Predict Tokens using `chat()` API for npu model" + ) + parser.add_argument( + "--repo-id-or-model-path", + type=str, + default="openbmb/MiniCPM-Llama3-V-2_5", + help="The huggingface repo id for the MiniCPM-Llama3-V-2_5 model to be downloaded" + ", or the path to the huggingface checkpoint folder", + ) + parser.add_argument('--image-url-or-path', type=str, + default='http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg', + help='The URL or path to the image to infer') + parser.add_argument('--prompt', type=str, default="What is in the image?", + help='Prompt to infer') + parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") + parser.add_argument("--max-output-len", type=int, default=1024) + parser.add_argument("--max-prompt-len", type=int, default=512) + parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument("--intra-pp", type=int, default=2) + parser.add_argument("--inter-pp", type=int, default=2) + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + model = AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=torch.float32, + trust_remote_code=True, + attn_implementation="eager", + load_in_low_bit="sym_int4", + optimize_model=True, + max_output_len=args.max_output_len, + max_prompt_len=args.max_prompt_len, + intra_pp=args.intra_pp, + inter_pp=args.inter_pp, + transpose_value_cache=not args.disable_transpose_value_cache, + modules_to_not_convert=['vpm', 'resampler'] + ) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + print("-" * 80) + print("done") + + msgs = [{'role': 'user', 'content': args.prompt}] + image_path = args.image_url_or_path + if os.path.exists(image_path): + image = Image.open(image_path).convert('RGB') + else: + image = Image.open(requests.get(image_path, stream=True).raw).convert('RGB') + + st = time.time() + res = model.chat( 
+ image=image, + msgs=msgs, + tokenizer=tokenizer, + sampling=True, + temperature=0.7, + # system_prompt='' # pass system_prompt if needed + ) + end = time.time() + + print(f'Inference time: {end-st} s') + print('-'*20, 'Input', '-'*20) + print(image_path) + print('-'*20, 'Prompt', '-'*20) + print(args.prompt) + output_str = res + print('-'*20, 'Output', '-'*20) + print(output_str) + + print("done") + print("success shut down") diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py new file mode 100644 index 00000000000..259b8c122b1 --- /dev/null +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py @@ -0,0 +1,92 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +import torch +import os +import time +import argparse +import requests +from PIL import Image +from ipex_llm.transformers.npu_model import AutoModel +from transformers import AutoTokenizer + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Predict Tokens using `chat()` API for openbmb/MiniCPM-V-2_6 model') + parser.add_argument('--repo-id-or-model-path', type=str, default="openbmb/MiniCPM-V-2_6", + help='The huggingface repo id for the openbmb/MiniCPM-V-2_6 model to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--image-url-or-path', type=str, + default='http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg', + help='The URL or path to the image to infer') + parser.add_argument('--prompt', type=str, default="What is in this image?", + help='Prompt to infer') + parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") + parser.add_argument("--max-output-len", type=int, default=1024) + parser.add_argument("--max-prompt-len", type=int, default=512) + parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument("--intra-pp", type=int, default=None) + parser.add_argument("--inter-pp", type=int, default=None) + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + image_path = args.image_url_or_path + + model = AutoModel.from_pretrained(model_path, + torch_dtype=torch.float32, + trust_remote_code=True, + attn_implementation="eager", + load_in_low_bit="sym_int4", + optimize_model=True, + max_output_len=args.max_output_len, + max_prompt_len=args.max_prompt_len, + intra_pp=args.intra_pp, + inter_pp=args.inter_pp, + transpose_value_cache=not args.disable_transpose_value_cache, + modules_to_not_convert=['vpm', 'resampler'] + ) + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + model.eval() + + query = args.prompt + if os.path.exists(image_path): + image = Image.open(image_path).convert('RGB') + else: + image = Image.open(requests.get(image_path, stream=True).raw).convert('RGB') + + # Generate predicted tokens + # here the prompt tuning 
refers to https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/README.md + msg = [{'role': 'user', 'content': args.prompt}] + st = time.time() + with torch.inference_mode(): + res = model.chat( + image=image, + msgs=msg, + context=None, + tokenizer=tokenizer, + sampling=True, + ) + end = time.time() + print(f'Inference time: {end-st} s') + print('-'*20, 'Input', '-'*20) + print(image_path) + print('-'*20, 'Prompt', '-'*20) + print(args.prompt) + output_str = res + print('-'*20, 'Output', '-'*20) + print(output_str) diff --git a/python/llm/setup.py b/python/llm/setup.py index ecb7aea861b..1eca7b275e9 100644 --- a/python/llm/setup.py +++ b/python/llm/setup.py @@ -46,14 +46,14 @@ IPEX_LLM_PYTHON_HOME = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) VERSION = open(os.path.join(IPEX_LLM_PYTHON_HOME, './llm/version.txt'), 'r').read().strip() -CORE_XE_VERSION = VERSION.replace("2.1.0", "2.5.0") +CORE_XE_VERSION = VERSION.replace("2.2.0", "2.6.0") llm_home = os.path.join(os.path.dirname(os.path.abspath(__file__)), "src") github_artifact_dir = os.path.join(llm_home, '../llm-binary') libs_dir = os.path.join(llm_home, "ipex_llm", "libs") cpu_torch_version = ["torch==2.1.2+cpu;platform_system=='Linux'", "torch==2.1.2;platform_system=='Windows'"] CONVERT_DEP = ['numpy == 1.26.4', # lastet 2.0.0b1 will cause error - 'transformers == 4.36.2', 'sentencepiece', 'tokenizers == 0.15.2', + 'transformers == 4.37.0', 'sentencepiece', 'tokenizers == 0.15.2', 'accelerate == 0.23.0', 'tabulate'] + cpu_torch_version SERVING_DEP = ['fschat[model_worker, webui] == 0.2.36', 'protobuf'] @@ -300,6 +300,12 @@ def setup_package(): serving_requires = ['py-cpuinfo'] serving_requires += SERVING_DEP + npu_requires = copy.deepcopy(all_requires) + cpu_transformers_version = ['transformers == 4.37.0', 'tokenizers == 0.15.2'] + for exclude_require in cpu_transformers_version: + npu_requires.remove(exclude_require) + npu_requires += ["transformers==4.40.0", + "bigdl-core-npu==" + CORE_XE_VERSION + ";platform_system=='Windows'"] metadata = dict( name='ipex_llm', @@ -323,6 +329,7 @@ def setup_package(): }, extras_require={"all": all_requires, "xpu": xpu_requires, # default to ipex 2.1 for linux and windows + "npu": npu_requires, "xpu-2-1": xpu_21_requires, "serving": serving_requires, "cpp": cpp_requires, diff --git a/python/llm/src/ipex_llm/serving/fastapi/api_server.py b/python/llm/src/ipex_llm/serving/fastapi/api_server.py index 88c856180e5..86fc6bce4ce 100644 --- a/python/llm/src/ipex_llm/serving/fastapi/api_server.py +++ b/python/llm/src/ipex_llm/serving/fastapi/api_server.py @@ -27,6 +27,8 @@ from typing import List, Optional, Union, Dict from fastapi.middleware.cors import CORSMiddleware from .tgi_protocol import Parameters +from typing_extensions import Literal +from fastapi import File, UploadFile, Form from .openai_protocol import ( ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, @@ -38,6 +40,8 @@ CompletionResponse, CompletionResponseStreamChoice, CompletionStreamResponse, + TranscriptionRequest, + TranscriptionResponse, ) result_dict: Dict[str, str] = {} @@ -50,6 +54,7 @@ class InputsRequest(BaseModel): image_list: Optional[list] = None stream: Optional[bool] = False req_type: str = 'completion' + transcription_request: Optional[TranscriptionRequest] = None class ChatCompletionRequest(BaseModel): @@ -92,20 +97,27 @@ class CompletionRequest(BaseModel): global tokenizer global local_model +global processor class FastApp(): - def __init__(self, model, mytokenizer): + def __init__(self, 
model, mytokenizer, myprocessor=None): global tokenizer global local_model + global processor local_model = model tokenizer = mytokenizer + processor = myprocessor self.app = app def get_queue_next_token(delta_text_queue): timeout = int(os.getenv("IPEX_LLM_FASTAPI_TIMEOUT", 60)) delta_text = delta_text_queue.text_queue.get(timeout=timeout) + if "whisper" in local_model.model_name.lower(): + if delta_text is not None and "<|" in delta_text and "|>" in delta_text: + import re + delta_text = re.sub(r'<\|.*?\|>', '', delta_text) if delta_text is None: remain = 0 else: @@ -125,6 +137,7 @@ async def chat_stream_generator(local_model, delta_text_queue, request_id): model_name = local_model.model_name index = 0 while True: + await asyncio.sleep(0) if not hasattr(delta_text_queue, 'empty'): delta_text, remain = get_queue_next_token(delta_text_queue) else: @@ -168,6 +181,7 @@ async def completion_stream_generator(local_model, delta_text_queue, request_id) model_name = local_model.model_name index = 0 while True: + await asyncio.sleep(0) if not hasattr(delta_text_queue, 'empty'): delta_text, remain = get_queue_next_token(delta_text_queue) else: @@ -275,11 +289,11 @@ def get_prompt(messages) -> str: if len(messages) <= 1: history = [] else: - history = [msg.model_dump() for msg in messages[:-1]] + history = [msg for msg in messages[:-1]] history.append({"role": "user", "content": query}) inputs = tokenizer.apply_chat_template(history, add_generation_prompt=True, tokenize=False, return_tensors="pt", return_dict=False) - return inputs + return inputs, [] else: prompt = "" image_list = [] @@ -383,6 +397,32 @@ async def create_completion(request: CompletionRequest): return result +@app.post("/v1/audio/transcriptions") +async def transcriptions( + file: UploadFile=File(...), + model: Optional[str]=Form("default_model"), + language: Optional[str]=Form("zh"), + prompt: Optional[str]=Form(None), + response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]]=Form(None), + temperature: Optional[float]=Form(None), + timestamp_granularities: Optional[List[Literal["word", "segment"]]]=Form(None) +): + file_path = "./" + file.filename + if not os.path.exists(file_path): + with open(file_path, "wb") as f: + f.write(await file.read()) + inputs_request = InputsRequest( + inputs="transcriptions", + parameters=None, + stream=False, + req_type="completion", + transcription_request=TranscriptionRequest(file=file_path, model=model, language=language) + ) + request_id, result = await generate(inputs_request) + rsp = TranscriptionResponse(text=result) + return rsp + + @app.on_event("startup") async def startup_event(): asyncio.create_task(process_requests(local_model, result_dict)) @@ -391,4 +431,4 @@ async def startup_event(): async def process_requests(local_model, result_dict): while True: await asyncio.sleep(0) - await local_model.process_step(tokenizer, result_dict) + await local_model.process_step(tokenizer, result_dict, processor) diff --git a/python/llm/src/ipex_llm/serving/fastapi/model_worker.py b/python/llm/src/ipex_llm/serving/fastapi/model_worker.py index 3d8d75bfafa..9a7b2b0be11 100644 --- a/python/llm/src/ipex_llm/serving/fastapi/model_worker.py +++ b/python/llm/src/ipex_llm/serving/fastapi/model_worker.py @@ -23,37 +23,69 @@ class ModelWorker: - def __init__(self, checkpoint, low_bit, torch_dtype=torch.float16): + def __init__(self, checkpoint, low_bit, model_type="normal", torch_dtype=torch.float16): self.dtype = torch_dtype start = time.perf_counter() - model = 
self.load_model(checkpoint, low_bit) - from ipex_llm.utils import BenchmarkWrapper - self.model = BenchmarkWrapper(model, do_print=True) + if model_type == "audio": + self.model = self.load_model(checkpoint, low_bit, "audio") + else: + model = self.load_model(checkpoint, low_bit) + from ipex_llm.utils import BenchmarkWrapper + self.model = BenchmarkWrapper(model, do_print=True) end = time.perf_counter() logger.info(f"Time to load weights: {end - start:.2f}s") self.waiting_requests = asyncio.Queue() self.streamer = {} self.model_name = checkpoint - def load_model(self, model_path, low_bit='sym_int4'): - from ipex_llm.transformers import AutoModelForCausalLM, AutoModel - try: - model = AutoModelForCausalLM.from_pretrained(model_path, - load_in_low_bit=low_bit, - torch_dtype=self.dtype, - optimize_model=True, - trust_remote_code=True, - use_cache=True,) - except: - model = AutoModel.from_pretrained(model_path, - load_in_low_bit=low_bit, - torch_dtype=self.dtype, - optimize_model=True, - trust_remote_code=True, - use_cache=True,) + def load_model(self, model_path, low_bit='sym_int4', model_type="normal"): + if model_type == "audio": + from ipex_llm.transformers import AutoModelForSpeechSeq2Seq + model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path, + load_in_low_bit=low_bit, + torch_dtype=self.dtype, + optimize_model=True, + trust_remote_code=True, + use_cache=True) + else: + from ipex_llm.transformers import AutoModelForCausalLM, AutoModel + try: + model = AutoModelForCausalLM.from_pretrained(model_path, + load_in_low_bit=low_bit, + torch_dtype=self.dtype, + optimize_model=True, + trust_remote_code=True, + use_cache=True,) + except: + model = AutoModel.from_pretrained(model_path, + load_in_low_bit=low_bit, + torch_dtype=self.dtype, + optimize_model=True, + trust_remote_code=True, + use_cache=True,) model = model.eval().to("xpu") return model + async def add_asr_request(self, processor): + if self.waiting_requests.empty(): + return + tmp_result = await self.waiting_requests.get() + request_id, request = tmp_result + transcription_request = request.transcription_request + forced_decoder_ids = processor.get_decoder_prompt_ids( + language=transcription_request.language, task="transcribe") + audio_path = transcription_request.file + import librosa + raw_speech, sampling_rate = librosa.load(audio_path, + sr=processor.feature_extractor.sampling_rate) + input_features = processor( + raw_speech, + sampling_rate=sampling_rate, + return_tensors="pt", + return_attention_mask=True, + ).input_features.to('xpu') + return input_features, forced_decoder_ids, request_id + async def add_request(self, tokenizer): if self.waiting_requests.empty(): return @@ -91,33 +123,41 @@ async def add_request(self, tokenizer): return input_ids, parameters, request_id, inputs_embeds @torch.no_grad() - async def process_step(self, tokenizer, result_dict): + async def process_step(self, tokenizer, result_dict, processor=None): if not self.waiting_requests.empty(): - input_ids, parameters, request_id, inputs_embeds = await self.add_request(tokenizer) - self.streamer[request_id] = TextIteratorStreamer(tokenizer, skip_prompt=True) + if processor is not None and "whisper" in self.model_name.lower(): + input_features, decoder_ids, request_id = await self.add_asr_request(processor) + self.streamer[request_id] = TextIteratorStreamer(tokenizer, skip_prompt=True) - def model_generate(): - generate_kwargs = {k: v for k, v in parameters.dict().items() if v is not None} - if "codegeex" in self.model_name.lower(): - eos_token_id = 
[tokenizer.eos_token_id, - tokenizer.convert_tokens_to_ids("<|user|>"), - tokenizer.convert_tokens_to_ids("<|observation|>")] - generate_kwargs["eos_token_id"] = eos_token_id - elif "internlm-xcomposer2-vl-7b" in self.model_name.lower(): - eos_token_id = [ - tokenizer.eos_token_id, - tokenizer.convert_tokens_to_ids(['[UNUSED_TOKEN_145]'])[0] - ] - generate_kwargs["eos_token_id"] = eos_token_id - if input_ids is not None: - self.model.generate(input_ids, - streamer=self.streamer[request_id], **generate_kwargs) - elif inputs_embeds is not None: - self.model.generate(inputs_embeds=inputs_embeds, - streamer=self.streamer[request_id], **generate_kwargs) - torch.xpu.empty_cache() - torch.xpu.synchronize() + def model_generate(): + self.model.generate(input_features, + streamer=self.streamer[request_id], + forced_decoder_ids=decoder_ids) + else: + input_ids, parameters, request_id, inputs_embeds = await self.add_request(tokenizer) + self.streamer[request_id] = TextIteratorStreamer(tokenizer, skip_prompt=True) + def model_generate(): + generate_kwargs = {k: v for k, v in parameters.dict().items() if v is not None} + if "codegeex" in self.model_name.lower(): + eos_token_id = [tokenizer.eos_token_id, + tokenizer.convert_tokens_to_ids("<|user|>"), + tokenizer.convert_tokens_to_ids("<|observation|>")] + generate_kwargs["eos_token_id"] = eos_token_id + elif "internlm-xcomposer2-vl-7b" in self.model_name.lower(): + eos_token_id = [ + tokenizer.eos_token_id, + tokenizer.convert_tokens_to_ids(['[UNUSED_TOKEN_145]'])[0] + ] + generate_kwargs["eos_token_id"] = eos_token_id + if input_ids is not None: + self.model.generate(input_ids, + streamer=self.streamer[request_id], **generate_kwargs) + elif inputs_embeds is not None: + self.model.generate(inputs_embeds=inputs_embeds, + streamer=self.streamer[request_id], **generate_kwargs) + torch.xpu.empty_cache() + torch.xpu.synchronize() from threading import Thread t1 = Thread(target=model_generate) t1.start() diff --git a/python/llm/src/ipex_llm/serving/fastapi/openai_protocol.py b/python/llm/src/ipex_llm/serving/fastapi/openai_protocol.py index 1bc8f1e3a69..ca5963af1dd 100644 --- a/python/llm/src/ipex_llm/serving/fastapi/openai_protocol.py +++ b/python/llm/src/ipex_llm/serving/fastapi/openai_protocol.py @@ -24,6 +24,7 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator from typing_extensions import Annotated from ipex_llm.utils.common import invalidInputError +from typing_extensions import Literal # from vllm.sampling_params import SamplingParams @@ -31,6 +32,20 @@ def random_uuid() -> str: return str(uuid.uuid4().hex) +class TranscriptionRequest(BaseModel): + file: str = None + model: Optional[str] = "default_model" + language: Optional[str] = "zh" + prompt: Optional[str] = None + response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]] = None + temperature: Optional[float] = None + timestamp_granularities: Optional[List[Literal["word", "segment"]]] = None + + +class TranscriptionResponse(BaseModel): + text: str + + class OpenAIBaseModel(BaseModel): # OpenAI API does not allow extra fields model_config = ConfigDict(extra="forbid") diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py index 33c6b83da69..8996f8a3307 100644 --- a/python/llm/src/ipex_llm/transformers/convert.py +++ b/python/llm/src/ipex_llm/transformers/convert.py @@ -55,6 +55,7 @@ _IS_VLLM_AVAILABLE = None _USE_VLLM = False +_USE_VLLM_AWQ = False _VLLM_VERSION = None @@ -143,7 +144,7 @@ def 
is_linear_module(module): is_awq = is_auto_awq_available() and isinstance(module, WQLinear_GEMM) if is_vllm_available(): # Only convert vllm modules - global _VLLM_VERSION + global _VLLM_VERSION, _USE_VLLM_AWQ if _VLLM_VERSION is None: _VLLM_VERSION = get_package_version('vllm') from vllm.model_executor.layers.linear import ( @@ -180,6 +181,13 @@ def is_linear_module(module): out_features = module.output_size result = True mp_group = None + # Check for attribute qweight + if (not _USE_VLLM_AWQ + and hasattr(module.linear_method, "quant_config") + and module.linear_method.quant_config.get_name() == "awq"): + _USE_VLLM_AWQ = True + invalidInputError(module.skip_bias_add is not True, "Currently, ipex-vllm does not" + " support linear layers with skip_bias_add argument") if isinstance(module, RowParallelLinear) and tp_size >= 2: mp_group = get_tensor_model_parallel_group() in_features = module.input_size_per_partition @@ -218,6 +226,131 @@ def is_linear_module(module): return result, (in_features, out_features, mp_group) +def convert_vllm(module, qtype, in_features, out_features, mp_group, cur_qtype, + enable_xetla, optimize_lm_head, enable_scale_search): + from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead + from ipex_llm.transformers.low_bit_linear import LowBitLinear, \ + FP16Linear, BF16Linear, vLLMLowBitLinear, vLLMFP16Linear, vLLMBF16Linear + # Currently, vLLM does not support optimize_lm_head = True + optimize_lm_head = False + if isinstance(module, ParallelLMHead): + if qtype == ggml_tensor_qtype["fp16"]: + new_linear = FP16Linear( + in_features, + out_features, + module.bias is not None, + mp_group=mp_group, + optimize_lm_head=optimize_lm_head + ) + elif qtype == ggml_tensor_qtype["bf16"]: + new_linear = BF16Linear( + in_features, + out_features, + module.bias is not None, + mp_group=mp_group, + optimize_lm_head=optimize_lm_head + ) + else: + new_linear = LowBitLinear( + in_features, + out_features, + cur_qtype, + module.bias is not None, + mp_group=mp_group, + enable_xetla=enable_xetla, + optimize_lm_head=optimize_lm_head, + enable_scale_search=enable_scale_search, + ) + else: + if qtype == ggml_tensor_qtype["fp16"]: + new_linear = vLLMFP16Linear( + in_features, + out_features, + module.bias is not None, + mp_group=mp_group, + optimize_lm_head=optimize_lm_head + ) + elif qtype == ggml_tensor_qtype["bf16"]: + new_linear = vLLMBF16Linear( + in_features, + out_features, + module.bias is not None, + mp_group=mp_group, + optimize_lm_head=optimize_lm_head + ) + else: + new_linear = vLLMLowBitLinear( + in_features, + out_features, + cur_qtype, + module.bias is not None, + mp_group=mp_group, + enable_xetla=enable_xetla, + optimize_lm_head=optimize_lm_head, + enable_scale_search=enable_scale_search, + ) + return new_linear + + +def convert_vllm_awq(module): + from ipex_llm.transformers.low_bit_linear import get_block_size + Q4_1 = get_block_size("asym_int4") + + scales = module.scales + wf = (torch.tensor([0, 4, 1, 5, 2, 6, 3, 7], + dtype=torch.int32) * 4).unsqueeze(0) + # vLLM only supports load 4-bits model, so this has been checked + bits = 4 + group_size = module.linear_method.quant_config.group_size + + zeros = torch.bitwise_right_shift( + torch.unsqueeze(module.qzeros, 2).expand(-1, -1, 32 // bits), + wf.unsqueeze(0)).to(torch.int16 if bits == 8 else torch.int8) + zeros = torch.bitwise_and(zeros, (2 ** bits) - 1) + + g_id_map = None + + zeros = zeros.reshape(scales.shape) + + weight = torch.bitwise_right_shift( + torch.unsqueeze(module.qweight, 2).expand(-1, 
-1, 32 // bits), + wf.unsqueeze(0)).to(torch.int16 if bits == 8 else torch.int8) + weight = torch.bitwise_and(weight, (2 ** bits) - 1) + weight = weight.reshape(weight.shape[0], weight.shape[1] * weight.shape[2]) + + # convert weight to ggml format + weight = weight.reshape(weight.shape[0]//group_size, group_size, weight.shape[1]) + weight = weight.permute(2, 0, 1).reshape(weight.shape[2], -1, 2, Q4_1//2) + weight = weight.transpose(2, 3) + weight = torch.bitwise_left_shift(weight, + torch.tensor([0, 4], dtype=torch.int8).reshape(1, 1, 1, 2)) + weight = torch.bitwise_or(weight[:, :, :, 0], weight[:, :, :, 1]).contiguous() + + # convert zeros to ggml format + zeros = zeros.reshape(-1, 1, zeros.shape[1]).permute(2, 0, 1)\ + .unsqueeze(2)\ + .expand(-1, -1, group_size//Q4_1, -1)\ + .reshape(zeros.shape[1], -1, 1)\ + .contiguous().to(torch.float16) + + # convert scales to ggml format + scales = scales.reshape(-1, 1, scales.shape[1]).permute(2, 0, 1)\ + .unsqueeze(2)\ + .expand(-1, -1, group_size//Q4_1, -1)\ + .reshape(scales.shape[-1], -1, 1)\ + .contiguous().to(torch.float16) + + m = -(zeros * scales) + d = scales + + ggml_weight = torch.cat([d.view(torch.uint8), + m.view(torch.uint8), + weight.view(torch.uint8)], dim=-1) + ggml_weight = ggml_weight.reshape([-1]) + + return ggml_weight, g_id_map + + def convert_gptq(module, awq=False, llm_awq=False, act_order=False): from ipex_llm.transformers.low_bit_linear import get_block_size Q4_1 = get_block_size("asym_int4") @@ -323,6 +456,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, FP16Linear, BF16Linear from ipex_llm.transformers.embedding import CPUEmbedding, DiskEmbedding, LowBitEmbedding has_been_replaced = False + global _USE_VLLM_AWQ for name, module in model.named_children(): is_linear, linear_args = is_linear_module(module) @@ -334,14 +468,16 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, if is_linear and not isinstance(module, LowBitLinear): in_features, out_features, mp_group = linear_args - optimize_lm_head = False - if is_lm_head(name, model_config, out_features): - model_type = getattr(model_config, "model_type", None) - if model_type in ["gptj", "llama", "qwen2"]: - if os.environ.get("IPEX_LLM_LAST_LM_HEAD", None) is not None: - optimize_lm_head = os.environ.get("IPEX_LLM_LAST_LM_HEAD", None) == "1" - elif os.environ.get("IPEX_LLM_LOW_MEM", None) is not None: - optimize_lm_head = os.environ.get("IPEX_LLM_LOW_MEM", None) == "1" + optimize_lm_head = ( + is_lm_head(name, model_config, out_features) + and ( + not os.environ.get("IPEX_LLM_LAST_LM_HEAD", None) == "0" + ) + and ( + not (getattr(model_config, "model_type", "") == "baichuan" and + model.config.hidden_size == 5120) # except baichuan2-13B + ) + ) with init_empty_weights(): new_linear = None is_gptq = is_gptq_linear(module) @@ -382,6 +518,70 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, if has_bias: new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\ .to(device) + elif _USE_VLLM_AWQ: + # User load an AWQ quantized model from vLLM + from ipex_llm.transformers.low_bit_linear import vLLMLowBitLinear + from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead + has_bias = module.bias is not None and module.bias.abs().sum() != 0 + if isinstance(module, ParallelLMHead): + new_linear = LowBitLinear( + in_features, + out_features, + qtype=qtype, + bias=has_bias, + mp_group=mp_group, + enable_xetla=enable_xetla, + optimize_lm_head=False, + 
act_order=act_order, + enable_scale_search=enable_scale_search, + ) + device = module.weight.data.device + cur_qtype, cur_imatrix = get_cur_qtype_and_imatrix(qtype, + full_module_name, + imatrix_data, + model_config) + # Copy the weights + paramsLowBit = FP4Params(data=module.weight.data, + requires_grad=False, + quantized=False, + _shape=None, + convert_shape_only=convert_shape_only, + qtype=cur_qtype, + imatrix=cur_imatrix, + in_features=in_features, + enable_xetla=enable_xetla, + enable_scale_search=enable_scale_search).to(device) + else: + new_linear = vLLMLowBitLinear( + in_features, + out_features, + qtype=qtype, + bias=has_bias, + mp_group=mp_group, + enable_xetla=enable_xetla, + optimize_lm_head=False, + act_order=act_order, + enable_scale_search=enable_scale_search, + ) + device = module.qweight.data.device + invalidInputError(device.type != "meta", + "converting from meta device is not supported") + weight, g_idx_map = convert_vllm_awq(module) + if act_order: + new_linear.g_idx_map = g_idx_map + # Copy the weights + paramsLowBit = FP4Params(data=weight, + requires_grad=False, + quantized=True, + _shape=(out_features, in_features), + convert_shape_only=convert_shape_only, + qtype=qtype, + enable_xetla=enable_xetla, + enable_scale_search=enable_scale_search).to(device) + new_linear._parameters['weight'] = paramsLowBit + if has_bias: + new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\ + .to(device) elif qtype not in [ggml_tensor_qtype["fp16"], ggml_tensor_qtype["bf16"]]: if in_features % 64 != 0: # now our kernel requires in_features is a multiple of 64 @@ -399,16 +599,27 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, # check hidden size whether is a multiple of 256 cur_qtype = check_hidden_size(cur_qtype, in_features) - new_linear = LowBitLinear( - in_features, - out_features, - cur_qtype, - module.bias is not None, - mp_group=mp_group, - enable_xetla=enable_xetla, - optimize_lm_head=optimize_lm_head, - enable_scale_search=enable_scale_search, - ) + if _USE_VLLM: + new_linear = convert_vllm(module, + qtype, + in_features, + out_features, + mp_group, + cur_qtype, + enable_xetla, + optimize_lm_head, + enable_scale_search) + else: + new_linear = LowBitLinear( + in_features, + out_features, + cur_qtype, + module.bias is not None, + mp_group=mp_group, + enable_xetla=enable_xetla, + optimize_lm_head=optimize_lm_head, + enable_scale_search=enable_scale_search, + ) device = module.weight.data.device # Copy the weights paramsLowBit = FP4Params(data=module.weight.data, @@ -427,13 +638,26 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, .to(device) elif qtype == ggml_tensor_qtype["fp16"]: module.to(torch.float16) - new_linear = FP16Linear( - in_features, - out_features, - module.bias is not None, - mp_group=mp_group, - optimize_lm_head=optimize_lm_head - ) + if _USE_VLLM: + new_linear = convert_vllm( + module, + qtype, + in_features, + out_features, + mp_group, + None, + None, + optimize_lm_head, + None + ) + else: + new_linear = FP16Linear( + in_features, + out_features, + module.bias is not None, + mp_group=mp_group, + optimize_lm_head=optimize_lm_head + ) device = module.weight.data.device from ipex_llm.transformers.utils import get_ipex_version if get_ipex_version() < "2.1.10+xpu": @@ -449,13 +673,26 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, .to(device) elif qtype == ggml_tensor_qtype["bf16"]: module.to(torch.bfloat16) - new_linear = BF16Linear( - in_features, - out_features, 
- module.bias is not None, - mp_group=mp_group, - optimize_lm_head=optimize_lm_head - ) + if _USE_VLLM: + new_linear = convert_vllm( + module, + qtype, + in_features, + out_features, + mp_group, + None, + None, + optimize_lm_head, + None + ) + else: + new_linear = BF16Linear( + in_features, + out_features, + module.bias is not None, + mp_group=mp_group, + optimize_lm_head=optimize_lm_head + ) device = module.weight.data.device # convert here new_linear._parameters['weight'] = nn.Parameter(module.weight) @@ -727,6 +964,9 @@ def _optimize_pre(model, qtype=None): if model.config.model_type == "qwen2_moe": from ipex_llm.transformers.models.qwen2_moe import merge_qkv model.apply(merge_qkv) + if model.config.model_type == "qwen2_audio": + from ipex_llm.transformers.models.qwen2 import merge_qkv + model.language_model.apply(merge_qkv) if model.config.model_type == "stablelm": # For stablelm-zephyr-3b and stablelm-2-zephyr-1_6b from ipex_llm.transformers.models.stablelm import merge_qkv @@ -747,11 +987,20 @@ def _optimize_pre(model, qtype=None): if model.config.model_type == "llama": from ipex_llm.transformers.models.llama import merge_qkv model.apply(merge_qkv) + if model.config.model_type == "minicpm": + from ipex_llm.transformers.models.minicpm import merge_qkv + model.apply(merge_qkv) if model.config.model_type == "minicpmv": - if model.config.hidden_size == 3584 and model.config.vocab_size == 151666: + from ipex_llm.transformers.models.minicpmv import merge_qkv + model.vpm.apply(merge_qkv) + if model.config.hidden_size == 2304 and model.config.vocab_size == 122753: + model.llm.config.model_type = "minicpm" + elif model.config.hidden_size == 3584 and model.config.vocab_size == 151666: model.llm.config.model_type = "qwen2" - _optimize_pre(model.llm, qtype=qtype) - model.llm.config.model_type = "minicpmv" + elif model.config.hidden_size == 4096 and model.config.vocab_size == 128256: + model.llm.config.model_type = "llama" + _optimize_pre(model.llm, qtype=qtype) + model.llm.config.model_type = "minicpmv" return model @@ -931,16 +1180,6 @@ def _optimize_ipex(model, qtype=ggml_tensor_qtype["bf16"]): def _optimize_post(model, lightweight_bmm=False): - from packaging import version - from ipex_llm.transformers.models.llama import llama_attention_forward_4_31 - from ipex_llm.transformers.models.llama import llama_attention_selective_batching_forward_4_31 - from ipex_llm.transformers.models.llama import llama_model_selective_batching_forward_4_31 - from ipex_llm.transformers.models.llama import llama_rms_norm_forward - from ipex_llm.transformers.models.llama import llama_mlp_forward - from ipex_llm.transformers.models.llama import llama_decoder_forward - from ipex_llm.transformers.models.llama import llama_model_forward - from transformers.modeling_utils import PreTrainedModel - try: from sentence_transformers.SentenceTransformer import SentenceTransformer if isinstance(model, SentenceTransformer): @@ -959,110 +1198,80 @@ def _optimize_post(model, lightweight_bmm=False): except ModuleNotFoundError: pass + from transformers.modeling_utils import PreTrainedModel # All huggingface format models are inherited from `PreTrainedModel` if not isinstance(model, PreTrainedModel): logger.info("Only HuggingFace Transformers models are currently " "supported for further optimizations") return model - vllm_selective_batching = os.getenv("VLLM_ENABLE_SELECTIVE_BATCHING") - enable_vllm_se_batching = vllm_selective_batching is not None - enable_vllm_se_batching = enable_vllm_se_batching and 
vllm_selective_batching.lower() == "true" - + from packaging import version trans_version = transformers.__version__ - if version.parse(trans_version) >= version.parse("4.31.0"): - convert_forward( - model, - transformers.models.llama.modeling_llama.LlamaRMSNorm, - llama_rms_norm_forward,) - convert_forward(model, - transformers.models.llama.modeling_llama.LlamaMLP, - llama_mlp_forward) - convert_forward(model, - transformers.models.llama.modeling_llama.LlamaDecoderLayer, - llama_decoder_forward) + # convert all nn.LayerNorm + from ipex_llm.transformers.models.bloom import bloom_layer_norm_forward + convert_forward(model, + nn.LayerNorm, + bloom_layer_norm_forward) + from ipex_llm.transformers.models.llama import llama_rms_norm_forward + from ipex_llm.transformers.models.llama import llama_mlp_forward + + if model.config.model_type == "llama": + from transformers.models.llama.modeling_llama import LlamaRMSNorm + from transformers.models.llama.modeling_llama import LlamaMLP + from transformers.models.llama.modeling_llama import LlamaAttention + from transformers.models.llama.modeling_llama import LlamaDecoderLayer + from transformers.models.llama.modeling_llama import LlamaModel if version.parse(trans_version) >= version.parse("4.36.0"): - # transformers version >= 4.36.0 + from transformers.models.llama.modeling_llama import LlamaSdpaAttention + + from ipex_llm.transformers.models.llama import llama_rms_norm_forward + from ipex_llm.transformers.models.llama import llama_mlp_forward + from ipex_llm.transformers.models.llama import llama_decoder_forward + + convert_forward(model, LlamaRMSNorm, llama_rms_norm_forward) + convert_forward(model, LlamaMLP, llama_mlp_forward) + convert_forward(model, LlamaDecoderLayer, llama_decoder_forward) + + if version.parse(trans_version) >= version.parse("4.41.0"): + from ipex_llm.transformers.models.llama import llama_model_forward_4_41 + from ipex_llm.transformers.models.llama import llama_attention_forward_4_41 + convert_forward(model, LlamaModel, llama_model_forward_4_41) + convert_forward(model, LlamaAttention, llama_attention_forward_4_41) + convert_forward(model, LlamaSdpaAttention, llama_attention_forward_4_41) + elif version.parse(trans_version) >= version.parse("4.38.0"): + from ipex_llm.transformers.models.llama import llama_model_forward_4_38 from ipex_llm.transformers.models.llama import llama_attention_forward_4_38 - if version.parse(trans_version) >= version.parse("4.38.0"): - if version.parse(trans_version) >= version.parse("4.41.0"): - from ipex_llm.transformers.models.llama import llama_model_forward_4_41 - from ipex_llm.transformers.models.llama import llama_attention_forward_4_41 - convert_forward( - model, - transformers.models.llama.modeling_llama.LlamaModel, - llama_model_forward_4_41) - convert_forward( - model, - transformers.models.llama.modeling_llama.LlamaAttention, - llama_attention_forward_4_41) - convert_forward( - model, - transformers.models.llama.modeling_llama.LlamaSdpaAttention, - llama_attention_forward_4_41) - else: - from ipex_llm.transformers.models.llama import llama_model_forward_4_38 - convert_forward( - model, - transformers.models.llama.modeling_llama.LlamaModel, - llama_model_forward_4_38) - convert_forward( - model, - transformers.models.llama.modeling_llama.LlamaAttention, - llama_attention_forward_4_38) - convert_forward( - model, - transformers.models.llama.modeling_llama.LlamaSdpaAttention, - llama_attention_forward_4_38) - else: - from ipex_llm.transformers.models.llama import llama_model_forward_4_36 - 
convert_forward( - model, - transformers.models.llama.modeling_llama.LlamaModel, - llama_model_forward_4_36) - convert_forward( - model, - transformers.models.llama.modeling_llama.LlamaAttention, - llama_attention_forward_4_38) - convert_forward( - model, - transformers.models.llama.modeling_llama.LlamaSdpaAttention, - llama_attention_forward_4_38) + convert_forward(model, LlamaModel, llama_model_forward_4_38) + convert_forward(model, LlamaAttention, llama_attention_forward_4_38) + convert_forward(model, LlamaSdpaAttention, llama_attention_forward_4_38) + elif version.parse(trans_version) >= version.parse("4.36.0"): + from ipex_llm.transformers.models.llama import llama_model_forward_4_36 + from ipex_llm.transformers.models.llama import llama_attention_forward_4_38 + convert_forward(model, LlamaModel, llama_model_forward_4_36) + convert_forward(model, LlamaAttention, llama_attention_forward_4_38) + convert_forward(model, LlamaSdpaAttention, llama_attention_forward_4_38) else: - # transformers version between 4.31.0 - 4.35.2 - convert_forward( - model, - transformers.models.llama.modeling_llama.LlamaAttention, - llama_attention_forward_4_31, ) - if enable_vllm_se_batching: - convert_forward( - model, - transformers.models.llama.modeling_llama.LlamaModel, + vllm_se_batching = os.getenv("VLLM_ENABLE_SELECTIVE_BATCHING", "").lower() == "true" + if vllm_se_batching: + from ipex_llm.transformers.models.llama import ( llama_model_selective_batching_forward_4_31, - ) - convert_forward( - model, - transformers.models.llama.modeling_llama.LlamaAttention, llama_attention_selective_batching_forward_4_31, ) + convert_forward(model, LlamaModel, + llama_model_selective_batching_forward_4_31) + convert_forward(model, LlamaAttention, + llama_attention_selective_batching_forward_4_31) else: - convert_forward( - model, - transformers.models.llama.modeling_llama.LlamaModel, - llama_model_forward) - else: - # todo implement 4.28.0 ~ 4.30.2 - pass - - # convert all nn.LayerNorm - from ipex_llm.transformers.models.bloom import bloom_layer_norm_forward - convert_forward(model, - nn.LayerNorm, - bloom_layer_norm_forward) - - if model.config.architectures is not None \ - and model.config.architectures[0] in ["ChatGLMModel", "ChatGLMForConditionalGeneration"]: + from ipex_llm.transformers.models.llama import llama_model_forward + from ipex_llm.transformers.models.llama import llama_attention_forward_4_31 + convert_forward(model, LlamaModel, llama_model_forward) + convert_forward(model, LlamaAttention, llama_attention_forward_4_31) + + elif ( + model.config.architectures is not None + and model.config.architectures[0] in ["ChatGLMModel", "ChatGLMForConditionalGeneration"] + ): if hasattr(model.config, 'padded_vocab_size') and \ model.config.padded_vocab_size in [65024, 64896]: # chatglm2-6b, chatglm2-6b-32k, chatglm3-6b, chatglm3-6b-32k, chatglm3-6b-128k @@ -1219,8 +1428,17 @@ def _optimize_post(model, lightweight_bmm=False): if model.config.hidden_size in [4096, 2048]: # baichuan-7B and baichuan2-7B from ipex_llm.transformers.models.baichuan import baichuan_attention_forward_7b + from ipex_llm.transformers.models.baichuan import baichuan_model_7b_forward + for i in range(len(model.model.layers)): + setattr(model.model.layers[i].self_attn, "layer_idx", i) convert_forward(model, module.Attention, baichuan_attention_forward_7b) convert_forward(model, module.RMSNorm, llama_rms_norm_forward) + if model.config.vocab_size == 125696: + # baichuan2-7B + convert_forward(model, module.BaichuanModel, baichuan_model_7b_forward) 
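# --- Illustrative aside (not part of the patch) ---------------------------------
# Every convert_forward(...) call in this hunk relies on the same monkey-patching
# pattern: walk the model tree and rebind `forward` on each submodule of the target
# class. The helper below is a simplified sketch of that pattern, not ipex-llm's
# exact implementation.
from types import MethodType
import torch.nn as nn

def convert_forward_sketch(model: nn.Module, target_cls: type, new_forward) -> None:
    for module in model.modules():
        if isinstance(module, target_cls):
            # bind the optimized forward to this instance; parameters stay untouched
            module.forward = MethodType(new_forward, module)
# ---------------------------------------------------------------------------------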
+ elif model.config.vocab_size == 64000: + # baichuan-7B + convert_forward(model, module.Model, baichuan_model_7b_forward) elif model.config.hidden_size == 5120: # baichuan-13B and baichuan2-13B from ipex_llm.transformers.models.baichuan import baichuan_attention_forward_13b @@ -1342,9 +1560,6 @@ def _optimize_post(model, lightweight_bmm=False): from ipex_llm.transformers.models.qwen2 import qwen2_attention_forward from ipex_llm.transformers.models.qwen2 import qwen2_causal_lm_forward from ipex_llm.transformers.models.qwen2 import qwen2_mlp_forward - convert_forward(model, - module.Qwen2Model, - qwen2_model_forward) convert_forward(model, module.Qwen2ForCausalLM, qwen2_causal_lm_forward) @@ -1360,6 +1575,12 @@ def _optimize_post(model, lightweight_bmm=False): convert_forward(model, module.Qwen2SdpaAttention, qwen2_attention_forward) + if version.parse(trans_version) >= version.parse("4.42"): + from ipex_llm.transformers.models.qwen2 import qwen2_model_forward_4_42 + convert_forward(model, module.Qwen2Model, qwen2_model_forward_4_42) + else: + from ipex_llm.transformers.models.qwen2 import qwen2_model_forward + convert_forward(model, module.Qwen2Model, qwen2_model_forward) elif model.config.model_type == "qwen2_moe": # for Qwen1.5-MOE-A2.7B modeling_module_name = model.__class__.__module__ @@ -1368,6 +1589,7 @@ def _optimize_post(model, lightweight_bmm=False): from ipex_llm.transformers.models.qwen2_moe import qwen2moe_model_forward from ipex_llm.transformers.models.qwen2_moe import qwen2_moe_causal_lm_forward from ipex_llm.transformers.models.qwen2 import qwen2_attention_forward + from ipex_llm.transformers.models.qwen2 import qwen2_mlp_forward convert_forward(model, module.Qwen2MoeModel, qwen2moe_model_forward) @@ -1382,13 +1604,15 @@ def _optimize_post(model, lightweight_bmm=False): qwen2moe_moeblock_forward) convert_forward(model, module.Qwen2MoeMLP, - llama_mlp_forward) + qwen2_mlp_forward) convert_forward(model, module.Qwen2MoeAttention, qwen2_attention_forward) convert_forward(model, module.Qwen2MoeSdpaAttention, qwen2_attention_forward) + elif model.config.model_type == "qwen2_audio": + _optimize_post(model.language_model, lightweight_bmm=lightweight_bmm) elif model.config.model_type == "cohere": # for CohereForAI/c4ai-command-r-v01 invalidInputError(version.parse(trans_version) >= version.parse("4.40.0"), @@ -1727,40 +1951,63 @@ def safe_bmm_fwd(*args, **kwargs): module.StableLmModel, stablelm_model_forward ) - elif model.config.model_type == 'minicpm': + elif model.config.model_type == "minicpm": modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) - if version.parse(trans_version) >= version.parse("4.39.0"): - from ipex_llm.transformers.models.minicpm import minicpm_attention_forward_4_39 - convert_forward(model, - module.MiniCPMAttention, - minicpm_attention_forward_4_39) - else: - from ipex_llm.transformers.models.minicpm import minicpm_attention_forward - convert_forward(model, - module.MiniCPMAttention, - minicpm_attention_forward) - from ipex_llm.transformers.models.minicpm import minicpm_model_forward - - convert_forward(model, - module.MiniCPMMLP, - llama_mlp_forward) - convert_forward(model, - module.MiniCPMRMSNorm, - llama_rms_norm_forward) - - convert_forward(model, - module.MiniCPMModel, - minicpm_model_forward) + from ipex_llm.transformers.models.minicpm import minicpm_attention_forward + from ipex_llm.transformers.models.minicpm import minicpm_model_forward_wrapper + convert_forward(model, module.MiniCPMAttention, 
minicpm_attention_forward) + convert_forward(model, module.MiniCPMMLP, llama_mlp_forward) + convert_forward(model, module.MiniCPMRMSNorm, llama_rms_norm_forward) + minicpm_model_forward = minicpm_model_forward_wrapper(module.MiniCPMModel.forward) + convert_forward(model, module.MiniCPMModel, minicpm_model_forward) elif model.config.model_type == "minicpmv": - if model.config.hidden_size == 3584 and model.config.vocab_size == 151666: - model.llm.config.model_type = "qwen2" - _optimize_post(model.llm, lightweight_bmm=lightweight_bmm) - model.llm.config.model_type = "minicpmv" modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) from ipex_llm.transformers.models.minicpmv import minicpmv_generate_wrapper minicpmv_generate = minicpmv_generate_wrapper(module.MiniCPMV.generate) model.generate = MethodType(minicpmv_generate, model) + if model.config.hidden_size == 2304 and model.config.vocab_size == 122753: + # MiniCPM-V 2 + model.llm.config.model_type = "minicpm" + elif model.config.hidden_size == 3584 and model.config.vocab_size == 151666: + # MiniCPM-V 2.6 + model.llm.config.model_type = "qwen2" + elif model.config.hidden_size == 4096 and model.config.vocab_size == 128256: + # MiniCPM-V 2.5 + model.llm.config.model_type = "llama" + _optimize_post(model.llm, lightweight_bmm=lightweight_bmm) + model.llm.config.model_type = "minicpmv" + + vpm_modeling_module_name = model.vpm.__class__.__module__ + vpm_module = importlib.import_module(vpm_modeling_module_name) + if not hasattr(model.vpm, "config"): + # MiniCPM-V 2 + from ipex_llm.transformers.models.minicpmv import vision_transformer_attention_forward + from ipex_llm.transformers.models.minicpmv import minicpmv_get_vision_embedding + convert_forward(model.vpm, vpm_module.Attention, vision_transformer_attention_forward) + model.get_vision_embedding = MethodType(minicpmv_get_vision_embedding, model) + elif "siglip" in model.vpm.config.model_type: + # MiniCPM-V 2.6 + from ipex_llm.transformers.models.minicpmv import siglip_attention_forward + convert_forward(model.vpm, vpm_module.SiglipAttention, siglip_attention_forward) + + from ipex_llm.transformers.models.minicpmv import _in_projection_packed + resampler_module_name = model.resampler.__class__.__module__ + resampler_module = importlib.import_module(resampler_module_name) + resampler_module._in_projection_packed = _in_projection_packed + + # for minicpm-v-2_6 benchmarking purposes + from ipex_llm.transformers.models.minicpmv import minicpmv_decode_stream_wrapper + minicpmv_decode_stream = minicpmv_decode_stream_wrapper(module.MiniCPMV._decode_stream) + model._decode_stream = MethodType(minicpmv_decode_stream, model) + elif model.vpm.config.model_type == "idefics2": + # MiniCPM-V 2.5 + from ipex_llm.transformers.models.minicpmv import siglip_attention_forward + from ipex_llm.transformers.models.minicpmv import minicpmv_chat_wrapper + convert_forward(model.vpm, vpm_module.Idefics2VisionAttention, siglip_attention_forward) + minicpmv_chat = minicpmv_chat_wrapper(module.MiniCPMV.chat) + model.chat = MethodType(minicpmv_chat, model) + return model diff --git a/python/llm/src/ipex_llm/transformers/kv.py b/python/llm/src/ipex_llm/transformers/kv.py index 1543ab34d5e..8b20f546893 100644 --- a/python/llm/src/ipex_llm/transformers/kv.py +++ b/python/llm/src/ipex_llm/transformers/kv.py @@ -121,6 +121,21 @@ def update( return self.key_cache[layer_idx], self.value_cache[layer_idx] + @classmethod + def from_reserved(cls, layers: int, + bsz: int, n_head: int, 
length: int, head_dim: int, + dtype: torch.dtype, device: torch.device): + past_key_values = cls() + for _i in range(layers): + k_cache, v_cache = init_kv_cache( + bsz, n_head, head_dim, + 0, length + cls.KV_ALLOC_BLOCK_LENGTH, + dtype, device + ) + past_key_values.key_cache.append(k_cache) + past_key_values.value_cache.append(v_cache) + return past_key_values + # Copied from transformers.models.llama.modeling_llama.repeat_kv def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: @@ -155,60 +170,67 @@ def compress_kv(attn_config, key_states, query_states, value_states, attention_m if q_len <= attn_config.max_capacity_prompt: return key_states, value_states else: - key_states_expand = repeat_kv(key_states, num_key_value_groups).to(key_states.device) - attn_weights = torch.matmul(query_states[..., -attn_config.window_size:, :], - key_states_expand.transpose(2, 3)) / math.sqrt(head_dim) - mask = torch.full((attn_config.window_size, attn_config.window_size), - torch.finfo(attn_weights.dtype).min, - device=attn_weights.device) - mask_cond = torch.arange(mask.size(-1), device=attn_weights.device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(attn_weights.device) - attention_mask = mask[None, None, :, :] - - attn_weights[:, :, -attn_config.window_size:, -attn_config.window_size:] += attention_mask - - attn_weights = nn.functional.softmax(attn_weights, dim=-1, - dtype=torch.float32).to(query_states.dtype) - attn_weights_sum = attn_weights[:, :, -attn_config.window_size:, - :-attn_config.window_size].sum(dim=-2) - if attn_config.pooling == 'avgpool': - if num_key_value_groups > 1: - attn_cache = F.avg_pool2d(attn_weights_sum, kernel_size=(num_key_value_groups, - attn_config.kernel_size), - padding=(0, attn_config.kernel_size//2), - stride=(num_key_value_groups, 1)) - else: - attn_cache = F.avg_pool1d(attn_weights_sum, kernel_size=attn_config.kernel_size, - padding=attn_config.kernel_size//2, stride=1) - elif attn_config.pooling == 'maxpool': - if num_key_value_groups > 1: - attn_cache = F.max_pool2d(attn_weights_sum, - kernel_size=(num_key_value_groups, - attn_config.kernel_size), - padding=(0, attn_config.kernel_size//2), - stride=(num_key_value_groups, 1)) - else: - attn_cache = F.max_pool1d(attn_weights_sum, kernel_size=attn_config.kernel_size, - padding=attn_config.kernel_size//2, stride=1) + sliding_window_size = getattr(attn_config, "sliding_window", None) + if sliding_window_size is not None and sliding_window_size <= 2500: + return key_states[:, :, -sliding_window_size:, :], \ + value_states[:, :, -sliding_window_size:, :] else: - invalidInputError(False, 'Pooling method not supported') - indices = attn_cache.topk(attn_config.max_capacity_prompt - attn_config.window_size, - dim=-1).indices - indices = indices.unsqueeze(-1).expand(-1, -1, -1, head_dim) - k_past_compress = key_states[:, :, :-attn_config.window_size, :].gather(dim=2, - index=indices) - v_past_compress = value_states[:, :, :-attn_config.window_size, :].gather(dim=2, - index=indices) - k_cur = key_states[:, :, -attn_config.window_size:, :] - v_cur = value_states[:, :, -attn_config.window_size:, :] - key_states = torch.cat([k_past_compress, k_cur], dim=2) - value_states = torch.cat([v_past_compress, v_cur], dim=2) - return key_states, value_states + key_states_expand = repeat_kv(key_states, num_key_value_groups).to(key_states.device) + attn_weights = torch.matmul(query_states[..., -attn_config.window_size:, :], + key_states_expand.transpose(2, 3)) / 
math.sqrt(head_dim) + mask = torch.full((attn_config.window_size, attn_config.window_size), + torch.finfo(attn_weights.dtype).min, + device=attn_weights.device) + mask_cond = torch.arange(mask.size(-1), device=attn_weights.device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(attn_weights.device) + attention_mask = mask[None, None, :, :] + + attn_weights[:, :, -attn_config.window_size:, + -attn_config.window_size:] += attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, + dtype=torch.float32).to(query_states.dtype) + attn_weights_sum = attn_weights[:, :, -attn_config.window_size:, + :-attn_config.window_size].sum(dim=-2) + if attn_config.pooling == 'avgpool': + if num_key_value_groups > 1: + attn_cache = F.avg_pool2d(attn_weights_sum, + kernel_size=(num_key_value_groups, + attn_config.kernel_size), + padding=(0, attn_config.kernel_size//2), + stride=(num_key_value_groups, 1)) + else: + attn_cache = F.avg_pool1d(attn_weights_sum, kernel_size=attn_config.kernel_size, + padding=attn_config.kernel_size//2, stride=1) + elif attn_config.pooling == 'maxpool': + if num_key_value_groups > 1: + attn_cache = F.max_pool2d(attn_weights_sum, + kernel_size=(num_key_value_groups, + attn_config.kernel_size), + padding=(0, attn_config.kernel_size//2), + stride=(num_key_value_groups, 1)) + else: + attn_cache = F.max_pool1d(attn_weights_sum, kernel_size=attn_config.kernel_size, + padding=attn_config.kernel_size//2, stride=1) + else: + invalidInputError(False, 'Pooling method not supported') + indices = attn_cache.topk(attn_config.max_capacity_prompt - attn_config.window_size, + dim=-1).indices + indices = indices.unsqueeze(-1).expand(-1, -1, -1, head_dim) + k_past_compress = key_states[:, :, :-attn_config.window_size, :]\ + .gather(dim=2, index=indices) + v_past_compress = value_states[:, :, :-attn_config.window_size, :]\ + .gather(dim=2, index=indices) + k_cur = key_states[:, :, -attn_config.window_size:, :] + v_cur = value_states[:, :, -attn_config.window_size:, :] + key_states = torch.cat([k_past_compress, k_cur], dim=2) + value_states = torch.cat([v_past_compress, v_cur], dim=2) + return key_states, value_states class DynamicCompressCache(DynamicCache): - def __init__(self, *args, **kwargs): + def __init__(self, quant_kv=False, *args, **kwargs): super().__init__(*args, **kwargs) self.real_kv_len = 0 @@ -257,8 +279,6 @@ def update( value_states=value_states, attention_mask=attention_mask, num_key_value_groups=num_key_value_groups) - self.key_cache.append(key_states_compress) - self.value_cache.append(value_states_compress) k_cache_compressed, v_cache_compressed = init_kv_cache( bsz, num_heads, head_dim, @@ -268,8 +288,8 @@ def update( k_cache_compressed, v_cache_compressed = append_kv_cache( k_cache_compressed, v_cache_compressed, key_states_compress, value_states_compress) - self.key_cache[layer_idx] = k_cache_compressed - self.value_cache[layer_idx] = v_cache_compressed + self.key_cache.append(k_cache_compressed) + self.value_cache.append(v_cache_compressed) if key_states.stride(2) != head_dim: k_cache, v_cache = init_kv_cache( @@ -277,7 +297,8 @@ def update( 0, key_states.size(2), key_states.dtype, key_states.device ) - k_cache, v_cache = append_kv_cache(k_cache, v_cache, key_states, value_states) + k_cache, v_cache = append_kv_cache(k_cache, v_cache, + key_states, value_states) return k_cache, v_cache else: return key_states, value_states @@ -286,13 +307,14 @@ def update( cache_v = self.value_cache[layer_idx] if not enough_kv_room: 
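# --- Illustrative aside (not part of the patch) ---------------------------------
# compress_kv above implements SnapKV-style prompt compression: score each prompt
# position by the attention the last `window_size` queries pay to it, then keep only
# the top-scoring positions plus the recent window. A minimal sketch of that
# selection (no causal mask, pooling, or GQA handling; shapes and names are
# assumptions for the sketch only):
import torch
import torch.nn.functional as F

def select_prompt_kv(key_states, value_states, query_states,
                     window_size=32, max_capacity_prompt=512):
    # all tensors: [bsz, n_head, seq_len, head_dim]
    head_dim = key_states.size(-1)
    scores = query_states[:, :, -window_size:, :] @ key_states.transpose(2, 3)
    scores = F.softmax(scores / head_dim ** 0.5, dim=-1, dtype=torch.float32)
    votes = scores[..., :-window_size].sum(dim=-2)   # attention received per prompt position
    keep = votes.topk(max_capacity_prompt - window_size, dim=-1).indices
    keep = keep.unsqueeze(-1).expand(-1, -1, -1, head_dim)
    k_new = torch.cat([key_states[:, :, :-window_size, :].gather(2, keep),
                       key_states[:, :, -window_size:, :]], dim=2)
    v_new = torch.cat([value_states[:, :, :-window_size, :].gather(2, keep),
                       value_states[:, :, -window_size:, :]], dim=2)
    return k_new, v_new
# ---------------------------------------------------------------------------------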
# allocate new - new_c_k, new_c_v = extend_kv_cache(bsz, - num_heads, # Support GQA - head_dim, - cache_k.size(2), - cache_k.size(2) + KV_CACHE_ALLOC_BLOCK_LENGTH, - dtype=cache_k.dtype, - device=query_states.device) + new_c_k, new_c_v = extend_kv_cache( + bsz, + num_heads, # Support GQA + head_dim, + cache_k.size(2), + cache_k.size(2) + KV_CACHE_ALLOC_BLOCK_LENGTH, + dtype=cache_k.dtype, + device=query_states.device) new_c_k[:] = cache_k new_c_v[:] = cache_v @@ -316,3 +338,75 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: if len(self.key_cache) <= layer_idx: return 0 return self.real_kv_len + + +class DynamicCompressFp8Cache(DynamicCompressCache, DynamicFp8Cache): + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + query_states: torch.Tensor, + attention_mask: torch.Tensor, + num_key_value_groups: int, + attn_config: Dict[str, Any], + enough_kv_room: bool, + KV_CACHE_ALLOC_BLOCK_LENGTH: int, + cache_kwargs: Optional[Dict[str, Any]]=None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + bsz, num_heads, seq_len, head_dim = key_states.shape + + if layer_idx == 0: + if hasattr(self, "_seen_tokens"): + # 4.39 uses `_seen_tokens` + self._seen_tokens += seq_len + else: + # 4.37 uses `seen_tokens` + self.seen_tokens += seq_len + self.real_kv_len += seq_len + + # Update the cache + if len(self.key_cache) <= layer_idx: + # First token, compress kv cache + key_states_compress, value_states_compress = compress_kv( + attn_config=attn_config, + key_states=key_states, + query_states=query_states, + value_states=value_states, + attention_mask=attention_mask, + num_key_value_groups=num_key_value_groups) + + k_cache_compressed, v_cache_compressed = init_fp8_kv_cache( + bsz, num_heads, seq_len, head_dim, + device=key_states.device, + ) + + k_cache_compressed, v_cache_compressed = append_fp8_kv_cache( + k_cache_compressed, v_cache_compressed, + key_states_compress, value_states_compress) + self.key_cache.append(k_cache_compressed) + self.value_cache.append(v_cache_compressed) + + if key_states.stride(2) != head_dim: + k_cache, v_cache = init_fp8_kv_cache( + bsz, num_heads, 0, head_dim, key_states.device + ) + k_cache, v_cache = append_fp8_kv_cache(k_cache, v_cache, + key_states, value_states) + return k_cache, v_cache + else: + return key_states, value_states + else: + cache_k = self.key_cache[layer_idx] + cache_v = self.value_cache[layer_idx] + + key_states, value_states = append_fp8_kv_cache(cache_k, + cache_v, + key_states, + value_states) + + # update past_key_value + self.key_cache[layer_idx] = key_states + self.value_cache[layer_idx] = value_states + + return self.key_cache[layer_idx], self.value_cache[layer_idx] diff --git a/python/llm/src/ipex_llm/transformers/lookup.py b/python/llm/src/ipex_llm/transformers/lookup.py index 36815902445..c5fe81d49ab 100644 --- a/python/llm/src/ipex_llm/transformers/lookup.py +++ b/python/llm/src/ipex_llm/transformers/lookup.py @@ -21,6 +21,7 @@ # from typing import Callable, List, Optional, Tuple +import os import torch import time import copy @@ -39,6 +40,9 @@ original_generate = GenerationMixin.generate query_group_size = 16 +# may tune it with more tested data +PERFORMANCE_MODE_LOOKUP_INPUT_THRESHOLD = 100 + @torch.no_grad() def generate( @@ -54,6 +58,28 @@ def generate( **kwargs, ): lookahead = kwargs.pop("lookahead", None) + perf_mode = os.environ.get("IPEX_LLM_PERFORMANCE_MODE", None) + + input_tensor_shape = None + is_inputs_embeds = False + if inputs is not None: + input_tensor_shape = inputs.shape 
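# --- Illustrative aside (not part of the patch) ---------------------------------
# The PromptLookupCandidateGenerator changes below build on plain prompt-lookup
# decoding: index n-grams seen so far and, at each step, draft the tokens that
# followed the most recent matching n-gram, then verify them with the full model.
# A minimal sketch of the lookup step (single n-gram size, batch size 1 assumed):
import torch

def build_ngram_table(input_ids: torch.Tensor, ngram_size: int = 2) -> dict:
    ids = input_ids[0].tolist()
    table = {}
    for i in range(len(ids) - ngram_size):
        # remember where the continuation of this n-gram starts (latest occurrence wins)
        table[tuple(ids[i:i + ngram_size])] = i + ngram_size
    return table

def propose_draft(input_ids: torch.Tensor, table: dict,
                  ngram_size: int = 2, num_output_tokens: int = 10) -> torch.Tensor:
    ids = input_ids[0].tolist()
    start = table.get(tuple(ids[-ngram_size:]))
    if start is None:
        return input_ids                     # no match: nothing to draft this step
    draft = ids[start:start + num_output_tokens]
    draft = torch.tensor([draft], dtype=input_ids.dtype, device=input_ids.device)
    return torch.cat([input_ids, draft], dim=-1)
# ---------------------------------------------------------------------------------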
+ else: + input_ids = kwargs.get("input_ids", None) + if input_ids is not None: + input_tensor_shape = input_ids.shape + else: + inputs_embeds = kwargs.get("inputs_embeds", None) + if inputs_embeds is not None: + is_inputs_embeds = True + input_tensor_shape = inputs_embeds.shape + + if perf_mode == "1" and lookahead is None: + if input_tensor_shape is not None and \ + input_tensor_shape[1] >= PERFORMANCE_MODE_LOOKUP_INPUT_THRESHOLD \ + and not is_inputs_embeds: + lookahead = 2 # default to 2 now + if lookahead: from ipex_llm.transformers.convert import get_enable_ipex _enable_ipex = get_enable_ipex() @@ -61,7 +87,15 @@ def generate( if self.device.type == "cpu" and _enable_ipex: logger.warning("Prompt lookup is currently not supported on CPU with IPEX, " "fallback to original generate.") - kwargs.pop("max_matching_ngram_size") + kwargs.pop("max_matching_ngram_size", None) + elif input_tensor_shape is not None and input_tensor_shape[0] > 1: + logger.warning("Prompt lookup is currently not supported with batch inference, " + "fallback to original generate.") + kwargs.pop("max_matching_ngram_size", None) + elif kwargs.get("num_beams", None) not in [None, 1]: + logger.warning("Prompt lookup is currently not supported with num_beams != 1, " + "fallback to original generate.") + kwargs.pop("max_matching_ngram_size", None) else: # Do prompt lookup generation # If lookahead is provided, we will use lookup_generate instead of @@ -80,6 +114,7 @@ def generate( return self.lookup_generate(inputs=inputs, num_output_tokens=lookahead, generation_config=generation_config, + streamer=streamer, logits_processor=logits_processor, stopping_criteria=stopping_criteria, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, @@ -143,9 +178,9 @@ def __init__( def init_look_up_table(self, input_ids: torch.LongTensor): - for ngram_size in range(self.max_matching_ngram_size, 0, -1): + for ngram_size in range(min(self.max_matching_ngram_size, input_ids.shape[1]), 0, -1): # Create sliding windows of size ngram_size - windows = input_ids.unfold(dimension=1, size=ngram_size, step=1) + windows = input_ids.cpu().unfold(dimension=1, size=ngram_size, step=1) for idx in range(windows.size(1)): window = tensor2key(windows[0, idx]) if window not in self.lookup_table: @@ -240,12 +275,19 @@ def lookup_generate(self, num_output_tokens: int = 10, max_matching_ngram_size: int = None, generation_config: Optional[GenerationConfig] = None, + streamer: Optional["BaseStreamer"] = None, attention_mask=None, **sampling_kwargs): input_ids, generation_config, logits_processor, stopping_criteria, \ model_kwargs = _prepare_generate_args(self, inputs, generation_config, **sampling_kwargs) + invalidInputError(input_ids.shape[0] == 1, + "Prompt lookup is currently not supported with batch inference.") + + if streamer is not None: + streamer.put(input_ids.cpu()) + device_name = get_xpu_device_type(input_ids) candidates_generator = PromptLookupCandidateGenerator( @@ -262,6 +304,13 @@ def lookup_generate(self, past_key_values = None input_len = input_ids.shape[1] + eos_token_id_set = None + if generation_config.eos_token_id is not None: + if isinstance(generation_config.eos_token_id, list): + eos_token_id_set = set(generation_config.eos_token_id) + else: + eos_token_id_set = set([generation_config.eos_token_id]) + while True: if step >= max_new_tokens: break @@ -269,11 +318,9 @@ def lookup_generate(self, if step == 0: # first token use full model tic = time.time() - output = self(input_ids=input_ids, - past_key_values=past_key_values, - 
attention_mask=attention_mask, - return_dict=True, - use_cache=True) + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + output = self(**model_inputs, + return_dict=True) logits = output['logits'] logits = logits[:, -1:] logits[:, -1, :] = logits_processor(input_ids, logits[:, -1, :]) @@ -364,8 +411,9 @@ def lookup_generate(self, accept_rate = self.n_matched/self.n_drafted if self.n_drafted > 0 else 1 self.accept_rate.append(accept_rate) # Update the candidate generation strategy if needed - candidates_generator.update_candidate_strategy(candidate_length, n_matches, - accept_rate) + if device_name != 'mtl': + candidates_generator.update_candidate_strategy(candidate_length, n_matches, + accept_rate) input_ids = torch.cat((input_ids, output_ids), dim=-1) candidates_generator.update_look_up_table(input_ids) @@ -376,15 +424,27 @@ def lookup_generate(self, self.post_time.append(pot-mot) # Stop on eos and remove content after eos - output_ids_list = output_ids[0].tolist() - if generation_config.eos_token_id in output_ids_list: - idx = output_ids_list.index(generation_config.eos_token_id) - step -= (len(output_ids_list) - idx - 1) - break + if eos_token_id_set is not None: + output_ids_list = output_ids[0].tolist() + first_eos_idx = -1 + for out_idx, out_id in enumerate(output_ids_list): + if out_id in eos_token_id_set: + first_eos_idx = out_idx + break + if first_eos_idx > -1: + if streamer is not None: + streamer.put(output_ids[:(first_eos_idx + 1)].cpu()) + step -= (len(output_ids_list) - first_eos_idx - 1) + break + if streamer is not None: + streamer.put(output_ids.cpu()) step = min(step, max_new_tokens) e2e_toc = time.time() self.n_token_generated = step self.e2e_time_without_first = e2e_toc - e2e_tic + if streamer is not None: + streamer.end() + return input_ids[:, : input_len + step] diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index c30ca4a284e..fddcb7c9440 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -73,6 +73,7 @@ MOFQ8 = ggml_tensor_qtype["mixed_fp8"] FP8E5 = ggml_tensor_qtype["fp8_e5m2"] FP6 = ggml_tensor_qtype["fp6"] +FP16 = ggml_tensor_qtype["fp16"] IQ2_XXS = ggml_tensor_qtype["gguf_iq2_xxs"] IQ2_XS = ggml_tensor_qtype["gguf_iq2_xs"] Q2_K = ggml_tensor_qtype["q2_k"] @@ -228,6 +229,13 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, dst_tensor = dst_tensor.reshape(tensor.shape[0], tensor.shape[-1] // QK) scale = torch.empty(n // k, dtype=torch.float32, device=device) + elif qtype == NF4: + # Deepspeed zero3 requires unified dtype, + # thus here uses bfloat16 consistent to other layers + # dst_size above is computed based on uint8, and for bfloat16, + # buffer size should be half + dst_tensor = torch.empty(dst_size // 2, dtype=torch.bfloat16, + device=device) else: dst_tensor = torch.empty(dst_size, dtype=torch.uint8, device=device) @@ -259,12 +267,15 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int): - - invalidInputError(tensor.dtype == torch.uint8, - "Input tensor must be uint8") + if qtype == NF4: + invalidInputError(tensor.dtype == torch.bfloat16, + "NF4 Input tensor must be bfloat16") + else: + invalidInputError(tensor.dtype == torch.uint8, + "Input tensor except NF4 must be uint8") invalidInputError(tensor.device == torch.device('cpu'), - "Input tensor must be uint8") + "Input tensor 
must be on cpu") src = ctypes.c_void_p(tensor.data.data_ptr()) @@ -369,7 +380,6 @@ def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int): # Rename to FP4Params to trigger initializing # the params layer with all parameters on the CPU -# https://github.com/huggingface/accelerate/blob/main/src/accelerate/utils/modeling.py#L333 class FP4Params(torch.nn.Parameter): def __new__(cls, data=None, @@ -581,7 +591,13 @@ class MatMulLowBit(torch.autograd.Function): def forward(ctx, A, weight, input_seq_size): ctx.is_empty = False import xe_linear - result = xe_linear.forward_new(A, weight.data, weight.qtype, input_seq_size) + if weight.qtype == NF4: + result = xe_linear.forward_new(A, + weight.data.view(torch.uint8), + weight.qtype, + input_seq_size) + else: + result = xe_linear.forward_new(A, weight.data, weight.qtype, input_seq_size) if any(ctx.needs_input_grad[:2]): ctx.tensors = (A, weight) else: @@ -601,7 +617,12 @@ def backward(ctx, grad_output): if req_gradA: if torch.xpu.is_autocast_xpu_enabled(): grad_output = grad_output.to(torch.xpu.get_autocast_xpu_dtype()) - dequant_weight = xe_linear.dequant(A, weight.data, weight.qtype) + if weight.qtype == NF4: + dequant_weight = xe_linear.dequant(A, + weight.data.view(torch.uint8), + weight.qtype) + else: + dequant_weight = xe_linear.dequant(A, weight.data, weight.qtype) grad_A = torch.matmul(grad_output, dequant_weight.reshape(weight._shape)) return grad_A, grad_weight, None @@ -736,9 +757,16 @@ def forward(self, x: torch.Tensor): if x_2d.requires_grad: result = MatMulLowBit.apply(x_2d, self.weight, input_seq_size) else: - result = xe_linear.forward_new(x_2d, self.weight.data, - self.weight.qtype, - input_seq_size) + if self.weight.qtype == NF4: + result = xe_linear.forward_new(x_2d, + self.weight.data.view(torch.uint8), + self.weight.qtype, + input_seq_size) + else: + result = xe_linear.forward_new(x_2d, + self.weight.data, + self.weight.qtype, + input_seq_size) elif self.enable_xetla: x_2d = x_2d.half() result = xe_linear.mm_xetla(x_2d, self.weight.data, self.qtype) @@ -981,3 +1009,38 @@ def forward(self, x: torch.Tensor): result = result.reshape(*original_shape[:-1], result.shape[-1]) return result.to(x.dtype) + + +class vLLMLowBitLinear(LowBitLinear): + def __init__(self, input_features, output_features, qtype, bias=True, + conver_to_half=True, mp_group=None, enable_xetla=False, + optimize_lm_head=False, act_order=False, + enable_scale_search=False): + super().__init__(input_features, output_features, qtype, bias, conver_to_half, mp_group, + enable_xetla, optimize_lm_head, act_order, enable_scale_search) + + def forward(self, x: torch.Tensor): + result = super().forward(x) + return result, None + + +class vLLMFP16Linear(FP16Linear): + def __init__(self, input_features, output_features, bias=True, mp_group=None, weight_type=1, + enable_xetla=False, optimize_lm_head=False): + super().__init__(input_features, output_features, bias, mp_group, weight_type, + enable_xetla, optimize_lm_head) + + def forward(self, x: torch.Tensor): + result = super().forward(x) + return result, None + + +class vLLMBF16Linear(BF16Linear): + def __init__(self, input_features, output_features, bias=True, mp_group=None, + compute_dtype=None, enable_xetla=False, optimize_lm_head=False): + super().__init__(input_features, output_features, bias, mp_group, compute_dtype, + enable_xetla, optimize_lm_head) + + def forward(self, x: torch.Tensor): + result = super().forward(x) + return result, None diff --git a/python/llm/src/ipex_llm/transformers/model.py 
b/python/llm/src/ipex_llm/transformers/model.py index d9dd6354970..7290acaed88 100644 --- a/python/llm/src/ipex_llm/transformers/model.py +++ b/python/llm/src/ipex_llm/transformers/model.py @@ -396,6 +396,8 @@ def from_pretrained(cls, from .lookup import lookup_generate import types model.lookup_generate = types.MethodType(lookup_generate, model) + if model.config.model_type == "minicpmv" and hasattr(model, 'llm'): + model.llm.lookup_generate = types.MethodType(lookup_generate, model.llm) else: # load default model = cls.HF_Model.from_pretrained(*args, **kwargs) diff --git a/python/llm/src/ipex_llm/transformers/models/baichuan.py b/python/llm/src/ipex_llm/transformers/models/baichuan.py index c74e97543ef..9d41279244b 100644 --- a/python/llm/src/ipex_llm/transformers/models/baichuan.py +++ b/python/llm/src/ipex_llm/transformers/models/baichuan.py @@ -19,17 +19,25 @@ # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/c6f8592a60b4ad73c210b28dd2ab3cca51abbf93/modeling_baichuan.py import math -from typing import Optional, Tuple +from typing import List, Optional, Tuple, Union import torch import torch.utils.checkpoint from torch.nn import functional as F -from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp8_kv_cache +from transformers.modeling_outputs import BaseModelOutputWithPast +from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp8_kv_cache, \ + should_use_compresskv, get_compresskv_attn_mask from ipex_llm.transformers.models.utils import update_past_key_value from ipex_llm.transformers.models.utils import should_use_fuse_rope from ipex_llm.transformers.models.utils import use_flash_attention, use_sdp, use_sdp_causal from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, SILU from ipex_llm.transformers.models.utils import mlp_fusion_check +from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_36 +from ipex_llm.transformers.kv import DynamicCompressFp8Cache, DynamicCompressCache +from ipex_llm.transformers.models.utils import extend_kv_cache, append_kv_cache import warnings +import os + +KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)) def pre_compute_inv_freq(module: torch.nn.Module): @@ -71,6 +79,161 @@ def baichuan_mlp_forward( return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) +def baichuan_model_7b_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None \ + else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # IPEX-LLM OPT: compress kv and quantize kv + if use_cache: + inputs = input_ids if input_ids is not None else inputs_embeds + use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) + use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, 
inputs) + if use_compress_kv and not isinstance(past_key_values, + DynamicCompressCache): + if use_quantize_kv: + past_key_values = DynamicCompressFp8Cache.from_legacy_cache(past_key_values) + else: + past_key_values = DynamicCompressCache.from_legacy_cache(past_key_values) + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at \ + the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + log4Error.invalidInputError("You have to specify either decoder_input_ids \ + or decoder_inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + # IPEX-LLM OPT: compress kv + if isinstance(past_key_values, DynamicCompressCache): + past_key_values_length = past_key_values.get_seq_length() + else: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange(past_key_values_length, seq_length + past_key_values_length, + dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + # embed positions + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + # IPEX-LLM OPT: compress kv + use_compresskv = isinstance(past_key_values, DynamicCompressCache) + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # IPEX-LLM OPT: compress kv + if not use_compresskv: + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, None) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + None, + ) + else: + # IPEX-LLM OPT: compress kv + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values if use_compresskv else past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + # IPEX-LLM OPT: compress kv + if use_compresskv: + next_decoder_cache = past_key_values + else: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = 
self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def baichuan_attention_forward_7b( self, hidden_states: torch.Tensor, @@ -83,6 +246,9 @@ def baichuan_attention_forward_7b( bsz, q_len, _ = hidden_states.size() device = hidden_states.device + # [CompressKV] + use_compresskv = isinstance(past_key_value, DynamicCompressCache) + qkv = self.W_pack(hidden_states) qkv = qkv.view(bsz, q_len, self.num_heads * 3, self.head_dim) qkv = qkv.transpose(1, 2) @@ -92,7 +258,12 @@ def baichuan_attention_forward_7b( kv_seq_len = key_states.shape[2] if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[2] + # [CompressKV] + if use_compresskv: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, + self.layer_idx) + else: + kv_seq_len += past_key_value[0].shape[2] # IPEX-LLM OPT: fuse rope if should_use_fuse_rope(hidden_states, position_ids, self.training): @@ -108,11 +279,22 @@ def baichuan_attention_forward_7b( # IPEX-LLM OPT: kv cache and quantize kv use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states) - key_states, value_states = update_past_key_value( - past_key_value, key_states, value_states, - kv_seq_len, use_quantize_kv, device - ) - past_key_value = (key_states, value_states) if use_cache else None + + # [CompressKV] + if use_compresskv: + enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, + self.layer_idx, + q_len) + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, + query_states, attention_mask, 1, + self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH) + else: + key_states, value_states = update_past_key_value( + past_key_value, key_states, value_states, + kv_seq_len, use_quantize_kv, device + ) + past_key_value = (key_states, value_states) if use_cache else None if self.training: warnings.warn("xops is not supported on Intel GPU, so just use normal implementation") @@ -127,6 +309,8 @@ def baichuan_attention_forward_7b( is_causal=True).to(hidden_states.dtype) elif use_sdp(q_len, kv_seq_len, self.head_dim, query_states): import xe_addons + if use_compresskv: + attention_mask = get_compresskv_attn_mask(key_states, attention_mask) if use_quantize_kv: attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states, attention_mask) diff --git a/python/llm/src/ipex_llm/transformers/models/chatglm2.py b/python/llm/src/ipex_llm/transformers/models/chatglm2.py index fe504960190..b13943020b4 100644 --- a/python/llm/src/ipex_llm/transformers/models/chatglm2.py +++ b/python/llm/src/ipex_llm/transformers/models/chatglm2.py @@ -17,6 +17,7 @@ # https://huggingface.co/THUDM/chatglm2-6b/blob/8eb45c842594b8473f291d0f94e7bbe86ffc67d8/modeling_chatglm.py # +import os import math import torch from typing import Optional, Tuple @@ -27,7 +28,9 @@ from ipex_llm.transformers.models.utils import should_use_fuse_rope, apply_rotary_pos_emb from ipex_llm.transformers.models.utils import use_quantize_kv_cache, use_sdp, \ use_sdp_causal, should_use_compresskv, is_enough_kv_cache_room_4_36 -from ipex_llm.transformers.kv import DynamicCompressCache +from ipex_llm.transformers.kv 
import DynamicCompressCache, DynamicCompressFp8Cache + +KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)) def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: @@ -87,12 +90,15 @@ def chatglm2_model_forward( dtype=inputs_embeds.dtype, device=inputs_embeds.device) if use_cache: - use_compress_kv = should_use_compresskv(input_ids, input_ids.shape[-1]) + use_compress_kv = should_use_compresskv(input_ids, input_ids.shape[1]) use_quantize_kv = use_quantize_kv_cache(self.encoder.layers[0].mlp.dense_h_to_4h, input_ids) - if use_compress_kv and not use_quantize_kv and not isinstance(past_key_values, - DynamicCompressCache): - past_key_values = DynamicCompressCache.from_legacy_cache(past_key_values) + if use_compress_kv and not isinstance(past_key_values, + DynamicCompressCache): + if use_quantize_kv: + past_key_values = DynamicCompressFp8Cache.from_legacy_cache(past_key_values) + else: + past_key_values = DynamicCompressCache.from_legacy_cache(past_key_values) if full_attention_mask is None: if (attention_mask is not None and not attention_mask.all()) or ( @@ -108,7 +114,10 @@ def chatglm2_model_forward( if past_key_values is None: position_ids = torch.arange(seq_length, dtype=torch.int64, device=inputs_embeds.device) else: - kv_length = past_key_values[0][0].size(0) + if isinstance(past_key_values, DynamicCompressCache): + kv_length = past_key_values.get_seq_length() + else: + kv_length = past_key_values[0][0].size(0) position_ids = torch.arange(kv_length, kv_length + seq_length, dtype=torch.int64, device=inputs_embeds.device) position_ids = position_ids.repeat(batch_size, 1) @@ -276,7 +285,20 @@ def chatglm2_attention_forward( # IPEX-LLM OPT: kv cache and quantize kv use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states) - if use_quantize_kv or (not use_compresskv): + + # [CompressKV] + if use_compresskv: + from transformers.configuration_utils import PretrainedConfig + self.config = self.config if hasattr(self, "config") else PretrainedConfig() + enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, + self.layer_number - 1, + q_len) + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_number - 1, + query_states, attention_mask, n_head // n_kv_head, + self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH + ) + else: key_states, value_states = update_past_key_value( past_key_value, key_states, value_states, kv_seq_len, use_quantize_kv, hidden_states.device @@ -284,20 +306,13 @@ def chatglm2_attention_forward( # past_key_value: [bsz, n_kv_head, seq_len, head_dim] -> [seq_len, bsz, n_kv_head, head_dim] past_key_value = (key_states.permute(2, 0, 1, 3), value_states.permute(2, 0, 1, 3)) if use_cache else None - else: - from transformers.configuration_utils import PretrainedConfig - self.config = self.config if hasattr(self, "config") else PretrainedConfig() - enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_number - 1) - key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_number - 1, - query_states, attention_mask, n_head // n_kv_head, - self.config, enough_kv_room, 256 - ) # IPEX-LLM OPT: sdp attn_weights = None if use_sdp(q_len, kv_seq_len, head_dim, query_states): import xe_addons + if use_compresskv and attention_mask is not None: + attention_mask = None if use_quantize_kv: attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states, attention_mask) else: diff --git 
a/python/llm/src/ipex_llm/transformers/models/chatglm4.py b/python/llm/src/ipex_llm/transformers/models/chatglm4.py index 46d7d78035b..2daffedb118 100644 --- a/python/llm/src/ipex_llm/transformers/models/chatglm4.py +++ b/python/llm/src/ipex_llm/transformers/models/chatglm4.py @@ -17,17 +17,21 @@ # https://huggingface.co/THUDM/chatglm2-6b-32k/blob/main/configuration_chatglm.py # +import os import torch from typing import Optional, Tuple, Union from ipex_llm.transformers.models.utils import restore_fp8_kv_cache, update_past_key_value from ipex_llm.transformers.models.utils import use_quantize_kv_cache, use_sdp, \ - use_sdp_causal, should_use_compresskv, is_enough_kv_cache_room_4_36 + use_sdp_causal, should_use_compresskv, is_enough_kv_cache_room_4_36, \ + get_compresskv_attn_mask from ipex_llm.transformers.models.utils import should_use_fuse_rope, apply_rotary_pos_emb from ipex_llm.transformers.models.chatglm2 import repeat_kv -from ipex_llm.transformers.kv import DynamicCompressCache +from ipex_llm.transformers.kv import DynamicCompressCache, DynamicCompressFp8Cache from transformers.modeling_outputs import BaseModelOutputWithPast import math +KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)) + def chatglm4_model_forward( self, @@ -50,12 +54,15 @@ def chatglm4_model_forward( if use_cache: inputs = input_ids if input_ids is not None else inputs_embeds - use_compress_kv = should_use_compresskv(inputs, inputs.shape[-1]) + use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) use_quantize_kv = use_quantize_kv_cache(self.encoder.layers[0].mlp.dense_h_to_4h, inputs) - if use_compress_kv and not use_quantize_kv and not isinstance(past_key_values, - DynamicCompressCache): - past_key_values = DynamicCompressCache.from_legacy_cache(past_key_values) + if use_compress_kv and not isinstance(past_key_values, + DynamicCompressCache): + if use_quantize_kv: + past_key_values = DynamicCompressFp8Cache.from_legacy_cache(past_key_values) + else: + past_key_values = DynamicCompressCache.from_legacy_cache(past_key_values) if inputs_embeds is None: batch_size, seq_length = input_ids.shape @@ -79,7 +86,10 @@ def chatglm4_model_forward( if past_key_values is None: position_ids = torch.arange(seq_length, dtype=torch.int64, device=inputs_embeds.device) else: - kv_length = past_key_values[0][0].size(2) + if isinstance(past_key_values, DynamicCompressCache): + kv_length = past_key_values.get_seq_length() + else: + kv_length = past_key_values[0][0].size(2) position_ids = torch.arange(kv_length, kv_length + seq_length, dtype=torch.int64, device=inputs_embeds.device) position_ids = position_ids.repeat(batch_size, 1) @@ -197,7 +207,19 @@ def chatglm4_attention_forward( # IPEX-LLM OPT: kv cache and quantize kv use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states) - if use_quantize_kv or (not use_compresskv): + # [CompressKV] + if use_compresskv: + from transformers.configuration_utils import PretrainedConfig + self.config = self.config if hasattr(self, "config") else PretrainedConfig() + enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, + self.layer_number - 1, + q_len) + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_number - 1, + query_states, attention_mask, n_head // n_kv_head, + self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH + ) + else: key_states, value_states = update_past_key_value( past_key_value, key_states, value_states, kv_seq_len, use_quantize_kv, hidden_states.device @@ -210,20 
+232,13 @@ def chatglm4_attention_forward(
                 past_key_value = (key_states, value_states)
         else:
             past_key_value = None
-    else:
-        from transformers.configuration_utils import PretrainedConfig
-        self.config = self.config if hasattr(self, "config") else PretrainedConfig()
-        enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_number - 1)
-        key_states, value_states = past_key_value.update(
-            key_states, value_states, self.layer_number - 1,
-            query_states, attention_mask, n_head // n_kv_head,
-            self.config, enough_kv_room, 256
-        )
     # IPEX-LLM OPT: sdp
     attn_weights = None
     if use_sdp(q_len, kv_seq_len, head_dim, query_states):
         import xe_addons
+        if use_compresskv:
+            attention_mask = get_compresskv_attn_mask(key_states, attention_mask)
         if use_quantize_kv:
             attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states,
                                             attention_mask)
         else:
diff --git a/python/llm/src/ipex_llm/transformers/models/common.py b/python/llm/src/ipex_llm/transformers/models/common.py
index e1522c4e957..215232e48f0 100644
--- a/python/llm/src/ipex_llm/transformers/models/common.py
+++ b/python/llm/src/ipex_llm/transformers/models/common.py
@@ -19,28 +19,36 @@
 
 
 def merge_linear(linears: List[torch.nn.Linear]) -> torch.nn.Linear:
-    new_weight = torch.cat(list(linear.weight.data for linear in linears), dim=0)
-    if linears[0].bias is not None:
-        new_linear = torch.nn.Linear(0, 0, bias=True)
-        new_bias = torch.cat(list(linear.bias.data for linear in linears), dim=0)
-        new_linear.bias = torch.nn.Parameter(new_bias, requires_grad=False)
+    if hasattr(linears[0], "weight"):
+        # For GPTQ model, it might be qweight
+        new_weight = torch.cat(list(linear.weight.data for linear in linears), dim=0)
+        if linears[0].bias is not None:
+            new_linear = torch.nn.Linear(0, 0, bias=True)
+            new_bias = torch.cat(list(linear.bias.data for linear in linears), dim=0)
+            new_linear.bias = torch.nn.Parameter(new_bias, requires_grad=False)
+        else:
+            new_linear = torch.nn.Linear(0, 0, bias=False)
+        new_linear.weight = torch.nn.Parameter(new_weight, requires_grad=False)
+        new_linear.in_features = new_weight.size(1)
+        new_linear.out_features = new_weight.size(0)
+        return new_linear
     else:
-        new_linear = torch.nn.Linear(0, 0, bias=False)
-    new_linear.weight = torch.nn.Parameter(new_weight, requires_grad=False)
-    new_linear.in_features = new_weight.size(1)
-    new_linear.out_features = new_weight.size(0)
-    return new_linear
+        return None
 
 
 def merge_qkv_base(module: torch.nn.Module, attention_class):
-    if isinstance(module, attention_class):
+    if (
+        isinstance(attention_class, str) and module.__class__.__name__ == attention_class
+        or not isinstance(attention_class, str) and isinstance(module, attention_class)
+    ):
         qkv_proj = merge_linear([
             module.q_proj,
             module.k_proj,
             module.v_proj,
         ])
-        module.qkv_proj = qkv_proj
-        del module.q_proj, module.k_proj, module.v_proj
+        if qkv_proj is not None:
+            module.qkv_proj = qkv_proj
+            del module.q_proj, module.k_proj, module.v_proj
 
 
 def fuse_mlp_base(module: torch.nn.Module, act: int, x: torch.Tensor):
@@ -59,3 +67,13 @@ def fuse_mlp_base(module: torch.nn.Module, act: int, x: torch.Tensor):
         )
     else:
         return module.down_proj(module.act_fn(module.gate_proj(x)) * module.up_proj(x))
+
+
+def attention_softmax(attn_weights: torch.Tensor, training: bool):
+    if attn_weights.is_contiguous() and attn_weights.device.type == "xpu" and not training:
+        import xe_addons
+        xe_addons.attn_softmax_inplaced(attn_weights)
+    else:
+        attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1,
+                                                   dtype=torch.float32).to(attn_weights.dtype)
+    return attn_weights
diff --git a/python/llm/src/ipex_llm/transformers/models/gemma2.py b/python/llm/src/ipex_llm/transformers/models/gemma2.py
index 33201864223..07f8314a021 100644
--- a/python/llm/src/ipex_llm/transformers/models/gemma2.py
+++ b/python/llm/src/ipex_llm/transformers/models/gemma2.py
@@ -129,7 +129,8 @@ def gemma2_attention_forward(
     # IPEX_LLM OPT: sdp
     kv_seq_len = q_len if past_key_value is None else past_key_value.kv_seq_len
     if (use_sdp_causal(q_len, kv_seq_len, -1, query_states, self.training)
-            and kv_seq_len <= key_states.size(2)):
+            and kv_seq_len <= key_states.size(2) and
+            (self.sliding_window is None or kv_seq_len < self.sliding_window)):
         import xe_addons
         attn_weights = None
         attn_output = xe_addons.gemma2_sdp_causal(query_states,
@@ -141,10 +142,15 @@ def gemma2_attention_forward(
     elif use_sdp(q_len, kv_seq_len, -1, query_states):
         import xe_addons
         attn_weights = None
+        if self.sliding_window is not None:
+            attn_mask = attention_mask[:, :, :q_len, : key_states.shape[-2]]
+        else:
+            attn_mask = attention_mask
+
         attn_output = xe_addons.gemma2_sdp(query_states,
-                                           key_states[:, :, :kv_seq_len, :],
-                                           value_states[:, :, :kv_seq_len, :],
-                                           attention_mask[:, :, :q_len, :kv_seq_len],
+                                           key_states,
+                                           value_states,
+                                           attn_mask,
                                            self.config.attn_logit_softcapping,
                                            self.scaling)
     else:
diff --git a/python/llm/src/ipex_llm/transformers/models/llama.py b/python/llm/src/ipex_llm/transformers/models/llama.py
index e1b2d5f11b1..873407fbddd 100644
--- a/python/llm/src/ipex_llm/transformers/models/llama.py
+++ b/python/llm/src/ipex_llm/transformers/models/llama.py
@@ -42,7 +42,8 @@
 from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache
 from ipex_llm.transformers.models.utils import SILU
 from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \
-    restore_fp8_kv_cache, use_quantize_kv_cache, should_use_compresskv
+    restore_fp8_kv_cache, use_quantize_kv_cache, should_use_compresskv, \
+    get_compresskv_attn_mask
 from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \
     apply_rotary_pos_emb, is_enough_kv_cache_room_4_36
 from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu
@@ -119,19 +120,27 @@ def llama_model_forward_4_36(
     output_hidden_states: Optional[bool] = None,
     return_dict: Optional[bool] = None,
 ) -> Union[Tuple, BaseModelOutputWithPast]:
-    from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicCompressCache
+    from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicCompressCache, \
+        DynamicCompressFp8Cache
     use_cache = use_cache if use_cache is not None else self.config.use_cache
     input = input_ids if input_ids is not None else inputs_embeds
     if use_cache:
-        if use_quantize_kv_cache(self.layers[0].mlp.up_proj, input,
-                                 self.config.num_attention_heads//self.config.num_key_value_heads):
+        use_quantize = use_quantize_kv_cache(
+            self.layers[0].mlp.up_proj, input,
+            self.config.num_attention_heads//self.config.num_key_value_heads)
+        use_compresskv = should_use_compresskv(input, input.shape[1]) or \
+            isinstance(past_key_values, DynamicCompressCache)
+        if use_compresskv:
+            if not isinstance(past_key_values, DynamicCompressCache):
+                if use_quantize:
+                    past_key_values = DynamicCompressFp8Cache.from_legacy_cache(
+                        past_key_values)
+                else:
+                    past_key_values = DynamicCompressCache.from_legacy_cache(
+                        past_key_values)
+        elif use_quantize:
             if not isinstance(past_key_values, DynamicFp8Cache):
                 past_key_values =
DynamicFp8Cache.from_legacy_cache(past_key_values) - elif should_use_compresskv(input, input.shape[-1]): - # if use quantize kv, compress kv will be ignored now - if not isinstance(past_key_values, DynamicCompressCache): - past_key_values = DynamicCompressCache.from_legacy_cache( - past_key_values) return llama_model_forward_4_36_internal( self=self, input_ids=input_ids, @@ -159,19 +168,27 @@ def llama_model_forward_4_38( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: - from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicCompressCache + from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicCompressCache, \ + DynamicCompressFp8Cache use_cache = use_cache if use_cache is not None else self.config.use_cache input = input_ids if input_ids is not None else inputs_embeds if use_cache: - if use_quantize_kv_cache(self.layers[0].mlp.up_proj, input, - self.config.num_attention_heads//self.config.num_key_value_heads): + use_quantize = use_quantize_kv_cache( + self.layers[0].mlp.up_proj, input, + self.config.num_attention_heads//self.config.num_key_value_heads) + use_compresskv = should_use_compresskv(input, input.shape[1]) or \ + isinstance(past_key_values, DynamicCompressCache) + if use_compresskv: + if not isinstance(past_key_values, DynamicCompressCache): + if use_quantize: + past_key_values = DynamicCompressFp8Cache.from_legacy_cache( + past_key_values) + else: + past_key_values = DynamicCompressCache.from_legacy_cache( + past_key_values) + elif use_quantize: if not isinstance(past_key_values, DynamicFp8Cache): past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values) - elif should_use_compresskv(input, input.shape[-1]): - # if use quantize kv, compress kv will be ignored now - if not isinstance(past_key_values, DynamicCompressCache): - past_key_values = DynamicCompressCache.from_legacy_cache( - past_key_values) return llama_model_forward_4_38_internal( self=self, input_ids=input_ids, @@ -200,19 +217,27 @@ def llama_model_forward_4_41( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: - from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicCompressCache + from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicCompressCache, \ + DynamicCompressFp8Cache use_cache = use_cache if use_cache is not None else self.config.use_cache input = input_ids if input_ids is not None else inputs_embeds if use_cache: - if use_quantize_kv_cache(self.layers[0].mlp.up_proj, input, - self.config.num_attention_heads//self.config.num_key_value_heads): + use_quantize = use_quantize_kv_cache( + self.layers[0].mlp.up_proj, input, + self.config.num_attention_heads//self.config.num_key_value_heads) + use_compresskv = should_use_compresskv(input, input.shape[1]) or \ + isinstance(past_key_values, DynamicCompressCache) + if use_compresskv: + if not isinstance(past_key_values, DynamicCompressCache): + if use_quantize: + past_key_values = DynamicCompressFp8Cache.from_legacy_cache( + past_key_values) + else: + past_key_values = DynamicCompressCache.from_legacy_cache( + past_key_values) + elif use_quantize: if not isinstance(past_key_values, DynamicFp8Cache): past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values) - elif should_use_compresskv(input, input.shape[-1]): - # if use quantize kv, compress kv will be ignored now - if not isinstance(past_key_values, DynamicCompressCache): - past_key_values = 
DynamicCompressCache.from_legacy_cache( - past_key_values) return llama_model_forward_4_41_internal( self=self, input_ids=input_ids, @@ -280,6 +305,16 @@ def llama_mlp_forward( ) hidden_states = attn_output.view(x.shape) return hidden_states + elif x.device.type == "xpu" and not self.training: + import xe_addons + gate = self.gate_proj(x) + up = self.up_proj(x) + xe_addons.mlp_silu_mul_inplaced(gate, up) + out = self.down_proj(gate) + if residual is not None: + return out + residual + else: + return out else: a = self.act_fn(self.gate_proj(x)) b = self.up_proj(x) @@ -1085,6 +1120,7 @@ def llama_attention_forward_4_41_quantized( cache_position: Optional[torch.LongTensor] = None, **kwargs ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.FloatTensor]]]: + from ipex_llm.transformers.kv import DynamicCompressCache if "padding_mask" in kwargs: warnings.warn( "Passing `padding_mask` is deprecated and will be removed in v4.37. " @@ -1101,6 +1137,9 @@ def llama_attention_forward_4_41_quantized( enough_kv_room, bsz * q_len, llama_decoding_fast_path_qtype_check) and no_tp + # [CompressKV] + use_compresskv = isinstance(past_key_value, DynamicCompressCache) + if decoding_fast_path: hidden_states = hidden_states.view(1, -1) tmp_cache_k, tmp_cache_v = init_kv_cache( @@ -1176,8 +1215,15 @@ def llama_attention_forward_4_41_quantized( repeated_value_states = repeat_kv(value_states, self.num_key_value_groups) if use_cache: cache_kwargs = None - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) + # [CompressKV] + if use_compresskv: + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, + query_states, attention_mask, self.num_key_value_groups, + self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH) + else: + key_states, value_states = past_key_value.update(key_states, value_states, + self.layer_idx, cache_kwargs) if use_cache and use_sdp_causal(q_len, kv_seq_len, self.head_dim, query_states, self.training): import xe_addons @@ -1226,8 +1272,15 @@ def llama_attention_forward_4_41_quantized( attn_output = torch.matmul(attn_weights, repeated_value_states) else: cache_kwargs = None # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) + # [CompressKV] + if use_compresskv: + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, + query_states, attention_mask, self.num_key_value_groups, + self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH) + else: + key_states, value_states = past_key_value.update(key_states, value_states, + self.layer_idx, cache_kwargs) kv_seq_len = key_states.shape[-2] if not use_sdp_fp8(q_len, key_states.shape[2], query_states): key_states, value_states = restore_fp8_kv_cache(key_states, value_states, @@ -1274,6 +1327,11 @@ def llama_attention_forward_4_41_quantized( new_attn_mask = attention_mask[:, :, :, 0:kv_seq_len] else: new_attn_mask = attention_mask + + # [CompressKV] + if use_compresskv: + new_attn_mask = get_compresskv_attn_mask(key_states, + new_attn_mask) attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states, new_attn_mask) attn_weights = None @@ -1521,7 +1579,7 @@ def llama_attention_forward_4_41_original( past_key_value.key_cache[self.layer_idx] = key_states past_key_value.value_cache[self.layer_idx] = value_states - if cache_position is not None: + if attention_mask is not None: new_attention_mask = attention_mask[:, :, 
:, 0:kv_seq_len] else: new_attention_mask = attention_mask @@ -1547,9 +1605,10 @@ def llama_attention_forward_4_41_original( elif not self.training and not hidden_states.requires_grad and \ use_sdp(q_len, key_states.shape[2], self.head_dim, query_states): import xe_addons + # [CompressKV] if use_compresskv: - # [CompressKV] set attention_mask = None - new_attention_mask = None + new_attention_mask = get_compresskv_attn_mask(key_states, + new_attention_mask) attn_output = xe_addons.sdp(query_states, key_states, value_states, new_attention_mask) attn_output = attn_output.view(query_states.shape) @@ -1650,6 +1709,7 @@ def llama_attention_forward_4_38_quantized( cache_position: Optional[torch.LongTensor] = None, **kwargs ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.FloatTensor]]]: + from ipex_llm.transformers.kv import DynamicCompressCache if "padding_mask" in kwargs: warnings.warn( "Passing `padding_mask` is deprecated and will be removed in v4.37. " @@ -1666,6 +1726,10 @@ def llama_attention_forward_4_38_quantized( enough_kv_room, bsz * q_len, llama_decoding_fast_path_qtype_check) and no_tp + + # [CompressKV] + use_compresskv = isinstance(past_key_value, DynamicCompressCache) + if decoding_fast_path: hidden_states = hidden_states.view(1, -1) tmp_cache_k, tmp_cache_v = init_kv_cache( @@ -1741,8 +1805,16 @@ def llama_attention_forward_4_38_quantized( repeated_value_states = repeat_kv(value_states, self.num_key_value_groups) if use_cache: cache_kwargs = None - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) + # [CompressKV] + if use_compresskv: + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, + query_states, attention_mask, self.num_key_value_groups, + self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH) + else: + key_states, value_states = past_key_value.update(key_states, value_states, + self.layer_idx, cache_kwargs) + if use_cache and use_sdp_causal(q_len, kv_seq_len, self.head_dim, query_states, self.training): import xe_addons @@ -1791,8 +1863,15 @@ def llama_attention_forward_4_38_quantized( attn_output = torch.matmul(attn_weights, repeated_value_states) else: cache_kwargs = None # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) + # [CompressKV] + if use_compresskv: + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, + query_states, attention_mask, self.num_key_value_groups, + self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH) + else: + key_states, value_states = past_key_value.update(key_states, value_states, + self.layer_idx, cache_kwargs) kv_seq_len = key_states.shape[-2] if not use_sdp_fp8(q_len, key_states.shape[2], query_states): key_states, value_states = restore_fp8_kv_cache(key_states, value_states, @@ -1839,6 +1918,11 @@ def llama_attention_forward_4_38_quantized( new_attn_mask = attention_mask[:, :, kv_seq_len-q_len:kv_seq_len, 0:kv_seq_len] else: new_attn_mask = attention_mask + + # [CompressKV] + if use_compresskv: + new_attn_mask = get_compresskv_attn_mask(key_states, + new_attn_mask) attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states, new_attn_mask) attn_weights = None @@ -2111,9 +2195,10 @@ def llama_attention_forward_4_38_original( elif not self.training and not hidden_states.requires_grad and \ use_sdp(q_len, key_states.shape[2], self.head_dim, query_states): import xe_addons + # 
[CompressKV] if use_compresskv: - # [CompressKV] set attention_mask = None - new_attention_mask = None + new_attention_mask = get_compresskv_attn_mask(key_states, + new_attention_mask) attn_output = xe_addons.sdp(query_states, key_states, value_states, new_attention_mask) attn_output = attn_output.view(query_states.shape) diff --git a/python/llm/src/ipex_llm/transformers/models/minicpm.py b/python/llm/src/ipex_llm/transformers/models/minicpm.py index 50c0b9ceae0..d248c507773 100644 --- a/python/llm/src/ipex_llm/transformers/models/minicpm.py +++ b/python/llm/src/ipex_llm/transformers/models/minicpm.py @@ -38,566 +38,124 @@ import torch import warnings -import importlib import torch.nn as nn from typing import Optional, Tuple, Union, List import math -import os -import torch.nn.functional as F -from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from ipex_llm.transformers.models.utils import SILU -from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ - restore_fp8_kv_cache, use_quantize_kv_cache -from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ - apply_rotary_pos_emb, is_enough_kv_cache_room_4_36, should_use_compresskv -from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu -from ipex_llm.transformers.models.utils import use_flash_attention, use_sdp, use_sdp_fp8 -from ipex_llm.transformers.models.utils import mlp_fusion_check, fp16_fusion_check -from ipex_llm.transformers.models.utils import use_decoding_fast_path -from transformers.modeling_outputs import BaseModelOutputWithPast -from transformers.models.llama.modeling_llama import LlamaModel -from ipex_llm.transformers.low_bit_linear import SYM_INT4, FP8E5, IQ2_XXS, FP4 -from ipex_llm.ggml.quantize import ggml_tensor_qtype -from ipex_llm.utils.common import invalidInputError -from ipex_llm.transformers.models.llama import should_use_fuse_rope, should_use_xetla_mm_qkv -from ipex_llm.transformers.models.llama import fuse_qkv_weight_xetla, repeat_kv, native_sdp -from ipex_llm.transformers.models.llama import llama_decoding_fast_path_qtype_check -from ipex_llm.transformers.models.llama import should_split_qkv_tensor, should_split_qkv_tensor +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, is_enough_kv_cache_room_4_36 +from ipex_llm.transformers.models.utils import use_sdp, use_sdp_causal, use_quantize_kv_cache +from ipex_llm.transformers.models.utils import restore_fp8_kv_cache, get_compresskv_attn_mask +from ipex_llm.transformers.models.utils import should_use_compresskv, should_use_fuse_rope +from ipex_llm.transformers.models.llama import repeat_kv +from ipex_llm.transformers.models.common import merge_qkv_base +from ipex_llm.transformers.kv import DynamicNormalCache, DynamicFp8Cache, \ + DynamicCompressCache, DynamicCompressFp8Cache +from transformers.cache_utils import Cache -try: - from transformers.cache_utils import Cache, DynamicCache -except ImportError: - Cache = Tuple[torch.Tensor] -from transformers import logging -KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)) - -def minicpm_attention_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[List[torch.FloatTensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - **kwargs -) -> Tuple[torch.Tensor, 
Optional[torch.Tensor], Optional[List[torch.FloatTensor]]]: - if use_quantize_kv_cache(self.q_proj, hidden_states, self.num_key_value_groups): - forward_function = minicpm_attention_forward_quantized - else: - forward_function = minicpm_attention_forward_original - return forward_function( - self=self, - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - kwargs=kwargs - ) +def merge_qkv(module: torch.nn.Module): + return merge_qkv_base(module, "MiniCPMAttention") -def minicpm_attention_forward_original( +def minicpm_attention_forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[List[torch.FloatTensor]] = None, + past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - **kwargs -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.FloatTensor]]]: - from ipex_llm.transformers.kv import DynamicCompressCache + **kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if "padding_mask" in kwargs: warnings.warn( "Passing `padding_mask` is deprecated and will be removed in v4.37. " "Please make sure use `attention_mask` instead.`" ) + bsz, q_len, _ = hidden_states.size() - bsz, q_len, hidden_size = hidden_states.size() - device = hidden_states.device - # for flash attention - original_dtype = hidden_states.dtype + qkv = self.qkv_proj(hidden_states) + qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim) + qkv = qkv.transpose(1, 2) + query_states, key_states, value_states = qkv.split([self.num_heads, + self.num_key_value_heads, + self.num_key_value_heads], dim=1) # [CompressKV] use_compresskv = isinstance(past_key_value, DynamicCompressCache) + use_quantizekv = isinstance(past_key_value, DynamicFp8Cache) - use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) - enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx, seq_len=q_len) - no_tp = not self.config.pretraining_tp > 1 - decoding_fast_path = use_decoding_fast_path(self.q_proj, - use_fuse_rope, - enough_kv_room, - bsz * q_len, - llama_decoding_fast_path_qtype_check) and no_tp - - # single batch decoding fast path - # forward_qkv takes will perform QKV projection, rotary position embedding - # and save the key/value states to cache, then return query states and the - # extended key/value cache - if decoding_fast_path: - hidden_states = hidden_states.view(1, -1) - cache_k = past_key_value.key_cache[self.layer_idx] - cache_v = past_key_value.value_cache[self.layer_idx] - kv_seq_len = cache_k.shape[-2] - import xe_linear - query_states, key_states, value_states = xe_linear.forward_qkv(hidden_states, - self.q_proj.weight, - self.k_proj.weight, - self.v_proj.weight, - position_ids, - cache_k, cache_v, - self.q_proj.weight.qtype, - self.v_proj.weight.qtype, - kv_seq_len, - self.head_dim, - self.rotary_emb.base,) - kv_seq_len += 1 - # update past_key_value's seem_tokens and kv caches. 
- # [CompressKV] - if use_compresskv: - past_key_value.update_seen_tokens(self.layer_idx, q_len) - kv_seq_len = past_key_value.get_seq_length() - elif self.layer_idx == 0: - past_key_value.seen_tokens = kv_seq_len - past_key_value.key_cache[self.layer_idx] = key_states - past_key_value.value_cache[self.layer_idx] = value_states + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + if should_use_fuse_rope(hidden_states, position_ids, self.training): + import xe_addons + xe_addons.rotary_half_inplaced(self.rotary_emb.inv_freq, position_ids, + query_states, key_states) else: - if self.config.pretraining_tp > 1: - key_value_slicing = ((self.num_key_value_heads * self.head_dim) // - self.config.pretraining_tp) - query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) - // self.config.pretraining_tp, dim=0) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) - for i in range(self.config.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) - for i in range(self.config.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) - for i in range(self.config.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - else: - if fp16_fusion_check(self.q_proj, hidden_states, self.training) and \ - hidden_size == 4096 and self.q_proj.out_features == self.k_proj.out_features: - # only use mm_qkv_out on pvc for llama-7b - if not hasattr(self, "qkv_proj_weight"): - self.qkv_proj_weight = torch.stack([self.q_proj.weight, - self.k_proj.weight, - self.v_proj.weight]).contiguous() - self.q_proj.weight.data = self.qkv_proj_weight[0, :, :] - self.k_proj.weight.data = self.qkv_proj_weight[1, :, :] - self.v_proj.weight.data = self.qkv_proj_weight[2, :, :] - torch.xpu.empty_cache() - query_states = torch.empty(bsz, q_len, self.qkv_proj_weight.shape[-1], - dtype=hidden_states.dtype, device=hidden_states.device) - key_states = torch.empty(bsz, q_len, self.qkv_proj_weight.shape[-1], - dtype=hidden_states.dtype, device=hidden_states.device) - value_states = torch.empty(bsz, q_len, self.qkv_proj_weight.shape[-1], - dtype=hidden_states.dtype, device=hidden_states.device) - torch.ops.torch_ipex.mm_qkv_out( - hidden_states, self.qkv_proj_weight, None, - query_states, key_states, value_states - ) - else: - if should_use_xetla_mm_qkv(self, device): - if not hasattr(self, "qkv_proj_qweight"): - self.qkv_proj_qweight = fuse_qkv_weight_xetla(self.q_proj, - self.k_proj, - self.v_proj, - self.q_proj.weight.qtype,) - import xe_linear - q_out_len = self.q_proj.out_len - k_out_len = self.k_proj.out_len - v_out_len = self.v_proj.out_len - qkv_states = xe_linear.mm_xetla(hidden_states, - self.qkv_proj_qweight, - self.q_proj.weight.qtype) - query_states = qkv_states[:, :, :q_out_len] - key_states = qkv_states[:, :, q_out_len:q_out_len + k_out_len] - value_states = qkv_states[:, :, q_out_len + k_out_len:] - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, - self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, - self.num_key_value_heads, self.head_dim).transpose(1, 2) - 
value_states = value_states.view(bsz, q_len, - self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - invalidInputError(False, - "The cache structure has changed since version v4.36. " - f"If you are using {self.__class__.__name__} for " - "auto-regressive decodingwith k/v caching, please make sure " - "to initialize the attention class with a layer index.") - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states.to(torch.float32), seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids, "llama" + ) - if use_fuse_rope: - import xe_addons - xe_addons.rotary_half_inplaced(self.rotary_emb.inv_freq, position_ids, - query_states, key_states) + if past_key_value is not None: + if use_compresskv: + enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx, q_len) + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, + query_states, attention_mask, self.num_key_value_groups, + self.config, enough_kv_room, 256) else: - if cache_position is not None: - # for transformers 4.38.0 - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, - cos, sin, position_ids, "llama2") - else: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, - cos, sin, position_ids, "llama") - - if past_key_value is not None: - if use_compresskv: - key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_idx, - query_states, attention_mask, self.num_key_value_groups, - self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH) - else: - # update the number of seen tokens - if self.layer_idx == 0: - past_key_value.seen_tokens += key_states.shape[-2] - - # reuse k, v, self_attention - # update `past_key_value` with `key_states` and `value_states` for layer `layer_idx` - if len(past_key_value.key_cache) <= self.layer_idx: - past_key_value.key_cache.append(key_states) - past_key_value.value_cache.append(value_states) - else: - cache_k = past_key_value.key_cache[self.layer_idx] - cache_v = past_key_value.value_cache[self.layer_idx] - - if not enough_kv_room: - # allocate new - new_c_k, new_c_v = extend_kv_cache(bsz, - self.num_key_value_heads, # Support GQA - self.head_dim, - cache_k.size(2), - kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH, - dtype=cache_k.dtype, - device=device) - - new_c_k[:] = cache_k - new_c_v[:] = cache_v - cache_k = new_c_k - cache_v = new_c_v - - key_states, value_states = append_kv_cache(cache_k, - cache_v, - key_states, - value_states) - - # update past_key_value - past_key_value.key_cache[self.layer_idx] = key_states - past_key_value.value_cache[self.layer_idx] = value_states - - if cache_position is not None: - new_attention_mask = attention_mask[:, :, kv_seq_len - q_len:kv_seq_len, 0:kv_seq_len] - else: - new_attention_mask = attention_mask + key_states, value_states = past_key_value.update(key_states, value_states, + self.layer_idx, None) - if not self.training and not hidden_states.requires_grad and \ - use_flash_attention(query_states, key_states, new_attention_mask): - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, 
self.num_key_value_groups) - # now only use flash attention for first token - attn_output = F.scaled_dot_product_attention(query_states.to(device, dtype=torch.float16), - key_states.to(device, dtype=torch.float16), - value_states.to(device, dtype=torch.float16), - is_causal=True) - attn_weights = None - elif not self.training and not hidden_states.requires_grad and \ - use_sdp(q_len, key_states.shape[2], self.head_dim, query_states): + attn_weights = None + if use_sdp(q_len, kv_seq_len, self.head_dim, query_states): import xe_addons + # [CompressKV] if use_compresskv: - # [CompressKV] set attention_mask = None - new_attention_mask = None - attn_output = xe_addons.sdp(query_states, key_states, value_states, - new_attention_mask) - attn_output = attn_output.view(query_states.shape) - attn_weights = None - else: - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - # otherwise, use native attention - if query_states.device.type == "xpu": - attn_output, attn_weights = native_sdp(query_states, key_states, value_states, - new_attention_mask, cache_position, - bsz, q_len, kv_seq_len, - self.head_dim, self.num_heads, output_attentions) - else: - # CPU path - if not output_attentions: - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=new_attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with - # AttentionMaskConverter.to_causal_4d that - # does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and new_attention_mask is None and q_len > 1, - ) - else: - attn_output, attn_weights = native_sdp(query_states, key_states, value_states, - new_attention_mask, cache_position, - bsz, q_len, kv_seq_len, - self.head_dim, - self.num_heads, output_attentions) - - attn_output_size = (bsz, self.num_heads, q_len, self.head_dim) - if attn_output.size() != attn_output_size: - invalidInputError(False, - f"`attn_output` should be of size {attn_output_size}," - f" but is {attn_output.size()}") - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attention_mask = get_compresskv_attn_mask(key_states, attention_mask) - if self.config.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, - dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) - for i in range(self.config.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output.to(original_dtype), attn_weights, past_key_value - - -def minicpm_attention_forward_quantized( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[List[torch.FloatTensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - **kwargs -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
" - "Please make sure use `attention_mask` instead.`" - ) - - bsz, q_len, _ = hidden_states.size() - device = hidden_states.device - use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) - enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx, seq_len=q_len) - no_tp = not self.config.pretraining_tp > 1 - decoding_fast_path = use_decoding_fast_path(self.q_proj, - use_fuse_rope, - enough_kv_room, - bsz * q_len, - llama_decoding_fast_path_qtype_check) and no_tp - if decoding_fast_path: - hidden_states = hidden_states.view(1, -1) - tmp_cache_k, tmp_cache_v = init_kv_cache( - bsz, - self.num_key_value_heads, - self.head_dim, - 0, - 1, - dtype=hidden_states.dtype, - device=device - ) - import xe_linear - query_states, key_states, value_states = xe_linear.forward_qkv(hidden_states, - self.q_proj.weight, - self.k_proj.weight, - self.v_proj.weight, - position_ids, - tmp_cache_k, tmp_cache_v, - self.q_proj.weight.qtype, - self.v_proj.weight.qtype, - 0, - self.head_dim, - self.rotary_emb.base,) - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, - self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, - self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, - self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - invalidInputError( - False, - f"The cache structure has changed since version v4.36." - f" If you are using {self.__class__.__name__} " - f"for auto-regressive decoding with k/v caching," - f" please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - if use_fuse_rope: - import xe_addons - xe_addons.rotary_half_inplaced(self.rotary_emb.inv_freq, position_ids, - query_states, key_states) + if use_quantizekv: + attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states, + attention_mask) else: - if cache_position is not None: - # for transformers 4.38.0 - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, - cos, sin, position_ids, "llama2") - else: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, - cos, sin, position_ids, "llama") - kv_seq_len = key_states.shape[-2] - - if len(past_key_value.key_cache) <= self.layer_idx: - repeated_key_states = repeat_kv(key_states, self.num_key_value_groups) - repeated_value_states = repeat_kv(value_states, self.num_key_value_groups) - if should_split_qkv_tensor(query_states, bsz, self.num_heads, - q_len, kv_seq_len, output_attentions): - attn_output, _ = native_sdp_split_qkv_tensor(query_states, repeated_key_states, - repeated_value_states, - attention_mask, cache_position, - bsz, q_len, kv_seq_len, self.head_dim, - self.num_heads) + attn_output = xe_addons.sdp(query_states, key_states, value_states, + attention_mask) + elif use_sdp_causal(q_len, kv_seq_len, self.head_dim, query_states, self.training): + import xe_addons + if use_quantizekv: + attn_output = xe_addons.sdp_fp8_causal(query_states, key_states, + value_states, attention_mask) else: - attn_weights = torch.matmul(query_states, repeated_key_states - .transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - invalidInputError( - False, - f"Attention weights should be of size " - f"{(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if cache_position is not None: - # for transformers 4.38.0 - causal_mask = attention_mask[:, :, cache_position, : kv_seq_len] - attn_weights = attn_weights + causal_mask - else: - attn_mask_size = (bsz, 1, q_len, kv_seq_len) - if attention_mask.size() != attn_mask_size: - invalidInputError(False, - f"Attention mask should be of size {attn_mask_size}, " - f"but is {attention_mask.size()}") - attn_weights = attn_weights + attention_mask - - if kv_seq_len >= 2048 or bsz >= 64: - # for memory considerations, do not upcast attention to fp32 - # for long sequences or large batches - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - else: - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, - dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, repeated_value_states) - if use_cache: - cache_kwargs = None - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) + attn_output = xe_addons.sdp_causal(query_states, key_states, + value_states, attention_mask) else: - cache_kwargs = None # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) - kv_seq_len = key_states.shape[-2] - if not use_sdp_fp8(q_len, key_states.shape[2], query_states): + if use_quantizekv: key_states, value_states = restore_fp8_kv_cache(key_states, value_states, query_states.dtype) - key_states = repeat_kv(key_states, self.num_key_value_groups)\ - .to(device, 
dtype=query_states.dtype) - value_states = repeat_kv(value_states, self.num_key_value_groups)\ - .to(device, dtype=query_states.dtype) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) - attn_weights = attn_weights / math.sqrt(self.head_dim) - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - invalidInputError( - False, - f"Attention weights should be of size" - f" {(bsz, self.num_heads, q_len, kv_seq_len)}," - f" but is {attn_weights.size()}" - ) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) - if attention_mask is not None: - if cache_position is not None: - # for transformers 4.38.0 - causal_mask = attention_mask[:, :, cache_position, : kv_seq_len] - attn_weights = attn_weights + causal_mask - else: - attn_mask_size = (bsz, 1, q_len, kv_seq_len) - if attention_mask.size() != attn_mask_size: - invalidInputError(False, - f"Attention mask should be of size {attn_mask_size}, " - f"but is {attention_mask.size()}") - attn_weights = attn_weights + attention_mask + attn_weights = torch.matmul( + query_states, key_states.transpose(2, 3) + ) / math.sqrt(self.head_dim) - if kv_seq_len >= 2048 or bsz >= 64: - # for memory considerations, do not upcast attention to fp32 - # for long sequences or large batches - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - else: - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, - dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - else: - import xe_addons - if cache_position is not None: - new_attn_mask = attention_mask[:, :, kv_seq_len-q_len:kv_seq_len, 0:kv_seq_len] - else: - new_attn_mask = attention_mask - attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states, new_attn_mask) - attn_weights = None + if attention_mask is not None: + attn_weights = attn_weights + attention_mask - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - invalidInputError( - False, - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}," - f" but is {attn_output.size()}" + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(query_states.dtype) + attn_weights = nn.functional.dropout( + attn_weights, p=self.attention_dropout, training=self.training ) + attn_output = torch.matmul(attn_weights, value_states) attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.config.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size - // self.config.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], - o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) + attn_output = self.o_proj(attn_output) if not output_attentions: attn_weights = None @@ -605,506 +163,54 @@ def minicpm_attention_forward_quantized( return attn_output, attn_weights, past_key_value -def minicpm_model_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - 
output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, -) -> Union[Tuple, BaseModelOutputWithPast]: - from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicCompressCache - use_cache = use_cache if use_cache is not None else self.config.use_cache - input = input_ids if input_ids is not None else inputs_embeds - if use_cache: - if use_quantize_kv_cache(self.layers[0].mlp.up_proj, input, - self.config.num_attention_heads // - self.config.num_key_value_heads): - if not isinstance(past_key_values, DynamicFp8Cache): - past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values) - elif should_use_compresskv(input, input.shape[-1]): - if not isinstance(past_key_values, DynamicCompressCache): - past_key_values = DynamicCompressCache.from_legacy_cache(past_key_values) - - return minicpm_model_forward_internal( - self=self, - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - -def minicpm_model_forward_internal( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, -) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None \ - else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None - else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - invalidInputError(False, - "You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape[:2] - elif inputs_embeds is not None: - batch_size, seq_length = inputs_embeds.shape[:2] - else: - invalidInputError(False, - "You have to specify either input_ids or inputs_embeds") - - if self.gradient_checkpointing and self.training: +def minicpm_model_forward_wrapper(origin_forward): + def minicpm_model_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + # ipex-llm changes start + # IPEX-LLM OPT: kv cache and quantize kv cache + inputs = input_ids if input_ids is not None else inputs_embeds + use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs, + self.config.num_attention_heads // + self.config.num_key_value_heads) + use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \ + isinstance(past_key_values, 
DynamicCompressCache) + + use_cache = use_cache if use_cache is not None else self.config.use_cache if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing." - " Setting `use_cache=False`..." - ) - use_cache = False - - past_key_values_length = 0 - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, - dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0) - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) * self.config.scale_emb - - if self._use_flash_attention_2: - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask)\ - else None - elif self._use_sdpa and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. - from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - # embed positions - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - # bigdl-llm changes: - curr_device = decoder_layer.input_layernorm.weight.device - if attention_mask is not None: - attention_mask = attention_mask.to(curr_device) - if position_ids is not None: - position_ids = position_ids.to(curr_device) - # bigdl-llm changes end - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache \ - else next_decoder_cache - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None) - return BaseModelOutputWithPast( - 
last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -def minicpm_attention_forward_original_4_39( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[List[torch.FloatTensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - **kwargs -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.FloatTensor]]]: - from ipex_llm.transformers.kv import DynamicCompressCache - - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. " - "Please make sure use `attention_mask` instead.`" - ) - - bsz, q_len, hidden_size = hidden_states.size() - device = hidden_states.device - # for flash attention - original_dtype = hidden_states.dtype - - # [CompressKV] - use_compresskv = isinstance(past_key_value, DynamicCompressCache) - - use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) - enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx, seq_len=q_len) - no_tp = not self.config.pretraining_tp > 1 - decoding_fast_path = use_decoding_fast_path(self.q_proj, - use_fuse_rope, - enough_kv_room, - bsz * q_len, - llama_decoding_fast_path_qtype_check) and no_tp - - # single batch decoding fast path - # forward_qkv takes will perform QKV projection, rotary position embedding - # and save the key/value states to cache, then return query states and the - # extended key/value cache - if decoding_fast_path: - hidden_states = hidden_states.view(1, -1) - cache_k = past_key_value.key_cache[self.layer_idx] - cache_v = past_key_value.value_cache[self.layer_idx] - kv_seq_len = cache_k.shape[-2] - import xe_linear - query_states, key_states, value_states = xe_linear.forward_qkv(hidden_states, - self.q_proj.weight, - self.k_proj.weight, - self.v_proj.weight, - position_ids, - cache_k, cache_v, - self.q_proj.weight.qtype, - self.v_proj.weight.qtype, - kv_seq_len, - self.head_dim, - self.rotary_emb.base,) - kv_seq_len += 1 - # update past_key_value's seem_tokens and kv caches. 
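The new minicpm_model_forward_wrapper introduced above replaces the hand-written forward with a thin wrapper whose main job is to pick a KV-cache implementation before delegating to the original forward. A simplified sketch of that selection, using the cache classes from ipex_llm.transformers.kv referenced in this diff (the helper name _select_minicpm_kv_cache is hypothetical and for illustration only):

def _select_minicpm_kv_cache(past_key_values, use_quantize_kv, use_compress_kv):
    # simplified sketch of the branching added in minicpm_model_forward_wrapper
    from ipex_llm.transformers.kv import (
        DynamicNormalCache, DynamicFp8Cache,
        DynamicCompressCache, DynamicCompressFp8Cache,
    )
    if use_compress_kv and not isinstance(past_key_values, DynamicCompressCache):
        # compressed KV takes priority; use the fp8 flavour if quantize-kv is also enabled
        cache_cls = DynamicCompressFp8Cache if use_quantize_kv else DynamicCompressCache
        past_key_values = cache_cls.from_legacy_cache(past_key_values)
    elif (use_quantize_kv and not use_compress_kv
          and not isinstance(past_key_values, DynamicFp8Cache)):
        past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
    elif (not use_quantize_kv and not use_compress_kv
          and not isinstance(past_key_values, DynamicNormalCache)):
        past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values)
    return past_key_values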
- # [CompressKV] - if use_compresskv: - past_key_value.update_seen_tokens(self.layer_idx, q_len) - kv_seq_len = past_key_value.get_seq_length() - elif self.layer_idx == 0: - past_key_value._seen_tokens = kv_seq_len - past_key_value.key_cache[self.layer_idx] = key_states - past_key_value.value_cache[self.layer_idx] = value_states - - else: - if self.config.pretraining_tp > 1: - key_value_slicing = ((self.num_key_value_heads * self.head_dim) // - self.config.pretraining_tp) - query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) - // self.config.pretraining_tp, dim=0) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) - for i in range(self.config.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) - for i in range(self.config.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) - for i in range(self.config.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - else: - if fp16_fusion_check(self.q_proj, hidden_states, self.training) and \ - hidden_size == 4096 and self.q_proj.out_features == self.k_proj.out_features: - # only use mm_qkv_out on pvc for llama-7b - if not hasattr(self, "qkv_proj_weight"): - self.qkv_proj_weight = torch.stack([self.q_proj.weight, - self.k_proj.weight, - self.v_proj.weight]).contiguous() - self.q_proj.weight.data = self.qkv_proj_weight[0, :, :] - self.k_proj.weight.data = self.qkv_proj_weight[1, :, :] - self.v_proj.weight.data = self.qkv_proj_weight[2, :, :] - torch.xpu.empty_cache() - query_states = torch.empty(bsz, q_len, self.qkv_proj_weight.shape[-1], - dtype=hidden_states.dtype, device=hidden_states.device) - key_states = torch.empty(bsz, q_len, self.qkv_proj_weight.shape[-1], - dtype=hidden_states.dtype, device=hidden_states.device) - value_states = torch.empty(bsz, q_len, self.qkv_proj_weight.shape[-1], - dtype=hidden_states.dtype, device=hidden_states.device) - torch.ops.torch_ipex.mm_qkv_out( - hidden_states, self.qkv_proj_weight, None, - query_states, key_states, value_states - ) - else: - if should_use_xetla_mm_qkv(self, device): - if not hasattr(self, "qkv_proj_qweight"): - self.qkv_proj_qweight = fuse_qkv_weight_xetla(self.q_proj, - self.k_proj, - self.v_proj, - self.q_proj.weight.qtype,) - import xe_linear - q_out_len = self.q_proj.out_len - k_out_len = self.k_proj.out_len - v_out_len = self.v_proj.out_len - qkv_states = xe_linear.mm_xetla(hidden_states, - self.qkv_proj_qweight, - self.q_proj.weight.qtype) - query_states = qkv_states[:, :, :q_out_len] - key_states = qkv_states[:, :, q_out_len:q_out_len + k_out_len] - value_states = qkv_states[:, :, q_out_len + k_out_len:] - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, - self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, - self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, - self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - invalidInputError(False, - "The cache structure has changed since version v4.36. 
" - f"If you are using {self.__class__.__name__} for " - "auto-regressive decodingwith k/v caching, please make sure " - "to initialize the attention class with a layer index.") - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - if use_fuse_rope: - import xe_addons - xe_addons.rotary_half_inplaced(self.rotary_emb.inv_freq, position_ids, - query_states, key_states) - else: - if cache_position is not None: - # for transformers 4.38.0 - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, - cos, sin, position_ids, "llama2") - else: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, - cos, sin, position_ids, "llama") - - if past_key_value is not None: - if use_compresskv: - key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_idx, - query_states, attention_mask, self.num_key_value_groups, - self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH) - else: - # update the number of seen tokens - if self.layer_idx == 0: - past_key_value._seen_tokens += key_states.shape[-2] - - # reuse k, v, self_attention - # update `past_key_value` with `key_states` and `value_states` for layer `layer_idx` - if len(past_key_value.key_cache) <= self.layer_idx: - past_key_value.key_cache.append(key_states) - past_key_value.value_cache.append(value_states) + if use_compress_kv and not isinstance(past_key_values, + DynamicCompressCache): + if use_quantize_kv: + past_key_values = DynamicCompressFp8Cache.from_legacy_cache(past_key_values) else: - cache_k = past_key_value.key_cache[self.layer_idx] - cache_v = past_key_value.value_cache[self.layer_idx] - - if not enough_kv_room: - # allocate new - new_c_k, new_c_v = extend_kv_cache(bsz, - self.num_key_value_heads, # Support GQA - self.head_dim, - cache_k.size(2), - kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH, - dtype=cache_k.dtype, - device=device) - - new_c_k[:] = cache_k - new_c_v[:] = cache_v - cache_k = new_c_k - cache_v = new_c_v - - key_states, value_states = append_kv_cache(cache_k, - cache_v, - key_states, - value_states) - - # update past_key_value - past_key_value.key_cache[self.layer_idx] = key_states - past_key_value.value_cache[self.layer_idx] = value_states - - if cache_position is not None: - new_attention_mask = attention_mask[:, :, kv_seq_len - q_len:kv_seq_len, 0:kv_seq_len] - else: - new_attention_mask = attention_mask - - if not self.training and not hidden_states.requires_grad and \ - use_flash_attention(query_states, key_states, new_attention_mask): - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - # now only use flash attention for first token - attn_output = F.scaled_dot_product_attention(query_states.to(device, dtype=torch.float16), - key_states.to(device, dtype=torch.float16), - value_states.to(device, dtype=torch.float16), - is_causal=True) - attn_weights = None - elif not self.training and not hidden_states.requires_grad and \ - use_sdp(q_len, key_states.shape[2], self.head_dim, query_states): - import xe_addons - if use_compresskv: - # [CompressKV] set attention_mask = None - new_attention_mask = None - attn_output = xe_addons.sdp(query_states, key_states, value_states, - new_attention_mask) - attn_output = attn_output.view(query_states.shape) - attn_weights = None - else: - # repeat k/v 
heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - # otherwise, use native attention - if query_states.device.type == "xpu": - attn_output, attn_weights = native_sdp(query_states, key_states, value_states, - new_attention_mask, cache_position, - bsz, q_len, kv_seq_len, - self.head_dim, self.num_heads, output_attentions) - else: - # CPU path - if not output_attentions: - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=new_attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with - # AttentionMaskConverter.to_causal_4d that - # does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and new_attention_mask is None and q_len > 1, - ) - else: - attn_output, attn_weights = native_sdp(query_states, key_states, value_states, - new_attention_mask, cache_position, - bsz, q_len, kv_seq_len, - self.head_dim, - self.num_heads, output_attentions) - - attn_output_size = (bsz, self.num_heads, q_len, self.head_dim) - if attn_output.size() != attn_output_size: - invalidInputError(False, - f"`attn_output` should be of size {attn_output_size}," - f" but is {attn_output.size()}") - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.config.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, - dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) - for i in range(self.config.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output.to(original_dtype), attn_weights, past_key_value - + past_key_values = DynamicCompressCache.from_legacy_cache(past_key_values) + elif (use_quantize_kv and not use_compress_kv + and not isinstance(past_key_values, DynamicFp8Cache)): + past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values) + elif (not use_quantize_kv and not use_compress_kv + and not isinstance(past_key_values, DynamicNormalCache)): + past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values) + # ipex-llm changes end + return origin_forward( + self=self, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) -def minicpm_attention_forward_4_39( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[List[torch.FloatTensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - **kwargs -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.FloatTensor]]]: - if use_quantize_kv_cache(self.q_proj, hidden_states, self.num_key_value_groups): - forward_function = minicpm_attention_forward_quantized - else: - forward_function = minicpm_attention_forward_original_4_39 - return forward_function( - self=self, - hidden_states=hidden_states, - attention_mask=attention_mask, - 
position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - kwargs=kwargs - ) + return minicpm_model_forward diff --git a/python/llm/src/ipex_llm/transformers/models/minicpmv.py b/python/llm/src/ipex_llm/transformers/models/minicpmv.py index 340285ed193..89aca6d0126 100644 --- a/python/llm/src/ipex_llm/transformers/models/minicpmv.py +++ b/python/llm/src/ipex_llm/transformers/models/minicpmv.py @@ -13,37 +13,249 @@ # See the License for the specific language governing permissions and # limitations under the License. # +# Some parts of this file is adapted from +# https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/modeling_minicpmv.py +# which is licensed under Apache License 2.0: +# +# https://github.com/OpenBMB/MiniCPM/blob/main/LICENSE +# -def minicpmv_generate_wrapper(origin_generate): - def generate( +import math +import torch +from threading import Thread +from typing import Optional, List +from torch.nn.functional import linear +from ipex_llm.transformers.models.common import merge_qkv_base +from ipex_llm.transformers.models.common import attention_softmax +from transformers import AutoProcessor, TextIteratorStreamer +from transformers.generation.logits_process import RepetitionPenaltyLogitsProcessor + + +# MiniCPM-V-2_5 and MiniCPM-V-2_6 +def merge_qkv(module: torch.nn.Module): + merge_qkv_base(module, "SiglipAttention") + merge_qkv_base(module, "Idefics2VisionAttention") + + +# MiniCPM-V-2_5 and MiniCPM-V-2_6 +def siglip_attention_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, +): + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + qkv = qkv.view(bsz, q_len, self.num_heads * 3, self.head_dim) + qkv = qkv.transpose(1, 2) + query_states, key_states, value_states = qkv.chunk(3, dim=1) + + attn_weights = torch.matmul(query_states * self.scale, key_states.transpose(2, 3)) + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = attention_softmax(attn_weights, self.training) + + attn_weights = torch.nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +# MiniCPM-V-2_6 +def _in_projection_packed( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + w: torch.Tensor, + b: Optional[torch.Tensor] = None, +) -> List[torch.Tensor]: + E = q.size(-1) + if k is v: + if q is k: + # self-attention + proj = linear(q, w, b) + # reshape to 3, E and not E, 3 is deliberate for + # better memory coalescing and keeping same order as chunk() + proj = proj.unflatten(-1, (3, E)).unsqueeze(0).transpose(0, -2).squeeze(-2) + proj = proj.contiguous() + return proj[0], proj[1], proj[2] + else: + # encoder-decoder attention + w_q, w_kv = w.split([E, E * 2]) + if b is None: + b_q = b_kv = None + else: + b_q, b_kv = b.split([E, E * 2]) + q_proj = linear(q, w_q, b_q) + kv_proj = linear(k, w_kv, b_kv) + # reshape to 2, E and not E, 2 is deliberate for + # better memory coalescing and keeping same order as chunk() + kv_proj = kv_proj.unflatten(-1, (2, E)).unsqueeze(0).transpose(0, -2).squeeze(-2) + kv_proj = kv_proj.contiguous() + return (q_proj, kv_proj[0], kv_proj[1]) 
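A minimal usage sketch for the packed in-projection helper added above (it mirrors the private _in_projection_packed helper in torch.nn.functional); the tensor shapes are illustrative assumptions and the call assumes the function from the hunk above is in scope:

import torch

E = 64                              # embedding dim (assumed for the example)
x = torch.randn(2, 10, E)           # (batch, seq, E) self-attention input
w = torch.randn(3 * E, E)           # packed q/k/v projection weight
b = torch.randn(3 * E)              # packed q/k/v bias

# q is k is v, so the self-attention branch is taken and the packed
# projection is split into three (2, 10, E) tensors
q, k, v = _in_projection_packed(x, x, x, w, b)
assert q.shape == k.shape == v.shape == (2, 10, E)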
+ else: + w_q, w_k, w_v = w.chunk(3) + # ipex-llm changes start: add contiguous to workaround a ipex bug + q = q.contiguous() + k = k.contiguous() + v = v.contiguous() + w_q = w_q.contiguous() + w_k = w_k.contiguous() + w_v = w_v.contiguous() + # ipex-llm changes end + if b is None: + b_q = b_k = b_v = None + else: + b_q, b_k, b_v = b.chunk(3) + return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v) + + +# for minicpm-v-2_6 benchmarking purposes +def minicpmv_decode_stream_wrapper(origin_decode_stream): + def minicpv_decode_stream( + self, + inputs_embeds, + tokenizer, + **kwargs + ): + streamer = kwargs.get('streamer', None) + if streamer is not None: + terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] + generation_kwargs = { + 'inputs_embeds': inputs_embeds, + 'pad_token_id': 0, + 'eos_token_id': terminators, + } + generation_kwargs.update(kwargs) + + thread = Thread(target=self.llm.generate, kwargs=generation_kwargs) + thread.start() + + return streamer + else: + return origin_decode_stream( + self=self, + inputs_embeds=inputs_embeds, + tokenizer=tokenizer, + **kwargs + ) + return minicpv_decode_stream + + +# MiniCPM-V-2 +# modified from timm.models.vision_transformer.Attention.forward +def vision_transformer_attention_forward(self, x: torch.Tensor) -> torch.Tensor: + bsz, q_len, hidden_size = x.size() + + qkv = self.qkv(x) + qkv = qkv.view(bsz, q_len, self.num_heads * 3, self.head_dim) + qkv = qkv.transpose(1, 2) + query_states, key_states, value_states = qkv.chunk(3, dim=1) + + attn_weights = torch.matmul(query_states * self.scale, key_states.transpose(2, 3)) + attn_weights = attention_softmax(attn_weights, self.training) + attn_weights = self.attn_drop(attn_weights) + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, hidden_size) + + attn_output = self.proj(attn_output) + attn_output = self.proj_drop(attn_output) + return attn_output + + +# MiniCPM-V-2_5 +def minicpmv_chat_wrapper(origin_chat): + def minicpmv_chat( self, - input_ids=None, - pixel_values=None, - tgt_sizes=None, - image_bound=None, - attention_mask=None, - tokenizer=None, + image, + msgs, + tokenizer, + processor=None, vision_hidden_states=None, - return_vision_hidden_states=False, + max_new_tokens=1024, + sampling=True, + max_inp_length=2048, + system_prompt='', stream=False, - decode_text=False, **kwargs ): - if kwargs.get("repetition_penalty", None) is not None: - kwargs["repetition_penalty"] = 1 - return origin_generate( + if processor is None: + if getattr(self, "processor", None) is None: + self.processor = AutoProcessor.from_pretrained(self.config._name_or_path, + trust_remote_code=True) + processor = self.processor + return origin_chat( self=self, - input_ids=input_ids, - pixel_values=pixel_values, - tgt_sizes=tgt_sizes, - image_bound=image_bound, - attention_mask=attention_mask, + image=image, + msgs=msgs, tokenizer=tokenizer, + processor=processor, vision_hidden_states=vision_hidden_states, - return_vision_hidden_states=return_vision_hidden_states, + max_new_tokens=max_new_tokens, + sampling=sampling, + max_inp_length=max_inp_length, + system_prompt=system_prompt, stream=stream, - decode_text=decode_text, **kwargs ) + return minicpmv_chat + + +# MiniCPM-V-2 +def minicpmv_get_vision_embedding(self, pixel_values): + res = [] + dtype = self.dtype + + def process_each_pixel(pixel_value, dtype, config, vpm, resampler): + H, W = pixel_value.shape[-2:] + target_size 
= (math.ceil(H / config.patch_size), math.ceil(W / config.patch_size)) + vision_embedding = self.vpm_forward_features(pixel_value.unsqueeze(0).type(dtype)) + + if hasattr(vpm, 'num_prefix_tokens') and vpm.num_prefix_tokens > 0: + vision_embedding = vision_embedding[:, vpm.num_prefix_tokens:] + return resampler(vision_embedding, target_size) + + for pixel_value in pixel_values: + result = process_each_pixel(pixel_value, dtype, self.config, self.vpm, self.resampler) + res.append(result) + return torch.vstack(res) + + +def patched_repetition_penalty_call(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): + if scores.device.type == "xpu": + import xe_addons + xe_addons.repetition_penalty_logits_process_inplaced(scores, input_ids, self.penalty) + else: + score = torch.gather(scores, 1, input_ids) + score = torch.where(score < 0, score * self.penalty, score / self.penalty) + scores.scatter_(1, input_ids, score) + return scores + + +def minicpmv_generate_wrapper(origin_generate): + def generate( + *inputs, + **kwargs + ): + RepetitionPenaltyLogitsProcessor.__call__ = patched_repetition_penalty_call + + # for minicpm-v-2_6 benchmarking purposes + stream = kwargs.get("stream", False) + if isinstance(stream, TextIteratorStreamer): + kwargs.update({'streamer': stream}) + + return origin_generate( + *inputs, + **kwargs, + ) return generate diff --git a/python/llm/src/ipex_llm/transformers/models/mistral.py b/python/llm/src/ipex_llm/transformers/models/mistral.py index 35d7abae38f..2694031d586 100644 --- a/python/llm/src/ipex_llm/transformers/models/mistral.py +++ b/python/llm/src/ipex_llm/transformers/models/mistral.py @@ -46,9 +46,9 @@ from ipex_llm.utils.common import invalidInputError from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ - restore_fp8_kv_cache, use_quantize_kv_cache, should_use_compresskv -from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, \ - apply_rotary_pos_emb_no_cache_xpu + restore_fp8_kv_cache, use_quantize_kv_cache, should_use_compresskv, \ + get_compresskv_attn_mask +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ is_enough_kv_cache_room_4_36 from ipex_llm.transformers.low_bit_linear import SYM_INT4, FP8E5, IQ2_XXS @@ -63,6 +63,7 @@ except ImportError: Cache = Tuple[torch.Tensor] + import os KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)) @@ -128,11 +129,11 @@ def compute_attn_outputs_weights(query_states, key_states, value_states, bsz, q_ ) attn_weights = attn_weights + attention_mask - - if kv_seq_len >= 2048 or bsz >= 64: - # for memory considerations, do not upcast attention to fp32 - # for long sequences or large batches - attn_weights = nn.functional.softmax(attn_weights, dim=-1) + if os.getenv("IPEX_LLM_LOW_MEM", '0').lower() in ('true', '1', 't'): + if kv_seq_len >= 2048 or bsz >= 64: + # for memory considerations, do not upcast attention to fp32 + # for long sequences or large batches + attn_weights = nn.functional.softmax(attn_weights, dim=-1) else: # upcast attention to fp32 attn_weights = nn.functional.softmax(attn_weights, dim=-1, @@ -210,7 +211,7 @@ def mistral_model_forward_4_36( self.config.num_attention_heads//self.config.num_key_value_heads): if not isinstance(past_key_values, DynamicFp8Cache): past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values) - 
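The non-XPU branch of patched_repetition_penalty_call above reproduces the standard repetition-penalty update with gather/where/scatter_ (the xpu branch dispatches to xe_addons instead). A small CPU-only check of that arithmetic, with illustrative values:

import torch

penalty = 2.0
scores = torch.tensor([[1.0, -1.0, 0.5, 2.0]])
input_ids = torch.tensor([[0, 3]])           # token ids already generated

score = torch.gather(scores, 1, input_ids)   # logits of seen tokens: [[1.0, 2.0]]
score = torch.where(score < 0, score * penalty, score / penalty)
scores.scatter_(1, input_ids, score)         # scores is now [[0.5, -1.0, 0.5, 1.0]]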
elif should_use_compresskv(input_ids, input_ids.shape[-1]): + elif should_use_compresskv(input_ids, input_ids.shape[1]): # if use quantize kv, compress kv will be ignored now if not isinstance(past_key_values, DynamicCompressCache): past_key_values = DynamicCompressCache.from_legacy_cache( @@ -271,6 +272,7 @@ def mistral_attention_forward_quantized( original_dtype = hidden_states.dtype use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) + enough_kv_room = is_enough_kv_cache_room_4_31(past_key_value) decoding_fast_path = use_decoding_fast_path(self.q_proj, use_fuse_rope, @@ -298,7 +300,8 @@ def mistral_attention_forward_quantized( self.q_proj.weight.qtype, self.v_proj.weight.qtype, 0, - self.head_dim) + self.head_dim, + self.rotary_emb.base) else: query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) @@ -315,10 +318,9 @@ def mistral_attention_forward_quantized( kv_seq_len += past_key_value[0].shape[-2] if use_fuse_rope: - query_states, key_states = apply_rotary_pos_emb_no_cache_xpu(query_states, - key_states, - position_ids, - "mistral") + import xe_addons + xe_addons.rotary_half_inplaced(self.rotary_emb.inv_freq, position_ids, + query_states, key_states) else: cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, @@ -475,6 +477,7 @@ def mistral_attention_forward_original( original_dtype = hidden_states.dtype use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) + enough_kv_room = is_enough_kv_cache_room_4_31(past_key_value) decoding_fast_path = use_decoding_fast_path(self.q_proj, use_fuse_rope, @@ -496,7 +499,8 @@ def mistral_attention_forward_original( self.q_proj.weight.qtype, self.v_proj.weight.qtype, kv_seq_len, - self.head_dim) + self.head_dim, + self.rotary_emb.base) kv_seq_len += 1 else: @@ -532,10 +536,9 @@ def mistral_attention_forward_original( kv_seq_len += past_key_value[0].shape[-2] if use_fuse_rope: - query_states, key_states = apply_rotary_pos_emb_no_cache_xpu(query_states, - key_states, - position_ids, - "mistral") + import xe_addons + xe_addons.rotary_half_inplaced(self.rotary_emb.inv_freq, position_ids, + query_states, key_states) else: cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, @@ -697,7 +700,9 @@ def mistral_attention_forward_4_36_quantized( original_dtype = hidden_states.dtype use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) - enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx, seq_len=q_len) + + enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx, + seq_len=q_len) decoding_fast_path = use_decoding_fast_path(self.q_proj, use_fuse_rope, enough_kv_room, @@ -724,7 +729,8 @@ def mistral_attention_forward_4_36_quantized( self.q_proj.weight.qtype, self.v_proj.weight.qtype, 0, - self.head_dim) + self.head_dim, + self.rotary_emb.base) else: query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) @@ -750,10 +756,9 @@ def mistral_attention_forward_4_36_quantized( kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) if use_fuse_rope: - query_states, key_states = apply_rotary_pos_emb_no_cache_xpu(query_states, - key_states, - position_ids, - "mistral") + import xe_addons + xe_addons.rotary_half_inplaced(self.rotary_emb.inv_freq, position_ids, + query_states, key_states) else: cos, sin = self.rotary_emb(value_states, 
seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, @@ -822,7 +827,6 @@ def mistral_attention_forward_4_36_quantized( f" but is {attention_mask.size()}" ) attn_weights = attn_weights + attention_mask - if kv_seq_len >= 2048 or bsz >= 64: # for memory considerations, do not upcast attention to fp32 # for long sequences or large batches @@ -913,7 +917,10 @@ def mistral_attention_forward_4_36_original( use_compresskv = isinstance(past_key_value, DynamicCompressCache) use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) - enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx) + + enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, + self.layer_idx, + q_len) decoding_fast_path = use_decoding_fast_path(self.q_proj, use_fuse_rope, enough_kv_room, @@ -938,7 +945,8 @@ def mistral_attention_forward_4_36_original( self.q_proj.weight.qtype, self.v_proj.weight.qtype, kv_seq_len, - self.head_dim) + self.head_dim, + self.rotary_emb.base) kv_seq_len += 1 # update past_key_value's seem_tokens and kv caches. @@ -991,10 +999,9 @@ def mistral_attention_forward_4_36_original( kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) if use_fuse_rope: - query_states, key_states = apply_rotary_pos_emb_no_cache_xpu(query_states, - key_states, - position_ids, - "mistral") + import xe_addons + xe_addons.rotary_half_inplaced(self.rotary_emb.inv_freq, position_ids, + query_states, key_states) else: cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, @@ -1076,9 +1083,9 @@ def mistral_attention_forward_4_36_original( elif use_sdp(q_len, key_states.shape[2], self.head_dim, query_states): # new fp16 sdp doesn't require repeat_kv import xe_addons - # [CompressKV] set attention_mask = None + # [CompressKV] if use_compresskv: - attention_mask = None + attention_mask = get_compresskv_attn_mask(key_states, attention_mask) attn_output = xe_addons.sdp(query_states, key_states, value_states, attention_mask) attn_output = attn_output.view(query_states.shape) attn_weights = None @@ -1168,7 +1175,9 @@ def mistral_attention_forward_4_39_original( use_compresskv = isinstance(past_key_value, DynamicCompressCache) use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) - enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx) + + enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx, + q_len) decoding_fast_path = use_decoding_fast_path(self.q_proj, use_fuse_rope, enough_kv_room, @@ -1193,7 +1202,8 @@ def mistral_attention_forward_4_39_original( self.q_proj.weight.qtype, self.v_proj.weight.qtype, kv_seq_len, - self.head_dim) + self.head_dim, + self.rotary_emb.base) kv_seq_len += 1 # update past_key_value's seem_tokens and kv caches. 
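The [CompressKV] changes above no longer drop attention_mask before the SDP call; they trim it to the compressed key length via get_compresskv_attn_mask (defined later in models/utils.py in this diff). A shape-only sketch of that trimming, with illustrative sizes:

import torch

bsz, n_heads, head_dim = 1, 32, 128
q_len, full_kv_len, compressed_kv_len = 1, 4096, 1024

attention_mask = torch.zeros(bsz, 1, q_len, full_kv_len)
key_states = torch.randn(bsz, n_heads, compressed_kv_len, head_dim)

# keep only the trailing mask columns that match the compressed keys
if attention_mask is not None:
    attention_mask = attention_mask[:, :, :, -key_states.size(2):]
assert attention_mask.shape == (bsz, 1, q_len, compressed_kv_len)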
@@ -1245,10 +1255,9 @@ def mistral_attention_forward_4_39_original( kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) if use_fuse_rope: - query_states, key_states = apply_rotary_pos_emb_no_cache_xpu(query_states, - key_states, - position_ids, - "mistral") + import xe_addons + xe_addons.rotary_half_inplaced(self.rotary_emb.inv_freq, position_ids, + query_states, key_states) else: cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, @@ -1322,9 +1331,9 @@ def mistral_attention_forward_4_39_original( elif use_sdp(q_len, key_states.shape[2], self.head_dim, query_states): # new fp16 sdp doesn't require repeat_kv import xe_addons - # [CompressKV] set attention_mask = None + # [CompressKV] if use_compresskv: - attention_mask = None + attention_mask = get_compresskv_attn_mask(key_states, attention_mask) attn_output = xe_addons.sdp(query_states, key_states, value_states, attention_mask) attn_output = attn_output.view(query_states.shape) attn_weights = None diff --git a/python/llm/src/ipex_llm/transformers/models/phi3.py b/python/llm/src/ipex_llm/transformers/models/phi3.py index 6378b6fe348..fa6c43d6d47 100644 --- a/python/llm/src/ipex_llm/transformers/models/phi3.py +++ b/python/llm/src/ipex_llm/transformers/models/phi3.py @@ -31,20 +31,25 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import math import torch import warnings from torch import nn +from ipex_llm.transformers.models.common import attention_softmax from ipex_llm.transformers.models.utils import should_use_fuse_rope, rotate_half from ipex_llm.transformers.models.utils import mlp_fusion_check, SILU -from ipex_llm.transformers.models.utils import use_sdp, use_sdp_causal +from ipex_llm.transformers.models.utils import use_sdp, use_sdp_causal, get_compresskv_attn_mask from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp8_kv_cache -from ipex_llm.transformers.kv import DynamicNormalCache, DynamicFp8Cache +from ipex_llm.transformers.models.utils import should_use_compresskv, is_enough_kv_cache_room_4_36 +from ipex_llm.transformers.kv import DynamicNormalCache, DynamicFp8Cache, \ + DynamicCompressCache, DynamicCompressFp8Cache from typing import Optional, Tuple, List from transformers.models.phi.modeling_phi import repeat_kv from transformers.cache_utils import Cache +KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)) def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): @@ -94,6 +99,10 @@ def attention_forward( bsz, q_len, _ = hidden_states.size() + # [CompressKV] + use_compresskv = isinstance(past_key_value, DynamicCompressCache) + use_quantizekv = isinstance(past_key_value, DynamicFp8Cache) + qkv = self.qkv_proj(hidden_states) qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim) qkv = qkv.transpose(1, 2) @@ -127,43 +136,57 @@ def attention_forward( cos, sin, position_ids) if past_key_value is not None: - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, None) + # [CompressKV] + if use_compresskv: + enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, + self.layer_idx, + q_len) + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, + query_states, attention_mask, self.num_key_value_groups, + self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH) + else: 
+ key_states, value_states = past_key_value.update(key_states, value_states, + self.layer_idx, None) if use_sdp(q_len, kv_seq_len, self.head_dim, query_states): + # [CompressKV] + if use_compresskv: + attention_mask = get_compresskv_attn_mask(key_states, attention_mask) import xe_addons - if isinstance(past_key_value, DynamicFp8Cache): + if use_quantizekv: attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states, attention_mask) else: attn_output = xe_addons.sdp(query_states, key_states, value_states, attention_mask) - # disable sdp_causal to avoid overflow for now - # elif use_sdp_causal(q_len, kv_seq_len, self.head_dim, query_states, self.training): - # import xe_addons - # if isinstance(past_key_value, DynamicFp8Cache): - # attn_output = xe_addons.sdp_fp8_causal(query_states, key_states, - # value_states, attention_mask) - # else: - # attn_output = xe_addons.sdp_causal(query_states, key_states, - # value_states, attention_mask) - else: + elif ( + use_sdp_causal(q_len, kv_seq_len, self.head_dim, query_states, self.training) + and os.environ.get("IPEX_LLM_LOW_MEM", "0") == "1" + ): + import xe_addons if isinstance(past_key_value, DynamicFp8Cache): + attn_output = xe_addons.sdp_fp8_causal(query_states, key_states, + value_states, attention_mask) + else: + attn_output = xe_addons.sdp_causal(query_states, key_states, + value_states, attention_mask) + else: + if use_quantizekv: key_states, value_states = restore_fp8_kv_cache(key_states, value_states, query_states.dtype) # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = torch.matmul(query_states, - key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) + # use inplaced div, add and softmax to avoid double attn_weights memory usage + attn_weights.div_(math.sqrt(self.head_dim)) if attention_mask is not None: - attn_weights = attn_weights + attention_mask + attn_weights.add_(attention_mask) + attn_weights = attention_softmax(attn_weights, self.training) - # upcast attention to fp32 - attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, - dtype=torch.float32).to(value_states.dtype) attn_weights = torch.nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) attn_output = torch.matmul(attn_weights, value_states) @@ -233,13 +256,31 @@ def model_forward( ): # IPEX-LLM OPT: kv cache and quantize kv cache and sdp use_cache = use_cache if use_cache is not None else self.config.use_cache - input = input_ids if input_ids is not None else inputs_embeds - use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, input) + inputs = input_ids if input_ids is not None else inputs_embeds + use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs) + use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \ + isinstance(past_key_values, DynamicCompressCache) if use_cache: - if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache): + if use_compress_kv and not isinstance(past_key_values, + DynamicCompressCache): + if use_quantize_kv: + past_key_values = DynamicCompressFp8Cache.from_legacy_cache(past_key_values) + else: + past_key_values = DynamicCompressCache.from_legacy_cache(past_key_values) + if use_quantize_kv and not use_compress_kv and not isinstance(past_key_values, + DynamicFp8Cache): past_key_values = 
DynamicFp8Cache.from_legacy_cache(past_key_values) - if not use_quantize_kv and not isinstance(past_key_values, DynamicNormalCache): + if not use_quantize_kv and not use_compress_kv and not isinstance(past_key_values, + DynamicNormalCache): past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values) + if past_key_values.get_seq_length() == 0: + n_layer = self.config.num_hidden_layers + n_head = self.config.num_attention_heads + head_dim = self.config.hidden_size // self.config.num_attention_heads + past_key_values = DynamicNormalCache.from_reserved( + n_layer, inputs.size(0), n_head, inputs.size(1), head_dim, + self.dtype, inputs.device + ) return origin_model_forward( self=self, input_ids=input_ids, diff --git a/python/llm/src/ipex_llm/transformers/models/qwen2.py b/python/llm/src/ipex_llm/transformers/models/qwen2.py index 0306bb94f4e..802c5e7ec45 100644 --- a/python/llm/src/ipex_llm/transformers/models/qwen2.py +++ b/python/llm/src/ipex_llm/transformers/models/qwen2.py @@ -45,18 +45,18 @@ from torch.nn import CrossEntropyLoss from torch.nn.functional import scaled_dot_product_attention as sdpa +from ipex_llm.transformers.models.common import merge_qkv_base from ipex_llm.transformers.models.utils import SILU, mlp_fusion_check from ipex_llm.transformers.models.utils import should_use_fuse_rope from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp8_kv_cache, \ - should_use_compresskv, is_enough_kv_cache_room_4_36 + should_use_compresskv, is_enough_kv_cache_room_4_36, get_compresskv_attn_mask from ipex_llm.transformers.models.utils import use_flash_attention, use_sdp, use_sdp_causal -from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicNormalCache, DynamicCompressCache +from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicNormalCache, \ + DynamicCompressCache, DynamicCompressFp8Cache from ipex_llm.utils.common import invalidInputError from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention, Qwen2MLP from transformers.models.qwen2.modeling_qwen2 import apply_rotary_pos_emb, repeat_kv -from transformers.models.qwen2.modeling_qwen2 import _prepare_4d_causal_attention_mask_for_sdpa -from transformers.models.qwen2.modeling_qwen2 import _prepare_4d_causal_attention_mask from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast from transformers.cache_utils import Cache from transformers import logging @@ -76,12 +76,15 @@ def qwen2_model_forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, # for transformers >= 4.42 ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else \ - self.config.output_attentions + output_attentions = ( + output_attentions if output_attentions is not None + else self.config.output_attentions + ) output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else - self.config.output_hidden_states + output_hidden_states if output_hidden_states is not None + else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -90,8 +93,7 @@ def qwen2_model_forward( # retrieve input_ids and inputs_embeds if input_ids is not None and inputs_embeds is not None: invalidInputError(False, - "You cannot specify both decoder_input_ids and " - "decoder_inputs_embeds at the same time") + "You cannot specify both input_ids and 
inputs_embeds at the same time") elif input_ids is not None: batch_size, seq_length = input_ids.shape elif inputs_embeds is not None: @@ -118,17 +120,20 @@ def qwen2_model_forward( and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs, self.config.num_attention_heads//self.config.num_key_value_heads) ) - use_compress_kv = should_use_compresskv(inputs, inputs.shape[-1]) + use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \ + isinstance(past_key_values, DynamicCompressCache) if use_cache: - if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache): + if use_compress_kv and not isinstance(past_key_values, DynamicCompressCache): + if use_quantize_kv: + past_key_values = DynamicCompressFp8Cache.from_legacy_cache(past_key_values) + else: + past_key_values = DynamicCompressCache.from_legacy_cache(past_key_values) + elif use_quantize_kv and not use_compress_kv and not isinstance(past_key_values, + DynamicFp8Cache): past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values) - elif not use_quantize_kv and use_compress_kv and not isinstance(past_key_values, - DynamicCompressCache): - past_key_values = DynamicCompressCache.from_legacy_cache(past_key_values) if not use_quantize_kv and not use_compress_kv and not isinstance(past_key_values, - (DynamicNormalCache, - DynamicCompressCache)): + DynamicNormalCache): past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values) past_key_values_length = past_key_values.get_usable_length(seq_length) # ipex-llm changes end @@ -159,6 +164,9 @@ def qwen2_model_forward( "the input. " ) + from transformers.models.qwen2.modeling_qwen2 import _prepare_4d_causal_attention_mask_for_sdpa + from transformers.models.qwen2.modeling_qwen2 import _prepare_4d_causal_attention_mask + # ipex-llm changes start: don't generate `attention_mask` in specific cases if seq_length == 1 or batch_size == 1 and use_sdp_causal( seq_length, seq_length + past_key_values_length, @@ -259,6 +267,148 @@ def qwen2_model_forward( ) +def qwen2_model_forward_4_42( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, +) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + invalidInputError( + (input_ids is None) ^ (inputs_embeds is None), + "You cannot specify both input_ids and inputs_embeds at the same time, " + "and must specify either one" + ) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. " + "Setting `use_cache=False`..." 
+ ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + # ipex-llm changes start + # IPEX-LLM OPT: kv cache and quantize kv cache + use_quantize_kv = ( + self.config.hidden_size != 3584 # disable quantize kv in specific model + and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs_embeds, + self.config.num_attention_heads//self.config.num_key_value_heads) + ) + use_compress_kv = should_use_compresskv(inputs_embeds, inputs_embeds.shape[1]) or \ + isinstance(past_key_values, DynamicCompressCache) + + if use_cache: + if use_compress_kv and not isinstance(past_key_values, DynamicCompressCache): + if use_quantize_kv: + past_key_values = DynamicCompressFp8Cache.from_legacy_cache(past_key_values) + else: + past_key_values = DynamicCompressCache.from_legacy_cache(past_key_values) + elif use_quantize_kv and not use_compress_kv and not isinstance(past_key_values, + DynamicFp8Cache): + past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values) + if not use_quantize_kv and not use_compress_kv and not isinstance(past_key_values, + DynamicNormalCache): + past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values) + # ipex-llm changes end + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # ipex-llm changes start: remove `to_legacy_cache` + next_cache = None + if use_cache: + next_cache = next_decoder_cache + # ipex-llm changes end + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, + all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def qwen2_causal_lm_forward( self, input_ids: torch.LongTensor = None, @@ -271,6 +421,7 @@ def qwen2_causal_lm_forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: 
Optional[torch.LongTensor] = None, # for transformers >= 4.42 ) -> Union[Tuple, CausalLMOutputWithPast]: output_attentions = ( output_attentions if output_attentions is not None @@ -293,6 +444,7 @@ def qwen2_causal_lm_forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) hidden_states = outputs[0] @@ -328,30 +480,10 @@ def qwen2_causal_lm_forward( def merge_qkv(module: torch.nn.Module): - if isinstance(module, Qwen2Attention): - new_weight = torch.cat([ - module.q_proj.weight.data, - module.k_proj.weight.data, - module.v_proj.weight.data, - ], dim=0) - new_bias = torch.cat([ - module.q_proj.bias.data, - module.k_proj.bias.data, - module.v_proj.bias.data, - ], dim=-1) - - qkv_proj = torch.nn.Linear(0, 0, bias=True) - qkv_proj.weight = torch.nn.Parameter(new_weight, requires_grad=False) - qkv_proj.bias = torch.nn.Parameter(new_bias, requires_grad=False) - qkv_proj.in_features = new_weight.size(1) - qkv_proj.out_features = new_weight.size(0) - module.qkv_proj = qkv_proj - - del module.q_proj, module.k_proj, module.v_proj - - if os.environ.get("IPEX_LLM_LOW_MEM", None) == "1": - del module.rotary_emb.cos_cached - del module.rotary_emb.sin_cached + merge_qkv_base(module, Qwen2Attention) + if isinstance(module, Qwen2Attention) and os.environ.get("IPEX_LLM_LOW_MEM", None) == "1": + del module.rotary_emb.cos_cached + del module.rotary_emb.sin_cached def padding_mlp(module: torch.nn.Module): @@ -404,6 +536,7 @@ def qwen2_attention_forward( # [CompressKV] from ipex_llm.transformers.kv import DynamicCompressCache use_compresskv = isinstance(past_key_value, DynamicCompressCache) + use_quantizekv = isinstance(past_key_value, DynamicFp8Cache) if hasattr(self, 'qkv_proj') and self.qkv_proj is not None: qkv = self.qkv_proj(hidden_states) @@ -440,7 +573,8 @@ def qwen2_attention_forward( if past_key_value is not None: # [CompressKV] if use_compresskv: - enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx) + enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx, + q_len) key_states, value_states = past_key_value.update( key_states, value_states, self.layer_idx, query_states, attention_mask, self.num_key_value_groups, @@ -471,7 +605,9 @@ def qwen2_attention_forward( is_causal=True).to(hidden_states.dtype) elif use_sdp(q_len, kv_seq_len, self.head_dim, query_states): import xe_addons - if isinstance(past_key_value, DynamicFp8Cache): + if use_compresskv: + attention_mask = get_compresskv_attn_mask(key_states, attention_mask) + if use_quantizekv: attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states, attention_mask) else: @@ -479,14 +615,14 @@ def qwen2_attention_forward( attention_mask) elif use_sdp_causal(q_len, kv_seq_len, self.head_dim, query_states, self.training): import xe_addons - if isinstance(past_key_value, DynamicFp8Cache): + if use_quantizekv: attn_output = xe_addons.sdp_fp8_causal(query_states, key_states, value_states, attention_mask) else: attn_output = xe_addons.sdp_causal(query_states, key_states, value_states, attention_mask) else: - if isinstance(past_key_value, DynamicFp8Cache): + if use_quantizekv: key_states, value_states = restore_fp8_kv_cache(key_states, value_states, query_states.dtype) # repeat k/v heads if n_kv_heads < n_heads diff --git a/python/llm/src/ipex_llm/transformers/models/utils.py b/python/llm/src/ipex_llm/transformers/models/utils.py index 14375dd6a70..1de802967d1 100644 --- 
a/python/llm/src/ipex_llm/transformers/models/utils.py +++ b/python/llm/src/ipex_llm/transformers/models/utils.py @@ -487,14 +487,25 @@ def update_past_key_value(past_key_value, key_states, value_states, def should_use_compresskv(x: torch.Tensor, prompt_len: int): use_compress_kv = os.environ.get("IPEX_LLM_COMPRESS_KV_CACHE", None) - if use_compress_kv is None: - return ( - get_xpu_device_type(x) == "mtl" - and prompt_len >= 2500 - and prompt_len <= 4500 - ) + perf_mode = os.environ.get("IPEX_LLM_PERFORMANCE_MODE", None) + if perf_mode == "1": + return False else: - return x.device.type == 'xpu' and use_compress_kv == "1" + if use_compress_kv is None: + return ( + get_xpu_device_type(x) == "mtl" + and prompt_len >= 1800 + and prompt_len <= 4500 + ) + else: + return x.device.type == 'xpu' and use_compress_kv == "1" + + +def get_compresskv_attn_mask(key_states: torch.Tensor, + attention_mask: torch.Tensor): + if attention_mask is not None: + attention_mask = attention_mask[:, :, :, -key_states.size(2):] + return attention_mask def get_q_proj_or_qkv_proj(self): diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 2a3ecffcda6..63487dfaf92 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -27,7 +27,7 @@ from ipex_llm.utils.common.log4Error import invalidInputError from ipex_llm.transformers.utils import logger -from ipex_llm.transformers.npu_models.convert import optimize_llm +from ipex_llm.transformers.npu_models.convert import optimize_llm, optimize_llm_post def patch_flash_attn_import(filename: str) -> List[str]: @@ -38,7 +38,7 @@ def patch_flash_attn_import(filename: str) -> List[str]: return imports -def ignore_argument(kwargs: dict, key: 'str'): +def ignore_argument(kwargs: dict, key: "str"): arg = kwargs.pop(key, None) if arg is not None: warnings.warn(f"argument `{key}={arg}` will be ignored") @@ -46,10 +46,11 @@ def ignore_argument(kwargs: dict, key: 'str'): def save_low_bit(self, model_dir: str, *args, **kwargs): origin_device = self.device - kwargs['safe_serialization'] = False + kwargs["safe_serialization"] = False self.save_pretrained(model_dir, *args, **kwargs) import json import os + # We conveniently save all the keys of the model to have them on hand, # so that when using 'low_cpumem load', # it's not necessary to load the entire model to extract its keys @@ -57,7 +58,7 @@ def save_low_bit(self, model_dir: str, *args, **kwargs): load_keys = {"all_checkpoint_keys": list(self.state_dict().keys())} with open(os.path.join(model_dir, "load_keys.json"), "w") as json_file: json.dump(load_keys, json_file) - if origin_device != 'cpu': + if origin_device != "cpu": self.to(origin_device) @@ -66,9 +67,7 @@ class _BaseAutoModelClass: @classmethod @patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import) - def from_pretrained(cls, - *args, - **kwargs): + def from_pretrained(cls, *args, **kwargs): """ Load a model from a directory or the HF Hub. Use load_in_low_bit parameter to convert model to low-bit format, like int4 and int8. @@ -78,25 +77,28 @@ def from_pretrained(cls, :param load_in_low_bit: str value, options are ``'sym_int4'``, ``'sym_int8'``, ``'fp16'``, ``'fp32'``. Relevant low bit optimizations will be applied to the model. + :param optimize_model: boolean value, Whether to further optimize the low_bit llm model. + Default to be ``False``. 
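A hedged usage sketch of the NPU loading path touched here; the AutoModelForCausalLM entry point, the model id, and the kwarg values are assumptions for illustration, not a definitive API reference (the diff itself only shows that optimize_model, max_output_len, max_prompt_len, transpose_value_cache and modules_to_not_convert are now consumed by from_pretrained):

from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # assumed entry point

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",   # hypothetical model id
    load_in_low_bit="sym_int4",        # or "sym_int8"
    optimize_model=True,               # new kwarg; defaults to False per this diff
    max_output_len=1024,               # must exceed max_prompt_len
    max_prompt_len=512,
    transpose_value_cache=True,
    trust_remote_code=True,
)
model.save_low_bit("./llama2-7b-npu-sym-int4")  # attached dynamically after conversion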
:return: a model instance """ - if kwargs.get('device_map', None) not in [None, 'cpu', 'auto']: + if kwargs.get("device_map", None) not in [None, "cpu", "auto"]: warnings.warn("`device_map` will be ignored") - kwargs['device_map'] = 'cpu' + kwargs["device_map"] = "cpu" - if kwargs.get('torch_dtype', None) not in [None, 'auto', torch.float]: + if kwargs.get("torch_dtype", None) not in [None, "auto", torch.float, torch.float16]: warnings.warn("`torch_dtype` will be ignored, `torch.float` will be used") - kwargs['torch_dtype'] = torch.float + kwargs["torch_dtype"] = torch.float32 - low_bit = kwargs.pop('load_in_low_bit', 'sym_int4') + low_bit = kwargs.pop("load_in_low_bit", "sym_int4") qtype_map = { - 'sym_int4': "sym_int4_rtn", - 'sym_int8': "sym_int8_rtn", + "sym_int4": "sym_int4_rtn", + "sym_int8": "sym_int8_rtn", } - invalidInputError(low_bit in qtype_map.keys(), - f"unsupported low_bit: {low_bit}, " - f"only {list(qtype_map.keys())} are supported") + invalidInputError( + low_bit in qtype_map.keys(), + f"unsupported low_bit: {low_bit}, " f"only {list(qtype_map.keys())} are supported", + ) qtype = qtype_map[low_bit] kwargs["low_cpu_mem_usage"] = True @@ -110,53 +112,97 @@ def from_pretrained(cls, ignore_argument(kwargs, "mixed_precision") ignore_argument(kwargs, "cpu_embedding") ignore_argument(kwargs, "embedding_qtype") - ignore_argument(kwargs, "optimize_model") - ignore_argument(kwargs, "modules_to_not_convert") + ignore_argument(kwargs, "enable_mp") ignore_argument(kwargs, "quantization_config") ignore_argument(kwargs, "speculative") ignore_argument(kwargs, "pipeline_parallel_stages") + optimize_model = kwargs.pop("optimize_model", False) + max_output_len = kwargs.pop("max_output_len", 1024) + max_prompt_len = kwargs.pop("max_prompt_len", 512) + inter_pp = kwargs.pop("inter_pp", None) + intra_pp = kwargs.pop("intra_pp", None) + transpose_value_cache = kwargs.pop("transpose_value_cache", True) + modules_to_not_convert = kwargs.pop("modules_to_not_convert", []) _args = copy.deepcopy(args) _kwargs = copy.deepcopy(kwargs) try: # To handle the input CUDA setting (such as 'device_map={"":0}'), ignore it - kwargs.pop('device_map', None) + kwargs.pop("device_map", None) model = cls.HF_Model.from_pretrained(*args, **kwargs) except NotImplementedError: - logger.info("Failed to load models with `low_cpu_mem_usage` specified, " - "will fall to traditional load method with higher memory consumption.") + logger.info( + "Failed to load models with `low_cpu_mem_usage` specified, " + "will fall to traditional load method with higher memory consumption." 
+ ) _kwargs["low_cpu_mem_usage"] = False model = cls.HF_Model.from_pretrained(*_args, **_kwargs) model.config.update({"bigdl_lcmu_enabled": False}) logger.info(f"Converting model, it may takes up to several minutes ...") - from intel_npu_acceleration_library.compiler import create_npu_kernels - with torch.no_grad(): - optimize_llm(model) - cls.load_convert(qtype, model, 'cpu', *args, **kwargs) - create_npu_kernels(model) - - model = model.eval() - logger.info(f"Finish to convert model") - - model.config.update({"bigdl_transformers_low_bit": qtype}) + if optimize_model: + invalidInputError( + max_prompt_len < max_output_len, + ( + f"max_prompt_len ({max_prompt_len}) should be less" + " than max_output_len ({max_output_len})" + ), + ) + from ipex_llm.transformers.npu_models.convert_mp import optimize_llm, optimize_llm_pre - # add save_low_bit to pretrained model dynamically - model.save_low_bit = types.MethodType(save_low_bit, model) + if hasattr(model, "llm"): + llm = model.llm + else: + llm = model + + with torch.no_grad(): + optimize_llm_pre(model, qtype) + cls.load_convert(qtype, model, "cpu", modules_to_not_convert, *args, **kwargs) + create_npu_kernels(llm) + model = model.eval() + logger.info(f"Finish to convert model") + model.config.update({"bigdl_transformers_low_bit": qtype}) + model.share_memory() + + optimize_llm( + llm, + max_output_len=max_output_len, + max_prompt_len=max_prompt_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_value_cache, + ) + model.save_low_bit = types.MethodType(save_low_bit, model) + else: + from ipex_llm.transformers.npu_models.convert import optimize_llm + optimize_llm(model) + with torch.no_grad(): + cls.load_convert(qtype, model, "cpu", modules_to_not_convert, *args, **kwargs) + if hasattr(model, "llm"): + create_npu_kernels(model.llm) + else: + create_npu_kernels(model) + model = model.eval() + logger.info(f"Finish to convert model") + model.config.update({"bigdl_transformers_low_bit": qtype}) + # add save_low_bit to pretrained model dynamically + model.save_low_bit = types.MethodType(save_low_bit, model) return model @classmethod - def load_convert(cls, q_k, optimize_model, device, *arg, **kwarg): + def load_convert(cls, q_k, optimize_model, device, modules_to_not_convert, *arg, **kwarg): from ipex_llm.transformers.npu_models.convert import replace_with_QuantizedLinear - replace_with_QuantizedLinear(optimize_model, q_k, device=device) + + replace_with_QuantizedLinear(optimize_model, q_k, device=device, + modules_to_not_convert=modules_to_not_convert) @classmethod @patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import) def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs): - if kwargs.pop('torch_dtype', None) not in [None, 'auto', torch.float]: + if kwargs.pop("torch_dtype", None) not in [None, "auto", torch.float]: warnings.warn("`torch_dtype` will be ignored, `torch.float` will be used") # ignore following arguments @@ -164,20 +210,31 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs) ignore_argument(kwargs, "lightweight_bmm") ignore_argument(kwargs, "cpu_embedding") ignore_argument(kwargs, "embedding_qtype") - ignore_argument(kwargs, "optimize_model") ignore_argument(kwargs, "modules_to_not_convert") ignore_argument(kwargs, "speculative") ignore_argument(kwargs, "pipeline_parallel_stages") + optimize_model = kwargs.pop("optimize_model", False) + max_output_len = kwargs.pop("max_output_len", 1024) + max_prompt_len = 
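The `optimize_model` branch added to `from_pretrained` above pops the extra NPU arguments (`max_output_len`, `max_prompt_len`, `inter_pp`, `intra_pp`, `transpose_value_cache`, `modules_to_not_convert`) before the Hugging Face load and attaches `save_low_bit` to the returned model. A minimal usage sketch, assuming a Llama-2 style checkpoint (the model id is a placeholder):

    from ipex_llm.transformers.npu_model import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-chat-hf",   # placeholder model id / local path
        load_in_low_bit="sym_int4",        # or "sym_int8"
        optimize_model=True,               # enable the fused multi-process decoder path
        max_output_len=1024,
        max_prompt_len=512,                # must stay below max_output_len
        transpose_value_cache=True,
    )
    model.save_low_bit("./llama2-7b-npu-sym_int4")  # reload later with load_low_bit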
kwargs.pop("max_prompt_len", 512) + inter_pp = kwargs.pop("inter_pp", None) + intra_pp = kwargs.pop("intra_pp", None) + transpose_value_cache = kwargs.pop("transpose_value_cache", True) + modules_to_not_convert = kwargs.pop("modules_to_not_convert", []) from transformers.models.auto.configuration_auto import AutoConfig from transformers.modeling_utils import no_init_weights, get_state_dict_dtype - from transformers.dynamic_module_utils import resolve_trust_remote_code, \ - get_class_from_dynamic_module + from transformers.dynamic_module_utils import ( + resolve_trust_remote_code, + get_class_from_dynamic_module, + ) from transformers.models.auto.auto_factory import _get_model_class from transformers.utils.generic import ContextManagers from transformers.generation.configuration_utils import GenerationConfig - from ipex_llm.transformers.utils import extract_local_archive_file, get_local_shard_files, \ - load_state_dict + from ipex_llm.transformers.utils import ( + extract_local_archive_file, + get_local_shard_files, + load_state_dict, + ) from accelerate.big_modeling import init_empty_weights trust_remote_code = kwargs.pop("trust_remote_code", None) @@ -206,14 +263,18 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs) qtype = config_dict.pop("bigdl_transformers_low_bit", False) bigdl_lcmu_enabled = config_dict.pop("bigdl_lcmu_enabled", True) - invalidInputError(qtype, - "Detect this model is not a low-bit model, Please use from_pretrained" - " with load_in_4bit or load_in_low_bit to get a low-bit model , and " - " serialize the model using save_low_bit first.") + invalidInputError( + qtype, + "Detect this model is not a low-bit model, Please use from_pretrained" + " with load_in_4bit or load_in_low_bit to get a low-bit model , and " + " serialize the model using save_low_bit first.", + ) - invalidInputError(qtype in ["sym_int8_rtn", "sym_int4_rtn"], - f"Unknown bigdl_transformers_low_bit value: {qtype}," - f" expected: sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.") + invalidInputError( + qtype in ["sym_int8_rtn", "sym_int4_rtn"], + f"Unknown bigdl_transformers_low_bit value: {qtype}," + f" expected: sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.", + ) has_remote_code = hasattr(config, "auto_map") and cls.HF_Model.__name__ in config.auto_map has_local_code = type(config) in cls.HF_Model._model_mapping.keys() @@ -233,15 +294,13 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs) model_class = _get_model_class(config, cls.HF_Model._model_mapping) resolved_archive_file, is_sharded = extract_local_archive_file( - pretrained_model_name_or_path, - subfolder, - variant) + pretrained_model_name_or_path, subfolder, variant + ) if is_sharded: - resolved_archive_file, sharded_metadata = \ - get_local_shard_files(pretrained_model_name_or_path, - resolved_archive_file, - subfolder=subfolder) + resolved_archive_file, sharded_metadata = get_local_shard_files( + pretrained_model_name_or_path, resolved_archive_file, subfolder=subfolder + ) # set dtype to instantiate the model under: # 1. 
If torch_dtype is not None, we use that dtype @@ -265,9 +324,11 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs) torch_dtype = get_state_dict_dtype(one_state_dict) del one_state_dict # free CPU memory else: - invalidInputError(False, - f'`torch_dtype` can be either `torch.dtype` or `"auto"`,' - 'but received {torch_dtype}') + invalidInputError( + False, + f'`torch_dtype` can be either `torch.dtype` or `"auto"`,' + "but received {torch_dtype}", + ) dtype_orig = model_class._set_default_torch_dtype(torch_dtype) # Pretrained Model @@ -277,8 +338,10 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs) if bigdl_lcmu_enabled: with ContextManagers(init_contexts): - if config.architectures is not None and config.architectures[0] in \ - ["ChatGLMModel", "ChatGLMForConditionalGeneration"]: + if config.architectures is not None and config.architectures[0] in [ + "ChatGLMModel", + "ChatGLMForConditionalGeneration", + ]: """ ChatGLMModel uses skip_init by default, which will force modules placed on cpu @@ -294,20 +357,45 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs) quant_device = "meta" if bigdl_lcmu_enabled else "cpu" logger.info(f"Converting model, it may takes up to several minutes ...") from intel_npu_acceleration_library.compiler import create_npu_kernels - with torch.no_grad(): - optimize_llm(model) - cls.load_convert(qtype, model, quant_device, *model_args, **kwargs) - create_npu_kernels(model) - model = model.eval() + if optimize_model: + invalidInputError( + max_prompt_len < max_output_len, + ( + f"max_prompt_len ({max_prompt_len}) should be less" + " than max_output_len ({max_output_len})" + ), + ) + from ipex_llm.transformers.npu_models.convert_mp import optimize_llm_pre + + if hasattr(model, "llm"): + llm = model.llm + else: + llm = model + + with torch.no_grad(): + optimize_llm_pre(model, qtype) + cls.load_convert(qtype, model, quant_device, modules_to_not_convert, + *model_args, **kwargs) + create_npu_kernels(llm) + + else: + from ipex_llm.transformers.npu_models.convert import optimize_llm + optimize_llm(model) + with torch.no_grad(): + cls.load_convert(qtype, model, quant_device, modules_to_not_convert, + *model_args, **kwargs) + create_npu_kernels(model) if is_sharded: loaded_state_dict_keys = sharded_metadata["all_checkpoint_keys"] else: import os import json - with open(os.path.join(pretrained_model_name_or_path, - "load_keys.json"), "r") as json_file: + + with open( + os.path.join(pretrained_model_name_or_path, "load_keys.json"), "r" + ) as json_file: loaded_data = json.load(json_file) loaded_state_dict_keys = loaded_data["all_checkpoint_keys"] @@ -356,6 +444,17 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs) for param in model.parameters(): param.requires_grad_(False) + if optimize_model: + from ipex_llm.transformers.npu_models.convert_mp import optimize_llm + optimize_llm( + llm, + max_output_len=max_output_len, + max_prompt_len=max_prompt_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_value_cache, + ) + return model diff --git a/python/llm/src/ipex_llm/transformers/npu_models/baichuan_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/baichuan_mp.py new file mode 100644 index 00000000000..25cb790db99 --- /dev/null +++ b/python/llm/src/ipex_llm/transformers/npu_models/baichuan_mp.py @@ -0,0 +1,1038 @@ +# +# Copyright 2016 The BigDL Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import torch +import time +import argparse + +from ipex_llm.transformers.npu_model import AutoModelForCausalLM +from transformers import AutoTokenizer +from intel_npu_acceleration_library.backend.factory import NNFactory +from typing import Optional, Sequence, List, Union, Any, Tuple +import numpy as np +import math +from intel_npu_acceleration_library.backend.runtime import set_contiguous, record_function +from intel_npu_acceleration_library.backend.runtime import adapt_output_tensor, _model_cache +from collections import deque +from transformers.cache_utils import Cache +from intel_npu_acceleration_library.backend.bindings import lib as backend_lib +import ctypes +from ipex_llm.utils.common import invalidInputError +from typing import Optional, List, Generator +import uuid +from functools import partial +import torch.nn.functional as F +import torch.nn.parallel +import torch.distributed as dist +from filelock import FileLock + +from transformers.utils import logging + +logger = logging.get_logger(__name__) +import gc +from colorama import Fore, Back, Style +import torch.multiprocessing as mp +from transformers.cache_utils import Cache +from transformers.modeling_outputs import BaseModelOutputWithPast +from ipex_llm.transformers.npu_models.mp_models_base import run_model +from ipex_llm.transformers.npu_models.mp_models_base import LLMBaseNNFactory + + +class LowBitBaichuanMultiDecoderlayer(LLMBaseNNFactory): + def __init__( + self, + # batch_size: int, + # seq_len: int, + # hidden_size: int, + hidden_shape: Sequence[int], + *shapes, + num_heads: int, + # num_key_value_heads: int, + num_layers: int, + cached_cos, + cached_sin, + input_layernorm_weights=None, + post_attn_layernorm_weights=None, + mode: str = "prefill", + dtype: np.dtype = np.int8, + max_seq_len: int = 1024, + transpose_value: bool = False, + profile: bool = False, + device: str = "NPU", + rms_norm_eps, + intermediate_size, + ): + super().__init__(max_seq_len=max_seq_len, + transpose_value=transpose_value, + dtype=dtype, + profile=profile, + device=device) + self.max_seq_len = max_seq_len + self.intermediate_size = intermediate_size + self.dtype = dtype + self.cached_cos = cached_cos + self.cached_sin = cached_sin + self.batch_size, self.seq_len, self.hidden_size = hidden_shape + self.mode = mode + self.rms_norm_eps = rms_norm_eps + self.transpose_value = transpose_value + self.num_layers = num_layers + + cos = self.constant(self.cached_cos) + self.cos = self.unsqueeze(cos, axis=0) + + sin = self.constant(self.cached_sin) + self.sin = self.unsqueeze(sin, axis=0) + + if mode == "decode": + self.kv_seq_len = self.max_seq_len + 1 + else: + self.kv_seq_len = self.seq_len + + self.num_heads = num_heads + + self.head_dim = self.hidden_size // self.num_heads + + # define input, the order self.parameter matters + input = self.create_input_op((self.batch_size, self.seq_len, self.hidden_size)) + + # Self Attention + if mode == "decode": + attention_mask = 
self.create_input_op((self.batch_size, 1, 1, self.max_seq_len + 1)) + else: + attention_mask = self.create_input_op((self.batch_size, 1, self.seq_len, self.seq_len)) + + position_ids = self.create_input_op((self.batch_size, self.seq_len)) + # self.num_key_value_heads = num_key_value_heads + past_keys = [] + past_values = [] + if mode == "decode": + for i in range(num_layers): + past_key = self.create_cache_op( + (self.batch_size, self.num_heads, self.max_seq_len, self.head_dim) + ) + if transpose_value: + past_value = self.create_cache_op( + (self.batch_size, self.num_heads, self.head_dim, self.max_seq_len) + ) + else: + past_value = self.create_cache_op( + (self.batch_size, self.num_heads, self.max_seq_len, self.head_dim) + ) + past_keys.append(past_key) + past_values.append(past_value) + else: + past_keys = [None] * num_layers + past_values = [None] * num_layers + + if input_layernorm_weights is None: + input_layernorm_weights = [] + post_attn_layernorm_weights = [] + for i in range(num_layers): + input_layernorm_weights.append( + self.create_input_op( + ( + 1, + self.hidden_size, + ) + ) + ) + post_attn_layernorm_weights.append( + self.create_input_op( + ( + 1, + self.hidden_size, + ) + ) + ) + else: + input_layernorm_weights = [self.constant(w) for w in input_layernorm_weights] + post_attn_layernorm_weights = [self.constant(w) for w in post_attn_layernorm_weights] + + hidden_states = input + + curr_key_values = [] + for i in range(num_layers): + hidden_states, new_key_states, new_value_states = self.build_decoder( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + input_layernorm_weight=input_layernorm_weights[i], + post_attention_layernorm_weight=post_attn_layernorm_weights[i], + past_key=past_keys[i], + past_value=past_values[i], + ) + curr_key_values.append((new_key_states, new_value_states)) + + # define outputs + hidden_states = self.convert_to_fp16(hidden_states) + + for i in range(num_layers): + new_key_states = self.convert_to_fp16(curr_key_values[i][0]) + new_value_states = self.convert_to_fp16(curr_key_values[i][1]) + + print("start compiling") + self.compile() + + def attention(self, + *, + hidden_states, + position_ids, + attention_mask, + past_key, + past_value, + cos, + sin, + mode, + num_heads, + head_dim, + seq_len, + q_bias=None, + k_bias=None, + v_bias=None): + hidden_size = num_heads * head_dim + proj = self.linear( + hidden_states, + 3 * hidden_size, + hidden_size, + bias=False, + wt_dtype=self.dtype + ) + proj = self.reshape(proj, [-1, 3, hidden_size]) # b*s, 3, h + proj = self.unsqueeze(proj, [0]) # b, s, 3, h + proj = self.transpose(proj, [2, 1, 0, 3]) # 3, s, b, h + proj = self.squeeze(proj) # 3, b*s, h + query_states = self.reshape(proj[0, ...], [1, self.seq_len, num_heads, head_dim]) + query_states = self.transpose(query_states, [0, 2, 1, 3]) + key_states = self.reshape(proj[1, ...], [1, self.seq_len, num_heads, head_dim]) + key_states = self.transpose(key_states, [0, 2, 1, 3]) + value_states = self.reshape(proj[2, ...], [1, self.seq_len, num_heads, head_dim]) + if self.transpose_value: + value_states = self.transpose(value_states, [0, 2, 3, 1]) + else: + value_states = self.transpose(value_states, [0, 2, 1, 3]) + + cos = self.unsqueeze(self.squeeze(cos), [0]) + sin = self.unsqueeze(self.squeeze(sin), [0]) + + query_states, key_states = self.apply_rotary_pos_emb( + q=query_states, + k=key_states, + cos=cos, + sin=sin, + position_ids=position_ids, + num_heads=num_heads, + seq_len=seq_len, + head_dim=head_dim, + ) + 
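For intuition, the `attention` graph above splits the single fused `W_pack` projection into query/key/value with a reshape/transpose/squeeze sequence; the same shape gymnastics in plain eager PyTorch (toy sizes, not the NPU graph API):

    import torch

    b, s, n_heads, head_dim = 1, 4, 2, 8
    h = n_heads * head_dim
    proj = torch.randn(b * s, 3 * h)              # fused W_pack output: (b*s, 3*h)
    proj = proj.reshape(-1, 3, h).unsqueeze(0)    # (1, b*s, 3, h)
    proj = proj.permute(2, 1, 0, 3).squeeze(2)    # (3, b*s, h)
    q = proj[0].reshape(b, s, n_heads, head_dim).transpose(1, 2)  # (b, heads, s, dim)
    k = proj[1].reshape(b, s, n_heads, head_dim).transpose(1, 2)
    v = proj[2].reshape(b, s, n_heads, head_dim).transpose(1, 2)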
new_key_states = key_states + new_value_states = value_states + + if self.mode == "decode": + key_states = self.concat(past_key, key_states, axis=-2) + if self.transpose_value: + value_states = self.concat(past_value, value_states, axis=-1) + else: + value_states = self.concat(past_value, value_states, axis=-2) + + attn_weight = self.matmul(query_states, key_states, False, True) / ( + math.sqrt(self.head_dim)) + attn_weight = self.eltwise_add(attn_weight, attention_mask) + attn_weight = self.convert_to_fp32(attn_weight) + attn_weight = self.softmax(attn_weight, -1) + attn_weight = self.convert_to_fp16(attn_weight) + attn_output = self.matmul(attn_weight, value_states, False, self.transpose_value) + + attn_output = self.transpose(attn_output, [0, 2, 1, 3]) + attn_output = self.reshape(attn_output, [1, seq_len, hidden_size]) + + attn_output = self.linear( + attn_output, hidden_size, hidden_size, bias=False, wt_dtype=self.dtype + ) + return attn_output, new_key_states, new_value_states + + def build_decoder( + self, + hidden_states, + attention_mask, + position_ids, + input_layernorm_weight, + post_attention_layernorm_weight, + past_key=None, + past_value=None, + ): + + residual = hidden_states + + input_2d = self.reshape(hidden_states, (self.batch_size * self.seq_len, self.hidden_size)) + input_2d = self.layer_norm(input_2d, input_layernorm_weight) + + # attention + attn_output, new_key_states, new_value_states = self.attention( + hidden_states=input_2d, + position_ids=position_ids, + attention_mask=attention_mask, + past_key=past_key, + past_value=past_value, + cos=self.cos, + sin=self.sin, + mode=self.mode, + num_heads=self.num_heads, + head_dim=self.head_dim, + seq_len=self.seq_len, + ) + + hidden_states = self.eltwise_add(residual, attn_output) + residual = hidden_states + hidden_states = self.layer_norm(hidden_states, post_attention_layernorm_weight) + hidden_states = self.mlp(hidden_states) + hidden_states = self.eltwise_add(residual, hidden_states) + hidden_states = self.convert_to_fp16(hidden_states) + + return hidden_states, new_key_states, new_value_states + + +class FusedBaichuanLowBitMultiDecoderlayer(torch.nn.Module): + + def __init__( + self, + parameters: List[Tuple[torch.Tensor]], + input_laynorm_weights: List[torch.Tensor], + post_attn_layernorm_weights: List[torch.Tensor], + layer_indexes: List[int], + intra_stages: int, + cached_cos: torch.Tensor, + cached_sin: torch.Tensor, + num_heads: int, + head_dim: int, + # num_key_value_heads: int, + rms_norm_eps, + intermediate_size, + max_seq_len: int = 1024, + transpose_value: bool = False, + do_print: bool = False, + ): + super().__init__() + + self.do_print = do_print + + op_parameters = [] + for w in parameters: + if isinstance(w, tuple): # from QuantizedLinear + op_parameters.append((w[0].numpy(), w[1].numpy())) + else: + op_parameters.append(w.to(torch.float16).numpy()) + self.op_parameters = op_parameters + self.op_id = str(uuid.uuid4()) + self.max_seq_len = max_seq_len + self.transpose_value = transpose_value + if isinstance(parameters[0], tuple): + np_dtype = np.int8 if parameters[0][0].dtype == torch.int8 else np.uint8 + else: # FP16 Linear + np_dtype = np.float16 + + self.intra_stages = intra_stages + self.layer_indexes = layer_indexes + num_layers = len(self.layer_indexes) // intra_stages + self.layer_ranges = [] + for i in range(intra_stages): + if i == intra_stages - 1: + self.layer_ranges.append((i * num_layers, len(self.layer_indexes))) + else: + self.layer_ranges.append((i * num_layers, (i + 1) * num_layers)) + + 
self.backend_decoders = [] + + for i in range(intra_stages): + start, end = self.layer_ranges[i] + lm_0 = input_laynorm_weights[start:end] + lm_1 = post_attn_layernorm_weights[start:end] + decoder = LowBitBaichuanMultiDecoderlayer( + [1, 1, num_heads * head_dim], + input_layernorm_weights=lm_0, + post_attn_layernorm_weights=lm_1, + cached_cos=cached_cos, + cached_sin=cached_sin, + num_heads=num_heads, + # num_key_value_heads=num_key_value_heads, + num_layers=end - start, + max_seq_len=max_seq_len, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + mode="decode", + transpose_value=self.transpose_value, + dtype=np_dtype, + ) + self.backend_decoders.append(decoder) + + for i in range(intra_stages): + start, end = self.layer_ranges[i] + self.backend_decoders[i].set_weights(self.op_id, op_parameters[start * 5:end * 5]) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> torch.Tensor: + + inputs = ( + hidden_states.to(torch.float16), + attention_mask, + position_ids, + ) + + for i in range(self.intra_stages): + start, end = self.layer_ranges[i] + self.backend_decoders[i].update_cache(past_key_value, self.layer_indexes[start:end]) + + hidden_states, new_keys, new_values = LowBitBaichuanMultiDecoderlayer.run_decoders( + inputs, + decoders=self.backend_decoders) + + if self.do_print: + print("outputs:", hidden_states) + + outputs = (hidden_states,) + outputs += (past_key_value, new_keys, new_values) + return outputs + + def post_forward(self, past_key_value, new_keys, new_values): + key_value_states = [] + for i in range(self.intra_stages): + for j in range(1, len(self.backend_decoders[i].torch_out)): + key_value_states.append(self.backend_decoders[i].torch_out[j]) + + cache_kwargs = { + "max_seq_len": self.max_seq_len, + "transpose": self.transpose_value, + } + + for i in range(len(self.layer_indexes)): + key_states, value_states = past_key_value.update( + new_keys[i], + new_values[i], + self.layer_indexes[i], + cache_kwargs, + ) + + for i in range(self.intra_stages): + self.backend_decoders[i].load_cache_async() + + +class FusedBaichuanLowBitDecoderlayer(torch.nn.Module): + """LLAMA MLP operation NPU backend.""" + + def __init__( + self, + parameters: List[torch.Tensor], + cached_cos, + cached_sin, + layer_norm_0, + layer_norm_1, + num_heads: int, + # num_key_value_heads: int, + layer_idx: int, + rms_norm_eps, + intermediate_size, + max_seq_len: int = 128, + transpose_value: bool = False, + ): + super().__init__() + self.op_parameters = parameters + self.op_id = str(uuid.uuid4()) + self.layer_idx = layer_idx + self.max_seq_len = max_seq_len + self.transpose_value = transpose_value + # self.rotary_emb = rotary_emb + if isinstance(parameters[0], tuple): # weight, scale from QuantizedLinear + np_dtype = np.int8 if parameters[0][0].dtype == torch.int8 else np.uint8 + else: # FP16 Linear + np_dtype = np.float16 + + self.backend_cls_prefill = partial( + LowBitBaichuanMultiDecoderlayer, + num_heads=num_heads, + # num_key_value_heads=num_key_value_heads, + num_layers=1, + cached_cos=cached_cos, + cached_sin=cached_sin, + input_layernorm_weights=None, + post_attn_layernorm_weights=None, + max_seq_len=max_seq_len, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + mode="prefill", + transpose_value=self.transpose_value, + 
dtype=np_dtype, + ) + self.layer_norm_0 = layer_norm_0 + self.layer_norm_1 = layer_norm_1 + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> torch.Tensor: + """Torch module forward method. + + Args: + x (torch.Tensor): Input tensor + + Returns: + torch.Tensor: result + """ + + seq_len = hidden_states.shape[1] + + backend_cls = self.backend_cls_prefill + inputs = (hidden_states.to(torch.float16), attention_mask, position_ids) + inputs += (self.layer_norm_0, self.layer_norm_1) + hidden_states, past_key, past_value = run_model( + inputs, self.op_parameters, backend_cls, self.op_id, replica=2 + ) + cache_kwargs = { + "max_seq_len": self.max_seq_len, + "transpose": self.transpose_value, + } + key_states, value_states = past_key_value.update( + past_key, past_value, self.layer_idx, cache_kwargs + ) + + outputs = (hidden_states,) + outputs += (past_key_value,) + return outputs + + +def run_decode( + model, + rank, + world_size, + port, + layer_start, + layer_end, + intra_stages, + max_seq_len, + transpose_value_cache, + input_queue, + result_queue, +): + + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = port + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + + print("start init process group, rank: ", rank, "world_size: ", world_size) + + dist.init_process_group() + my_rank = dist.get_rank() + my_size = dist.get_world_size() + logger.info(f"rank: {my_rank}, size: {my_size}") + + num_heads = model.model.layers[layer_start].self_attn.num_heads + # num_key_value_heads = model.model.layers[layer_start].self_attn.num_key_value_heads + head_dim = model.model.layers[layer_start].self_attn.head_dim + rms_norm_eps = model.config.rms_norm_eps + intermediate_size = model.config.intermediate_size + deocderlayers = [] + layer_weights = [] + input_layer_norm_weights = [] + post_attn_layernorm_weights = [] + layer_indexs = range(layer_start, layer_end) + for layer_idx in layer_indexs: + curr_layer = model.model.layers[layer_idx] + attn_layer = curr_layer.self_attn + mlp_layer = curr_layer.mlp + + weights = [ + (attn_layer.W_pack.weight, attn_layer.W_pack.scale), + (attn_layer.o_proj.weight, attn_layer.o_proj.scale), + (mlp_layer.gate_proj.weight, mlp_layer.gate_proj.scale), + (mlp_layer.up_proj.weight, mlp_layer.up_proj.scale), + (mlp_layer.down_proj.weight, mlp_layer.down_proj.scale), + ] + + cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) + cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) + layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16) + layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16) + + layer_weights.extend(weights) + input_layer_norm_weights.append(layer_norm_0) + post_attn_layernorm_weights.append(layer_norm_1) + + multi_decoder = FusedBaichuanLowBitMultiDecoderlayer( + parameters=layer_weights, + input_laynorm_weights=input_layer_norm_weights, + post_attn_layernorm_weights=post_attn_layernorm_weights, + layer_indexes=layer_indexs, + intra_stages=intra_stages, + cached_cos=cached_cos, + cached_sin=cached_sin, + num_heads=num_heads, + head_dim=head_dim, + # num_key_value_heads=num_key_value_heads, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + max_seq_len=max_seq_len, + transpose_value=transpose_value_cache, + 
do_print=False, + ) + + dist.barrier() + + past_key_values = None + + control = torch.empty((), dtype=torch.int) + hidden_states = torch.empty((1, 1, head_dim * num_heads), dtype=torch.float16) + with torch.inference_mode(): + while True: + + dist.broadcast(control, src=0) + if control.item() == -2: + break + elif control.item() == -1: + past_key_values = input_queue.get() + else: + t0 = time.perf_counter() + past_key_values_length = past_key_values.get_seq_length() + seq_length_with_past = 1 + past_key_values_length + position_ids = torch.arange( + past_key_values_length, seq_length_with_past, dtype=torch.long + ) + position_ids = position_ids.unsqueeze(0).view(-1, 1) + attention_mask = torch.ones((1, seq_length_with_past), dtype=torch.bool) + attention_mask = model.model._prepare_decoder_attention_mask( + attention_mask, (1, 1), hidden_states, past_key_values_length + ) + + pad_len = multi_decoder.max_seq_len + 1 - attention_mask.size(-1) + + pad_mask = (0, pad_len) + padded_causal_mask = F.pad( + attention_mask.to(torch.float16), pad_mask, value=torch.finfo(torch.float16).min + ) + padded_causal_mask[:, :, :, -1] = 0.0 + dist.recv(hidden_states, src=rank - 1) + t1 = time.perf_counter() + layer_outputs = multi_decoder( + hidden_states, + attention_mask=padded_causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=False, + use_cache=True, + ) + t2 = time.perf_counter() + hidden_states = layer_outputs[0] + t3 = time.perf_counter() + dist.send(hidden_states, dst=(rank + 1) % world_size) + t4 = time.perf_counter() + past_key_values = layer_outputs[1] + new_keys = layer_outputs[2] + new_values = layer_outputs[3] + multi_decoder.post_forward(past_key_values, new_keys, new_values) + + +class DecodeRunner: + def __init__(self, model, max_seq_len, intra_pp=2, inter_pp=2, transpose_value_cache=True): + self.model = model + self.max_seq_len = max_seq_len + self.transpose_value_cache = transpose_value_cache + world_size = inter_pp + 1 + intra_stages = intra_pp + num_layers = self.model.model.config.num_hidden_layers + + port = "54791" + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = port + os.environ["RANK"] = "0" + os.environ["WORLD_SIZE"] = str(world_size) + + self.input_queues = [] + self.output_queues = [] + self.decoder_processes = [] + + for rank in range(1, world_size): + input_q = mp.Queue() + output_q = mp.Queue() + start_layer = (rank - 1) * (num_layers // (world_size - 1)) + end_layer = (rank) * (num_layers // (world_size - 1)) + if rank == world_size - 1: + end_layer = num_layers + p = mp.Process( + target=run_decode, + args=( + self.model, + rank, + world_size, + port, + start_layer, + end_layer, + intra_stages, + self.max_seq_len, + self.transpose_value_cache, + input_q, + output_q, + ), + ) + p.daemon = True + p.start() + self.input_queues.append(input_q) + self.output_queues.append(output_q) + self.decoder_processes.append(p) + + dist.init_process_group() + my_rank = dist.get_rank() + self.world_size = dist.get_world_size() + logger.info(f"rank: {my_rank}, size: {self.world_size}") + + dist.barrier() + self.cache_past_key_value = None + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ): + t0 = time.perf_counter() + + if self.cache_past_key_value != past_key_value: + control = 
torch.tensor(-1, dtype=torch.int) + dist.broadcast(control, src=0) + for i in range(len(self.decoder_processes)): + self.input_queues[i].put(past_key_value) + + control = torch.tensor(0, dtype=torch.int) + dist.broadcast(control, src=0) + hidden_states = hidden_states.to(torch.float16) + dist.send(hidden_states, dst=1) + past_key_value.expand(self.transpose_value_cache) + dist.recv(hidden_states, src=self.world_size - 1) + t1 = time.perf_counter() + return hidden_states, past_key_value + + def shutdown(self): + control = torch.tensor(-2, dtype=torch.int) + dist.broadcast(control, src=0) + for p in self.decoder_processes: + p.join(3) + for p in self.decoder_processes: + if p.exitcode is None: + p.kill() + + def __del__(self): + self.shutdown() + + +def run_prefill( + model, max_output_len, max_prompt_len, transpose_value_cache, input_queue, result_queue +): + + layer_start = 0 + layer_end = len(model.model.layers) + num_heads = model.model.layers[layer_start].self_attn.num_heads + # num_key_value_heads = model.model.layers[layer_start].self_attn.num_key_value_heads + head_dim = model.model.layers[layer_start].self_attn.head_dim + rms_norm_eps = model.config.rms_norm_eps + intermediate_size = model.config.intermediate_size + deocderlayers = [] + layer_weights = [] + input_layer_norm_weights = [] + post_attn_layernorm_weights = [] + layer_indexs = range(layer_start, layer_end) + for layer_idx in layer_indexs: + curr_layer = model.model.layers[layer_idx] + attn_layer = curr_layer.self_attn + mlp_layer = curr_layer.mlp + + weights = [ + (attn_layer.W_pack.weight, attn_layer.W_pack.scale), + (attn_layer.o_proj.weight, attn_layer.o_proj.scale), + (mlp_layer.gate_proj.weight, mlp_layer.gate_proj.scale), + (mlp_layer.up_proj.weight, mlp_layer.up_proj.scale), + (mlp_layer.down_proj.weight, mlp_layer.down_proj.scale), + ] + + cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) + cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) + + layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16) + layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16) + + new_decoderlayer = FusedBaichuanLowBitDecoderlayer( + weights, + num_heads=num_heads, + # num_key_value_heads=num_key_value_heads, + cached_cos=cached_cos, + cached_sin=cached_sin, + layer_norm_0=layer_norm_0, + layer_norm_1=layer_norm_1, + layer_idx=layer_idx, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + max_seq_len=max_output_len, + transpose_value=transpose_value_cache, + ) + + layer_weights.extend(weights) + input_layer_norm_weights.append(layer_norm_0) + post_attn_layernorm_weights.append(layer_norm_1) + model.model.layers[layer_idx] = new_decoderlayer + deocderlayers.append(new_decoderlayer) + + print("finish creating all decode layers in prefill") + result_queue.put("loading finish") + + while True: + + result = input_queue.get() + if result == "stop": + break + + hidden_states, position_ids, causal_mask, past_key_values = result + with torch.inference_mode(): + for decoder_layer in deocderlayers: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=False, + use_cache=True, + # cache_position=cache_position, + ) + + hidden_states = layer_outputs[0] + next_decoder_cache = layer_outputs[1] + + result_queue.put((hidden_states, next_decoder_cache)) + + +class PrefillRunner: + def __init__(self, model, max_output_len, max_prompt_len, 
transpose_value_cache): + self.model = model + self.max_output_len = max_output_len + self.max_prompt_len = max_prompt_len + self.transpose_value_cache = transpose_value_cache + + self.prefill_result_queue = mp.Queue() + self.prefill_input_queue = mp.Queue() + + self.p = mp.Process( + target=run_prefill, + args=( + model, + max_output_len, + max_prompt_len, + transpose_value_cache, + self.prefill_input_queue, + self.prefill_result_queue, + ), + ) + self.p.daemon = True + self.p.start() + output = self.prefill_result_queue.get() + print(Fore.GREEN + f"prefill process output: {output}") + print(Style.RESET_ALL) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ): + seq_len = hidden_states.size(1) + invalidInputError( + seq_len <= self.max_prompt_len, + ( + f"seq_len: {seq_len} should be less than or equal" + " to max_prompt_len {self.max_prompt_len}" + ), + ) + pad_len = self.max_prompt_len - seq_len + hidden_states = F.pad(hidden_states.to(torch.float16), (0, 0, 0, pad_len), value=0.0) + position_ids = F.pad(position_ids, (0, pad_len), value=0) + attention_mask = F.pad( + attention_mask.to(torch.float16), + (0, pad_len, 0, pad_len), + value=torch.finfo(torch.float16).min, + ) + + args = (hidden_states, position_ids, attention_mask, past_key_value) + self.prefill_input_queue.put(args) + hidden_states, past_key_value = self.prefill_result_queue.get() + past_key_value.shrink(seq_len, self.transpose_value_cache) + hidden_states = hidden_states[:, :seq_len, :] + return hidden_states, past_key_value + + def shutdown(self): + self.prefill_input_queue.put("stop") + self.p.join(3) + if self.p.exitcode is None: + self.p.kill() + + def __del__(self): + self.shutdown() + + +def gen_baichuan_fused_model_forward(prefill_runner, decode_runner): + def baichuan_fused_model_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + t0 = time.perf_counter() + output_attentions = ( + output_attentions if output_attentions is not None else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + invalidInputError(False, "You cannot specify both decoder_input_ids\ + and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + invalidInputError(False, "You have to specify either decoder_input_ids\ + or decoder_inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + # ipex-llm changes start + from 
ipex_llm.transformers.npu_models.kv import DynamicFusedNormalCache + + if use_cache and not isinstance(past_key_values, DynamicFusedNormalCache): + past_key_values = DynamicFusedNormalCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_seq_length() + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, + dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + # embed positions + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + if self.gradient_checkpointing and self.training and use_cache: + use_cache = False + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing.\ + Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + if seq_length == 1: + layers_runner = decode_runner + else: + layers_runner = prefill_runner + layer_outputs = layers_runner.forward( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[1] + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # ipex-llm changes start + next_cache = next_decoder_cache if use_cache else None + # ipex-llm changes end + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + t1 = time.perf_counter() + # print("fused model forward time: ", t1 - t0) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + return baichuan_fused_model_forward diff --git a/python/llm/src/ipex_llm/transformers/npu_models/common.py b/python/llm/src/ipex_llm/transformers/npu_models/common.py index bb08b1abea5..32841838d6d 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/common.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/common.py @@ -30,3 +30,13 @@ def merge_linear(linears: List[torch.nn.Linear]) -> torch.nn.Linear: new_linear.in_features = new_weight.size(1) new_linear.out_features = new_weight.size(0) return new_linear + + +def reshape_lm_head_input(x): + if x.dim() > 3: + x = x.reshape([-1, x.shape[-2], x.shape[-1]]) + shape = list(x.size()) + if shape[1] > 10: + shape[1] = 1 + x = x[:, -1, :].view(shape) + return x diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py 
b/python/llm/src/ipex_llm/transformers/npu_models/convert.py index cd4b5fed346..d2df29771b6 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py @@ -31,7 +31,7 @@ def module_optimization(func) -> torch.nn.Module: torch.nn.Module: optimized module """ - def wrapper(model: torch.nn.Module, qtype, device, *args, **kwargs): + def wrapper(model: torch.nn.Module, qtype, device, modules_to_not_convert, *args, **kwargs): """Recursively apply the optimization function. Args: @@ -41,29 +41,31 @@ def wrapper(model: torch.nn.Module, qtype, device, *args, **kwargs): """ for name, layer in model.named_children(): - new_layer = func(layer, qtype, device, *args, **kwargs) - if new_layer: - model.add_module(name, new_layer) - wrapper(new_layer, qtype, device, *args, **kwargs) - else: - wrapper(layer, qtype, device, *args, **kwargs) + if name not in modules_to_not_convert: + new_layer = func(layer, qtype, device, modules_to_not_convert, *args, **kwargs) + if new_layer: + model.add_module(name, new_layer) + wrapper(new_layer, qtype, device, modules_to_not_convert, *args, **kwargs) + else: + wrapper(layer, qtype, device, modules_to_not_convert, *args, **kwargs) return wrapper @module_optimization -def replace_with_QuantizedLinear(layer, qtype, device): +def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert): from ipex_llm.transformers.low_bit_linear import ggml_convert_qtype from ipex_llm.ggml.quantize import ggml_tensor_qtype iqtype = ggml_tensor_qtype[qtype] - if isinstance(layer, torch.nn.Linear): + if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"): if qtype == "sym_int4_rtn": # workaround for qwen2 & int4 if (layer.in_features == 3584 and layer.out_features == 152064) or \ (layer.in_features == 18944 and layer.out_features == 3584): qtype = "sym_int8_rtn" iqtype = ggml_tensor_qtype[qtype] - qweights, scale = ggml_convert_qtype(layer.weight.data, iqtype, device=device) + qweights, scale = ggml_convert_qtype(layer.weight.data.to(torch.float32), + iqtype, device=device) return QuantizedLinear(qweights, scale, layer.bias) @@ -79,15 +81,14 @@ def optimize_llm(model: torch.nn.Module): if model.config.model_type == "llama": from ipex_llm.transformers.npu_models.llama import merge_qkv from ipex_llm.transformers.npu_models.llama import merge_mlp - model.apply(merge_qkv) - model.apply(merge_mlp) - from ipex_llm.transformers.npu_models.llama import llama_model_forward from ipex_llm.transformers.npu_models.llama import llama_attention_forward from ipex_llm.transformers.npu_models.llama import llama_mlp_forward from transformers.models.llama.modeling_llama import LlamaModel from transformers.models.llama.modeling_llama import LlamaAttention from transformers.models.llama.modeling_llama import LlamaMLP + model.apply(merge_qkv) + model.apply(merge_mlp) convert_forward(model, LlamaModel, llama_model_forward) convert_forward(model, LlamaAttention, llama_attention_forward) convert_forward(model, LlamaMLP, llama_mlp_forward) @@ -207,3 +208,28 @@ def optimize_llm(model: torch.nn.Module): from ipex_llm.transformers.npu_models.phi3 import phi3_attention_forward convert_forward(model, module.Phi3Attention, phi3_attention_forward) + + +def optimize_llm_post(model: torch.nn.Module): + # experimental support for fused decoderlayer implementation + if model.config.model_type == "llama": + model.model.embed_tokens.to(torch.float32) + model.model.norm.to(torch.float32) + model.lm_head.to(torch.float32) + + 
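The reworked `module_optimization` wrapper above skips any child whose attribute name appears in `modules_to_not_convert` (the whole subtree is left untouched), which is what lets callers keep selected layers out of the NPU `QuantizedLinear` replacement. A toy sketch of the behaviour, with made-up module names:

    import torch
    from ipex_llm.transformers.npu_models.convert import replace_with_QuantizedLinear

    class Toy(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.q_proj = torch.nn.Linear(64, 64, bias=False)
            self.lm_head = torch.nn.Linear(64, 128, bias=False)

    toy = Toy()
    replace_with_QuantizedLinear(toy, "sym_int4_rtn", device="cpu",
                                 modules_to_not_convert=["lm_head"])
    # toy.q_proj is now a QuantizedLinear; toy.lm_head stays a plain nn.Linear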
from ipex_llm.transformers.low_bit_linear import LowBitLinear, \ + ggml_tensor_qtype, FP4Params + + if isinstance(model.lm_head, torch.nn.Linear): + new_linear = LowBitLinear(model.lm_head.in_features, + model.lm_head.out_features, + ggml_tensor_qtype["sym_int4"], + False) + paramsLowBit = FP4Params(data=model.lm_head.weight.data, + requires_grad=False, + quantized=False, + _shape=None, + qtype=ggml_tensor_qtype["sym_int4"], + in_features=model.lm_head.in_features).to("cpu") + new_linear._parameters['weight'] = paramsLowBit + model.lm_head = new_linear diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py new file mode 100644 index 00000000000..0c70bf635b0 --- /dev/null +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py @@ -0,0 +1,230 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import torch +import importlib +from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params + + +def convert_forward(m, target_m, new_forward): + if m.__class__ == target_m: + bound_method = new_forward.__get__(m, m.__class__) + setattr(m, "forward", bound_method) + for _, sub_m in m.named_children(): + convert_forward(sub_m, target_m, new_forward) + + +def optimize_llm_pre(model: torch.nn.Module, qtype): + if model.config.model_type == "baichuan": + # process NormHead module in Baichuan2 7B + if hasattr(model, 'lm_head') and model.lm_head is not None: + vocab_size, hidden_size = model.lm_head.weight.shape + lm_head_weight_data = model.lm_head.weight.data + model.lm_head = torch.nn.Linear(hidden_size, vocab_size, bias=False, + device=lm_head_weight_data.device) + if model.lm_head.weight.data.device != "meta": + norm_weight = torch.nn.functional.normalize(lm_head_weight_data) + model.lm_head.weight.data = norm_weight + if model.config.hidden_size in [4096, 2048]: + from ipex_llm.transformers.models.baichuan import pre_compute_inv_freq + model.apply(pre_compute_inv_freq) + + # MiniCPM-V 2.6 and minicpm-2b must put lm_head on CPU now + cpu_lm_head = ( + (model.config.model_type == "minicpmv" and model.config.hidden_size == 3584 and + model.config.vocab_size == 151666) + or ( + model.config.model_type == "minicpm" and model.config.num_hidden_layers == 40 + ) + or os.environ.get("IPEX_LLM_CPU_LM_HEAD", "0") != "0" + ) + + if model.config.model_type == "minicpmv" and hasattr(model, "llm"): + # MiniCPM-V + if model.config.hidden_size == 2304 and model.config.vocab_size == 122753: + # MiniCPM-V 2 + model.llm.config.model_type = "minicpm" + elif model.config.hidden_size == 3584 and model.config.vocab_size == 151666: + # MiniCPM-V 2.6 + model.llm.config.model_type = "qwen2" + elif model.config.hidden_size == 4096 and model.config.vocab_size == 128256: + # MiniCPM-V 2.5 + model.llm.config.model_type = "llama" + model = model.llm + + if model.config.model_type == "qwen2": + from ipex_llm.transformers.npu_models.qwen2_mp import split_mlp_down_proj + from 
ipex_llm.transformers.npu_models.qwen2_mp import split_mlp_forward + model.apply(split_mlp_down_proj) + + # lm_head to cpu optimization + if cpu_lm_head: + # disable the optimization by default + from ipex_llm.transformers.low_bit_linear import SYM_INT4, SYM_INT8 + if qtype == "sym_int4_rtn": + lm_qtype = SYM_INT4 + else: + lm_qtype = SYM_INT8 + # lm_head opt to mp opt (llama, qwen2) + optimize_lm_head = model.config.model_type not in ["llama", "qwen2"] + new_linear = LowBitLinear(model.lm_head.in_features, + model.lm_head.out_features, + lm_qtype, + False, + optimize_lm_head=optimize_lm_head) + paramsLowBit = FP4Params(data=model.lm_head.weight.data, + requires_grad=False, + quantized=False, + _shape=None, + qtype=lm_qtype, + in_features=model.lm_head.in_features).to("cpu") + new_linear._parameters['weight'] = paramsLowBit + model.lm_head = new_linear + + +def optimize_llm( + model: torch.nn.Module, + max_output_len=1024, + max_prompt_len=1024, + inter_pp=None, + intra_pp=None, + transpose_value_cache=True, +): + if model.config.model_type == "llama": + if intra_pp is None: + intra_pp = 2 + if inter_pp is None: + inter_pp = 2 + + from ipex_llm.transformers.npu_models.llama_mp import gen_llama_fused_model_forward + from ipex_llm.transformers.npu_models.llama_mp import DecodeRunner, PrefillRunner + from transformers.models.llama.modeling_llama import LlamaModel + + decode_runner = DecodeRunner( + model, + max_seq_len=max_output_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_value_cache, + ) + prefill_runner = PrefillRunner( + model, + max_output_len=max_output_len, + max_prompt_len=max_prompt_len, + transpose_value_cache=transpose_value_cache, + ) + llama_model_forward = gen_llama_fused_model_forward( + prefill_runner=prefill_runner, decode_runner=decode_runner + ) + convert_forward(model, LlamaModel, llama_model_forward) + from transformers.models.llama.modeling_llama import LlamaForCausalLM + from ipex_llm.transformers.npu_models.llama_mp import llama2_casullm_forward + convert_forward(model, LlamaForCausalLM, llama2_casullm_forward) + elif model.config.model_type == "qwen2" and model.config.num_hidden_layers == 28: + # for qwen2-1.5B and qwen2-7B + if intra_pp is None: + intra_pp = 2 + if inter_pp is None: + inter_pp = 4 if model.config.intermediate_size == 18944 else 1 + + from ipex_llm.transformers.npu_models.qwen2_mp import gen_qwen2_fused_model_forward + from ipex_llm.transformers.npu_models.qwen2_mp import DecodeRunner, PrefillRunner + from transformers.models.qwen2.modeling_qwen2 import Qwen2Model + + decode_runner = DecodeRunner( + model, + max_seq_len=max_output_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_value_cache, + ) + prefill_runner = PrefillRunner( + model, + max_output_len=max_output_len, + max_prompt_len=max_prompt_len, + transpose_value_cache=transpose_value_cache, + ) + qwen2_model_forward = gen_qwen2_fused_model_forward( + prefill_runner=prefill_runner, decode_runner=decode_runner + ) + convert_forward(model, Qwen2Model, qwen2_model_forward) + from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM + from ipex_llm.transformers.npu_models.qwen2_mp import qwen2_casullm_forward + convert_forward(model, Qwen2ForCausalLM, qwen2_casullm_forward) + elif model.config.model_type == "minicpm": + # for minicpm-1b + if intra_pp is None: + intra_pp = 2 + if inter_pp is None: + inter_pp = 2 + + from ipex_llm.transformers.npu_models.minicpm_mp import gen_minicpm_fused_model_forward + from 
ipex_llm.transformers.npu_models.minicpm_mp import DecodeRunner, PrefillRunner + + modeling_module_name = model.__class__.__module__ + module = importlib.import_module(modeling_module_name) + + if model.config.num_hidden_layers == 52: + # for minicpm-1b + transpose_cache = transpose_value_cache + elif model.config.num_hidden_layers == 40: + # for minicpm-2b + transpose_cache = False + + decode_runner = DecodeRunner( + model, + max_seq_len=max_output_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_cache, + ) + prefill_runner = PrefillRunner( + model, + max_output_len=max_output_len, + max_prompt_len=max_prompt_len, + transpose_value_cache=transpose_cache, + ) + minicpm_model_forward = gen_minicpm_fused_model_forward( + prefill_runner=prefill_runner, decode_runner=decode_runner + ) + convert_forward(model, module.MiniCPMModel, minicpm_model_forward) + elif model.config.model_type == "baichuan" and model.config.num_hidden_layers == 32: + # for Baichuan2-7B + if intra_pp is None: + intra_pp = 2 + if inter_pp is None: + inter_pp = 2 + from ipex_llm.transformers.npu_models.baichuan_mp import gen_baichuan_fused_model_forward + from ipex_llm.transformers.npu_models.baichuan_mp import DecodeRunner, PrefillRunner + decode_runner = DecodeRunner( + model, + max_seq_len=max_output_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_value_cache, + ) + prefill_runner = PrefillRunner( + model, + max_output_len=max_output_len, + max_prompt_len=max_prompt_len, + transpose_value_cache=transpose_value_cache, + ) + baichuan_model_forward = gen_baichuan_fused_model_forward( + prefill_runner=prefill_runner, decode_runner=decode_runner + ) + modeling_module_name = model.__class__.__module__ + module = importlib.import_module(modeling_module_name) + convert_forward(model, module.BaichuanModel, baichuan_model_forward) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/kv.py b/python/llm/src/ipex_llm/transformers/npu_models/kv.py new file mode 100644 index 00000000000..4f112a1ca86 --- /dev/null +++ b/python/llm/src/ipex_llm/transformers/npu_models/kv.py @@ -0,0 +1,232 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + +import torch +from typing import Optional, Dict, Tuple, Any +from transformers.cache_utils import DynamicCache +import sys + + +def init_fused_kv_cache( + batch_size, num_heads, head_dim, current_length, max_length, dtype, device, tranpose_value=False +): + if not tranpose_value: + key_cache_storage = torch.zeros( + batch_size, num_heads, max_length, head_dim, dtype=dtype, device=device + ) + value_cache_storage = torch.zeros( + batch_size, num_heads, max_length, head_dim, dtype=dtype, device=device + ) + + key_cache = key_cache_storage.as_strided( + (batch_size, num_heads, current_length, head_dim), + key_cache_storage.stride(), + storage_offset=0, + ) + value_cache = value_cache_storage.as_strided( + (batch_size, num_heads, current_length, head_dim), + value_cache_storage.stride(), + storage_offset=0, + ) + return key_cache, value_cache + else: + key_cache_storage = torch.zeros( + batch_size, num_heads, max_length, head_dim, dtype=dtype, device=device + ) + value_cache_storage = torch.zeros( + batch_size, num_heads, head_dim, max_length, dtype=dtype, device=device + ) + + key_cache = key_cache_storage.as_strided( + (batch_size, num_heads, current_length, head_dim), + key_cache_storage.stride(), + storage_offset=0, + ) + value_cache = value_cache_storage.as_strided( + (batch_size, num_heads, head_dim, current_length), + value_cache_storage.stride(), + storage_offset=0, + ) + return key_cache, value_cache.transpose(-1, -2) + + +def append_fused_kv_cache(cache_k, cache_v, key_states, value_states, transpose_value=False): + if not transpose_value: + new_size = ( + cache_k.size(0), + cache_k.size(1), + cache_k.size(2) + key_states.size(2), + cache_k.size(3), + ) + new_cache_k = cache_k.as_strided(new_size, cache_k.stride(), storage_offset=0) + new_cache_k[:, :, cache_k.size(2):cache_k.size(2) + key_states.size(2), :] = key_states + new_cache_v = cache_v.as_strided(new_size, cache_v.stride(), storage_offset=0) + new_cache_v[:, :, cache_v.size(2):cache_v.size(2) + key_states.size(2), :] = value_states + return new_cache_k, new_cache_v + else: + new_size_key = ( + cache_k.size(0), + cache_k.size(1), + cache_k.size(2) + key_states.size(2), + cache_k.size(3), + ) + new_cache_k = cache_k.as_strided(new_size_key, cache_k.stride(), storage_offset=0) + new_cache_k[:, :, cache_k.size(2):cache_k.size(2) + key_states.size(2), :] = key_states + + new_size_value = ( + cache_v.size(0), + cache_v.size(1), + cache_v.size(3), + cache_v.size(2) + value_states.size(3), + ) + raw_cache_v = cache_v.transpose(-1, -2) + new_cache_v = raw_cache_v.as_strided(new_size_value, raw_cache_v.stride(), storage_offset=0) + start = raw_cache_v.size(3) + end = raw_cache_v.size(3) + value_states.size(3) + new_cache_v[:, :, :, start:end] = value_states + return new_cache_k, new_cache_v.transpose(-1, -2) + + +def expand_fused_kv_cache(cache_k, cache_v, transpose_value=False): + if not transpose_value: + new_size = (cache_k.size(0), cache_k.size(1), cache_k.size(2) + 1, cache_k.size(3)) + new_cache_k = cache_k.as_strided(new_size, cache_k.stride(), storage_offset=0) + new_cache_v = cache_v.as_strided(new_size, cache_v.stride(), storage_offset=0) + return new_cache_k, new_cache_v + else: + new_size_key = (cache_k.size(0), cache_k.size(1), cache_k.size(2) + 1, cache_k.size(3)) + new_cache_k = cache_k.as_strided(new_size_key, cache_k.stride(), storage_offset=0) + new_size_value = ( + cache_v.size(0), + cache_v.size(1), + cache_v.size(3), + cache_v.size(2) + 1, + ) + raw_cache_v = cache_v.transpose(-1, -2) + new_cache_v = 
raw_cache_v.as_strided(new_size_value, raw_cache_v.stride(), storage_offset=0) + return new_cache_k, new_cache_v.transpose(-1, -2) + + +def shrink_fused_kv_cache(cache_k, cache_v, new_seq_len, transpose_value=False): + if not transpose_value: + new_size = (cache_k.size(0), cache_k.size(1), new_seq_len, cache_k.size(3)) + new_cache_k = cache_k.as_strided(new_size, cache_k.stride(), storage_offset=0) + new_cache_v = cache_v.as_strided(new_size, cache_v.stride(), storage_offset=0) + return new_cache_k, new_cache_v + else: + new_size_key = (cache_k.size(0), cache_k.size(1), new_seq_len, cache_k.size(3)) + new_cache_k = cache_k.as_strided(new_size_key, cache_k.stride(), storage_offset=0) + new_size_value = ( + cache_v.size(0), + cache_v.size(1), + cache_v.size(3), + new_seq_len, + ) + raw_cache_v = cache_v.transpose(-1, -2) + new_cache_v = raw_cache_v.as_strided(new_size_value, raw_cache_v.stride(), storage_offset=0) + return new_cache_k, new_cache_v.transpose(-1, -2) + + +class DynamicFusedNormalCache(DynamicCache): + # Experimental support for fused decoderlayer implementation on NPU + # Currently only for llama2 + + def __init__(self) -> None: + self.key_cache: Dict[int, torch.Tensor] = {} + self.value_cache: Dict[int, torch.Tensor] = {} + self.min_layer_idx = sys.maxsize + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]]=None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + + batch_size, num_heads, seq_len, head_dim = key_states.shape + + max_seq_length = cache_kwargs["max_seq_len"] if "max_seq_len" in cache_kwargs else None + transpose_value = cache_kwargs["transpose"] if "transpose" in cache_kwargs else False + + # Update the cache + # if len(self.key_cache) <= layer_idx: + if layer_idx not in self.key_cache: + max_len = max_seq_length + k_cache, v_cache = init_fused_kv_cache( + batch_size, + num_heads, + head_dim, + 0, + max_len, + # key_states.dtype, + torch.float16, + key_states.device, + tranpose_value=transpose_value, + ) + k_cache, v_cache = append_fused_kv_cache( + k_cache, v_cache, key_states, value_states, transpose_value=transpose_value + ) + + self.key_cache[layer_idx] = k_cache + self.value_cache[layer_idx] = v_cache + else: + k_cache = self.key_cache[layer_idx] + v_cache = self.value_cache[layer_idx] + + kv_seq_len = k_cache.size(2) + key_states.size(2) + k_cache, v_cache = append_fused_kv_cache( + k_cache, v_cache, key_states, value_states, transpose_value=transpose_value + ) + self.key_cache[layer_idx] = k_cache + self.value_cache[layer_idx] = v_cache + + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. 
+ A layer index can be optionally passed.""" + + for idx, layer in self.key_cache.items(): + return layer.shape[-2] + return 0 + + def expand(self, transpose_value=True): + for idx, layer in self.key_cache.items(): + key_cache, value_cache = expand_fused_kv_cache( + self.key_cache[idx], + self.value_cache[idx], + transpose_value=transpose_value, + ) + self.key_cache[idx] = key_cache + self.value_cache[idx] = value_cache + + def shrink(self, new_seq_len, transpose_value=True): + for idx, layer in self.key_cache.items(): + key_cache, value_cache = shrink_fused_kv_cache( + self.key_cache[idx], self.value_cache[idx], new_seq_len, transpose_value + ) + self.key_cache[idx] = key_cache + self.value_cache[idx] = value_cache + + @property + def _seen_tokens(self): + return self.get_seq_length() + + @property + def seen_tokens(self): + return self.get_seq_length() diff --git a/python/llm/src/ipex_llm/transformers/npu_models/llama_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/llama_mp.py new file mode 100644 index 00000000000..46c4236f2f1 --- /dev/null +++ b/python/llm/src/ipex_llm/transformers/npu_models/llama_mp.py @@ -0,0 +1,1025 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import torch +import time + +from typing import Optional, Sequence, List, Union, Any, Tuple +import numpy as np + +from transformers.cache_utils import Cache +from ipex_llm.utils.common import invalidInputError +from typing import Optional, List, Generator +import uuid +from functools import partial +import torch.nn.functional as F +import torch.nn.parallel +import torch.distributed as dist + +from transformers.utils import logging + +logger = logging.get_logger(__name__) +from colorama import Fore, Back, Style +import torch.multiprocessing as mp +from transformers.cache_utils import Cache +from transformers.modeling_outputs import BaseModelOutputWithPast +from ipex_llm.transformers.npu_models.mp_models_base import run_model +from ipex_llm.transformers.npu_models.mp_models_base import LLMBaseNNFactory +from ipex_llm.transformers.npu_models.common import reshape_lm_head_input +from transformers.modeling_outputs import CausalLMOutputWithPast +from torch.nn import CrossEntropyLoss + + +class LowBitLlamaMultiDecoderlayer(LLMBaseNNFactory): + def __init__( + self, + # batch_size: int, + # seq_len: int, + # hidden_size: int, + hidden_shape: Sequence[int], + *shapes, + num_heads: int, + num_key_value_heads: int, + num_layers: int, + cached_cos, + cached_sin, + input_layernorm_weights=None, + post_attn_layernorm_weights=None, + mode: str = "prefill", + dtype: np.dtype = np.int8, + max_seq_len: int = 1024, + transpose_value: bool = False, + profile: bool = False, + device: str = "NPU", + rms_norm_eps, + intermediate_size, + ): + super().__init__(max_seq_len=max_seq_len, + transpose_value=transpose_value, + dtype=dtype, + profile=profile, + device=device) + self.max_seq_len = max_seq_len + self.intermediate_size = intermediate_size + self.dtype = dtype + self.cached_cos 
= cached_cos + self.cached_sin = cached_sin + self.batch_size, self.seq_len, self.hidden_size = hidden_shape + self.mode = mode + self.rms_norm_eps = rms_norm_eps + self.transpose_value = transpose_value + self.num_layers = num_layers + + cos = self.constant(self.cached_cos) + self.cos = self.unsqueeze(cos, axis=0) + + sin = self.constant(self.cached_sin) + self.sin = self.unsqueeze(sin, axis=0) + + if mode == "decode": + self.kv_seq_len = self.max_seq_len + 1 + else: + self.kv_seq_len = self.seq_len + + self.num_heads = num_heads + self.num_key_value_heads = num_key_value_heads + + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + + # define input, the order self.parameter matters + input = self.create_input_op((self.batch_size, self.seq_len, self.hidden_size)) + + # Self Attention + if mode == "decode": + attention_mask = self.create_input_op((self.batch_size, 1, 1, self.max_seq_len + 1)) + else: + attention_mask = self.create_input_op((self.batch_size, 1, self.seq_len, self.seq_len)) + + position_ids = self.create_input_op((self.batch_size, self.seq_len)) + past_keys = [] + past_values = [] + if mode == "decode": + for i in range(num_layers): + past_key = self.create_cache_op( + (self.batch_size, self.num_key_value_heads, self.max_seq_len, self.head_dim) + ) + if transpose_value: + past_value = self.create_cache_op( + (self.batch_size, self.num_key_value_heads, self.head_dim, self.max_seq_len) + ) + else: + past_value = self.create_cache_op( + (self.batch_size, self.num_key_value_heads, self.max_seq_len, self.head_dim) + ) + past_keys.append(past_key) + past_values.append(past_value) + else: + past_keys = [None] * num_layers + past_values = [None] * num_layers + + if input_layernorm_weights is None: + input_layernorm_weights = [] + post_attn_layernorm_weights = [] + for i in range(num_layers): + input_layernorm_weights.append( + self.create_input_op( + ( + 1, + self.hidden_size, + ) + ) + ) + post_attn_layernorm_weights.append( + self.create_input_op( + ( + 1, + self.hidden_size, + ) + ) + ) + else: + input_layernorm_weights = [self.constant(w) for w in input_layernorm_weights] + post_attn_layernorm_weights = [self.constant(w) for w in post_attn_layernorm_weights] + + hidden_states = input + + curr_key_values = [] + for i in range(num_layers): + hidden_states, new_key_states, new_value_states = self.build_decoder( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + input_layernorm_weight=input_layernorm_weights[i], + post_attention_layernorm_weight=post_attn_layernorm_weights[i], + past_key=past_keys[i], + past_value=past_values[i], + ) + curr_key_values.append((new_key_states, new_value_states)) + + # define outputs + hidden_states = self.convert_to_fp16(hidden_states) + + for i in range(num_layers): + new_key_states = self.convert_to_fp16(curr_key_values[i][0]) + new_value_states = self.convert_to_fp16(curr_key_values[i][1]) + + print("start compiling") + self.compile() + + def build_decoder( + self, + hidden_states, + attention_mask, + position_ids, + input_layernorm_weight, + post_attention_layernorm_weight, + past_key=None, + past_value=None, + ): + + residual = hidden_states + input_2d = self.reshape(hidden_states, (self.batch_size * self.seq_len, self.hidden_size)) + input_2d = self.layer_norm(input_2d, input_layernorm_weight) + attn_output, new_key_states, new_value_states = self.attention( + hidden_states=input_2d, + position_ids=position_ids, + 
attention_mask=attention_mask, + past_key=past_key, + past_value=past_value, + cos=self.cos, + sin=self.sin, + mode=self.mode, + num_heads=self.num_heads, + num_key_value_heads=self.num_key_value_heads, + head_dim=self.head_dim, + seq_len=self.seq_len, + ) + hidden_states = self.eltwise_add(residual, attn_output) + residual = hidden_states + hidden_states = self.layer_norm(hidden_states, post_attention_layernorm_weight) + hidden_states = self.mlp(hidden_states) + hidden_states = self.eltwise_add(residual, hidden_states) + hidden_states = self.convert_to_fp16(hidden_states) + + return hidden_states, new_key_states, new_value_states + + +class FusedLlamaLowBitMultiDecoderlayer(torch.nn.Module): + + def __init__( + self, + parameters: List[Tuple[torch.Tensor]], + input_laynorm_weights: List[torch.Tensor], + post_attn_layernorm_weights: List[torch.Tensor], + layer_indexes: List[int], + intra_stages: int, + cached_cos: torch.Tensor, + cached_sin: torch.Tensor, + num_heads: int, + head_dim: int, + num_key_value_heads: int, + rms_norm_eps, + intermediate_size, + max_seq_len: int = 1024, + transpose_value: bool = False, + do_print: bool = False, + ): + super().__init__() + + self.do_print = do_print + + op_parameters = [] + for w in parameters: + if isinstance(w, tuple): # from QuantizedLinear + op_parameters.append((w[0].numpy(), w[1].numpy())) + else: + op_parameters.append(w.to(torch.float16).numpy()) + self.op_parameters = op_parameters + self.op_id = str(uuid.uuid4()) + self.max_seq_len = max_seq_len + self.transpose_value = transpose_value + if isinstance(parameters[0], tuple): + np_dtype = np.int8 if parameters[0][0].dtype == torch.int8 else np.uint8 + else: # FP16 Linear + np_dtype = np.float16 + + self.intra_stages = intra_stages + self.layer_indexes = layer_indexes + num_layers = len(self.layer_indexes) // intra_stages + self.layer_ranges = [] + for i in range(intra_stages): + if i == intra_stages - 1: + self.layer_ranges.append((i * num_layers, len(self.layer_indexes))) + else: + self.layer_ranges.append((i * num_layers, (i + 1) * num_layers)) + + self.backend_decoders = [] + + for i in range(intra_stages): + start, end = self.layer_ranges[i] + lm_0 = input_laynorm_weights[start:end] + lm_1 = post_attn_layernorm_weights[start:end] + decoder = LowBitLlamaMultiDecoderlayer( + [1, 1, num_heads * head_dim], + input_layernorm_weights=lm_0, + post_attn_layernorm_weights=lm_1, + cached_cos=cached_cos, + cached_sin=cached_sin, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + num_layers=end - start, + max_seq_len=max_seq_len, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + mode="decode", + transpose_value=self.transpose_value, + dtype=np_dtype, + ) + self.backend_decoders.append(decoder) + + for i in range(intra_stages): + start, end = self.layer_ranges[i] + self.backend_decoders[i].set_weights(self.op_id, op_parameters[start * 7:end * 7]) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> torch.Tensor: + + inputs = ( + hidden_states.to(torch.float16), + attention_mask, + position_ids, + ) + + for i in range(self.intra_stages): + start, end = self.layer_ranges[i] + self.backend_decoders[i].update_cache(past_key_value, self.layer_indexes[start:end]) + + hidden_states, new_keys, 
new_values = LowBitLlamaMultiDecoderlayer.run_decoders( + inputs, + decoders=self.backend_decoders) + + if self.do_print: + print("outputs:", hidden_states) + + outputs = (hidden_states,) + outputs += (past_key_value, new_keys, new_values) + return outputs + + def post_forward(self, past_key_value, new_keys, new_values, cache_position): + key_value_states = [] + for i in range(self.intra_stages): + for j in range(1, len(self.backend_decoders[i].torch_out)): + key_value_states.append(self.backend_decoders[i].torch_out[j]) + + cache_kwargs = { + "cache_position": cache_position, + "max_seq_len": self.max_seq_len, + "transpose": self.transpose_value, + } + for i in range(len(self.layer_indexes)): + key_states, value_states = past_key_value.update( + new_keys[i], + new_values[i], + self.layer_indexes[i], + cache_kwargs, + ) + + for i in range(self.intra_stages): + self.backend_decoders[i].load_cache_async() + + +class FusedLlamaLowBitDecoderlayer(torch.nn.Module): + """LLAMA MLP operation NPU backend.""" + + def __init__( + self, + parameters: List[torch.Tensor], + cached_cos, + cached_sin, + layer_norm_0, + layer_norm_1, + num_heads: int, + num_key_value_heads: int, + layer_idx: int, + rms_norm_eps, + intermediate_size, + max_seq_len: int = 128, + transpose_value: bool = False, + ): + super().__init__() + self.op_parameters = parameters + self.op_id = str(uuid.uuid4()) + self.layer_idx = layer_idx + self.max_seq_len = max_seq_len + self.transpose_value = transpose_value + # self.rotary_emb = rotary_emb + if isinstance(parameters[0], tuple): # weight, scale from QuantizedLinear + np_dtype = np.int8 if parameters[0][0].dtype == torch.int8 else np.uint8 + else: # FP16 Linear + np_dtype = np.float16 + + self.backend_cls_prefill = partial( + LowBitLlamaMultiDecoderlayer, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + num_layers=1, + cached_cos=cached_cos, + cached_sin=cached_sin, + input_layernorm_weights=None, + post_attn_layernorm_weights=None, + max_seq_len=max_seq_len, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + mode="prefill", + transpose_value=self.transpose_value, + dtype=np_dtype, + ) + self.layer_norm_0 = layer_norm_0 + self.layer_norm_1 = layer_norm_1 + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> torch.Tensor: + """Torch module forward method. 
+ + Args: + x (torch.Tensor): Input tensor + + Returns: + torch.Tensor: result + """ + + seq_len = hidden_states.shape[1] + + backend_cls = self.backend_cls_prefill + inputs = (hidden_states.to(torch.float16), attention_mask, position_ids) + inputs += (self.layer_norm_0, self.layer_norm_1) + hidden_states, past_key, past_value = run_model( + inputs, self.op_parameters, backend_cls, self.op_id, replica=2 + ) + cache_kwargs = { + "cache_position": cache_position, + "max_seq_len": self.max_seq_len, + "transpose": self.transpose_value, + } + key_states, value_states = past_key_value.update( + past_key, past_value, self.layer_idx, cache_kwargs + ) + + outputs = (hidden_states,) + outputs += (past_key_value,) + return outputs + + +def run_decode( + model, + rank, + world_size, + port, + layer_start, + layer_end, + intra_stages, + max_seq_len, + transpose_value_cache, + input_queue, + result_queue, +): + + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = port + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + + print("start init process group, rank: ", rank, "world_size: ", world_size) + + dist.init_process_group() + my_rank = dist.get_rank() + my_size = dist.get_world_size() + logger.info(f"rank: {my_rank}, size: {my_size}") + + num_heads = model.model.layers[layer_start].self_attn.num_heads + num_key_value_heads = model.model.layers[layer_start].self_attn.num_key_value_heads + head_dim = model.model.layers[layer_start].self_attn.head_dim + rms_norm_eps = model.config.rms_norm_eps + intermediate_size = model.config.intermediate_size + deocderlayers = [] + layer_weights = [] + input_layer_norm_weights = [] + post_attn_layernorm_weights = [] + layer_indexs = range(layer_start, layer_end) + for layer_idx in layer_indexs: + curr_layer = model.model.layers[layer_idx] + attn_layer = curr_layer.self_attn + mlp_layer = curr_layer.mlp + + weights = [ + (attn_layer.q_proj.weight, attn_layer.q_proj.scale), + (attn_layer.k_proj.weight, attn_layer.k_proj.scale), + (attn_layer.v_proj.weight, attn_layer.v_proj.scale), + (attn_layer.o_proj.weight, attn_layer.o_proj.scale), + (mlp_layer.gate_proj.weight, mlp_layer.gate_proj.scale), + (mlp_layer.up_proj.weight, mlp_layer.up_proj.scale), + (mlp_layer.down_proj.weight, mlp_layer.down_proj.scale), + ] + + cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) + cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) + layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16) + layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16) + + layer_weights.extend(weights) + input_layer_norm_weights.append(layer_norm_0) + post_attn_layernorm_weights.append(layer_norm_1) + + multi_decoder = FusedLlamaLowBitMultiDecoderlayer( + parameters=layer_weights, + input_laynorm_weights=input_layer_norm_weights, + post_attn_layernorm_weights=post_attn_layernorm_weights, + layer_indexes=layer_indexs, + intra_stages=intra_stages, + cached_cos=cached_cos, + cached_sin=cached_sin, + num_heads=num_heads, + head_dim=head_dim, + num_key_value_heads=num_key_value_heads, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + max_seq_len=max_seq_len, + transpose_value=transpose_value_cache, + do_print=False, + ) + + dist.barrier() + + past_key_values = None + + control = torch.empty((), dtype=torch.int) + hidden_states = torch.empty((1, 1, head_dim * num_heads), dtype=torch.float16) + with torch.inference_mode(): + while True: + + dist.broadcast(control, src=0) + if 
control.item() == -2: + break + elif control.item() == -1: + past_key_values = input_queue.get() + else: + t0 = time.perf_counter() + past_seen_tokens = past_key_values.get_seq_length() + attention_mask = torch.ones([1, past_seen_tokens + 1], dtype=torch.int64) + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + 1, device=hidden_states.device + ) + + position_ids = position_ids = cache_position.unsqueeze(0) + causal_mask = model.model._update_causal_mask( + attention_mask, hidden_states, cache_position, past_seen_tokens + ) + pad_len = multi_decoder.max_seq_len + 1 - causal_mask.size(-1) + + pad_mask = (0, pad_len) + padded_causal_mask = F.pad( + causal_mask.to(torch.float16), pad_mask, value=torch.finfo(torch.float16).min + ) + padded_causal_mask[:, :, :, -1] = 0.0 + dist.recv(hidden_states, src=rank - 1) + t1 = time.perf_counter() + layer_outputs = multi_decoder( + hidden_states, + attention_mask=padded_causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=False, + use_cache=True, + cache_position=cache_position, + ) + t2 = time.perf_counter() + hidden_states = layer_outputs[0] + t3 = time.perf_counter() + dist.send(hidden_states, dst=(rank + 1) % world_size) + t4 = time.perf_counter() + past_key_values = layer_outputs[1] + new_keys = layer_outputs[2] + new_values = layer_outputs[3] + multi_decoder.post_forward(past_key_values, new_keys, new_values, cache_position) + + +class DecodeRunner: + def __init__(self, model, max_seq_len, intra_pp=2, inter_pp=2, transpose_value_cache=True): + self.model = model + self.max_seq_len = max_seq_len + self.transpose_value_cache = transpose_value_cache + world_size = inter_pp + 1 + intra_stages = intra_pp + num_layers = self.model.model.config.num_hidden_layers + + port = "54791" + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = port + os.environ["RANK"] = "0" + os.environ["WORLD_SIZE"] = str(world_size) + + self.input_queues = [] + self.output_queues = [] + self.decoder_processes = [] + + for rank in range(1, world_size): + input_q = mp.Queue() + output_q = mp.Queue() + start_layer = (rank - 1) * (num_layers // (world_size - 1)) + end_layer = (rank) * (num_layers // (world_size - 1)) + if rank == world_size - 1: + end_layer = num_layers + p = mp.Process( + target=run_decode, + args=( + self.model, + rank, + world_size, + port, + start_layer, + end_layer, + intra_stages, + self.max_seq_len, + self.transpose_value_cache, + input_q, + output_q, + ), + ) + p.daemon = True + p.start() + self.input_queues.append(input_q) + self.output_queues.append(output_q) + self.decoder_processes.append(p) + + dist.init_process_group() + my_rank = dist.get_rank() + self.world_size = dist.get_world_size() + logger.info(f"rank: {my_rank}, size: {self.world_size}") + + dist.barrier() + self.cache_past_key_value = None + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ): + t0 = time.perf_counter() + + if self.cache_past_key_value != past_key_value: + control = torch.tensor(-1, dtype=torch.int) + dist.broadcast(control, src=0) + for i in range(len(self.decoder_processes)): + self.input_queues[i].put(past_key_value) + + control = torch.tensor(0, dtype=torch.int) + dist.broadcast(control, src=0) + hidden_states = 
hidden_states.to(torch.float16) + dist.send(hidden_states, dst=1) + past_key_value.expand(self.transpose_value_cache) + dist.recv(hidden_states, src=self.world_size - 1) + t1 = time.perf_counter() + return hidden_states, past_key_value + + def shutdown(self): + control = torch.tensor(-2, dtype=torch.int) + dist.broadcast(control, src=0) + for p in self.decoder_processes: + p.join(3) + for p in self.decoder_processes: + if p.exitcode is None: + p.kill() + + def __del__(self): + self.shutdown() + + +def run_prefill( + model, max_output_len, max_prompt_len, transpose_value_cache, input_queue, result_queue +): + + layer_start = 0 + layer_end = len(model.model.layers) + num_heads = model.model.layers[layer_start].self_attn.num_heads + num_key_value_heads = model.model.layers[layer_start].self_attn.num_key_value_heads + head_dim = model.model.layers[layer_start].self_attn.head_dim + rms_norm_eps = model.config.rms_norm_eps + intermediate_size = model.config.intermediate_size + deocderlayers = [] + layer_weights = [] + input_layer_norm_weights = [] + post_attn_layernorm_weights = [] + layer_indexs = range(layer_start, layer_end) + for layer_idx in layer_indexs: + curr_layer = model.model.layers[layer_idx] + attn_layer = curr_layer.self_attn + mlp_layer = curr_layer.mlp + + weights = [ + (attn_layer.q_proj.weight, attn_layer.q_proj.scale), + (attn_layer.k_proj.weight, attn_layer.k_proj.scale), + (attn_layer.v_proj.weight, attn_layer.v_proj.scale), + (attn_layer.o_proj.weight, attn_layer.o_proj.scale), + (mlp_layer.gate_proj.weight, mlp_layer.gate_proj.scale), + (mlp_layer.up_proj.weight, mlp_layer.up_proj.scale), + (mlp_layer.down_proj.weight, mlp_layer.down_proj.scale), + ] + + cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) + cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) + + layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16) + layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16) + + new_decoderlayer = FusedLlamaLowBitDecoderlayer( + weights, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + cached_cos=cached_cos, + cached_sin=cached_sin, + layer_norm_0=layer_norm_0, + layer_norm_1=layer_norm_1, + layer_idx=layer_idx, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + max_seq_len=max_output_len, + transpose_value=transpose_value_cache, + ) + + layer_weights.extend(weights) + input_layer_norm_weights.append(layer_norm_0) + post_attn_layernorm_weights.append(layer_norm_1) + model.model.layers[layer_idx] = new_decoderlayer + deocderlayers.append(new_decoderlayer) + + print("finish creating all decode layers in prefill") + result_queue.put("loading finish") + + while True: + + result = input_queue.get() + if result == "stop": + break + + hidden_states, position_ids, causal_mask, past_key_values, cache_position = result + with torch.inference_mode(): + for decoder_layer in deocderlayers: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=False, + use_cache=True, + cache_position=cache_position, + ) + + hidden_states = layer_outputs[0] + next_decoder_cache = layer_outputs[1] + + result_queue.put((hidden_states, next_decoder_cache)) + + +class PrefillRunner: + def __init__(self, model, max_output_len, max_prompt_len, transpose_value_cache): + self.model = model + self.max_output_len = max_output_len + self.max_prompt_len = max_prompt_len + self.transpose_value_cache 
= transpose_value_cache + + self.prefill_result_queue = mp.Queue() + self.prefill_input_queue = mp.Queue() + + self.p = mp.Process( + target=run_prefill, + args=( + model, + max_output_len, + max_prompt_len, + transpose_value_cache, + self.prefill_input_queue, + self.prefill_result_queue, + ), + ) + self.p.daemon = True + self.p.start() + output = self.prefill_result_queue.get() + print(Fore.GREEN + f"prefill process output: {output}") + print(Style.RESET_ALL) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ): + seq_len = hidden_states.size(1) + invalidInputError( + seq_len <= self.max_prompt_len, + ( + f"seq_len: {seq_len} should be less than or equal" + " to max_prompt_len {self.max_prompt_len}" + ), + ) + pad_len = self.max_prompt_len - seq_len + hidden_states = F.pad(hidden_states.to(torch.float16), (0, 0, 0, pad_len), value=0.0) + position_ids = F.pad(position_ids, (0, pad_len), value=0) + attention_mask = F.pad( + attention_mask.to(torch.float16), + (0, pad_len, 0, pad_len), + value=torch.finfo(torch.float16).min, + ) + + args = (hidden_states, position_ids, attention_mask, past_key_value, cache_position) + self.prefill_input_queue.put(args) + hidden_states, past_key_value = self.prefill_result_queue.get() + past_key_value.shrink(seq_len, self.transpose_value_cache) + hidden_states = hidden_states[:, :seq_len, :] + return hidden_states, past_key_value + + def shutdown(self): + self.prefill_input_queue.put("stop") + self.p.join(3) + if self.p.exitcode is None: + self.p.kill() + + def __del__(self): + self.shutdown() + + +def gen_llama_fused_model_forward(prefill_runner, decode_runner): + + def llama_fused_model_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + t0 = time.perf_counter() + output_attentions = ( + output_attentions if output_attentions is not None else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + msg = ( + "You cannot specify both input_ids and inputs_embeds at the same time," + " and must specify either one" + ) + invalidInputError(False, msg) + + if self.gradient_checkpointing and self.training and use_cache: + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + past_seen_tokens = 0 + + # ipex-llm changes start + from ipex_llm.transformers.npu_models.kv import DynamicFusedNormalCache + + if use_cache and not isinstance(past_key_values, DynamicFusedNormalCache): + past_key_values = 
DynamicFusedNormalCache.from_legacy_cache(past_key_values) + past_seen_tokens = past_key_values.get_seq_length() + + if cache_position is None: + cache_position = torch.arange( + past_seen_tokens, + past_seen_tokens + inputs_embeds.shape[1], + device=inputs_embeds.device, + ) + # ipex-llm changes end + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_seen_tokens + ) + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + seq_len = hidden_states.size(1) + + if seq_len == 1: + layers_runner = decode_runner + else: + layers_runner = prefill_runner + layer_outputs = layers_runner.forward( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + hidden_states = layer_outputs[0] + + next_decoder_cache = layer_outputs[1] + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # ipex-llm changes start + next_cache = next_decoder_cache if use_cache else None + # ipex-llm changes end + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + t1 = time.perf_counter() + # print("fused model forward time: ", t1 - t0) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + return llama_fused_model_forward + + +def llama2_casullm_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, +) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None \ + else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + # ipex-llm change start + hidden_states = reshape_lm_head_input(hidden_states) + # ipex-llm change end + if self.config.pretraining_tp > 1: + lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, + dim=0) + logits = [F.linear(hidden_states, lm_head_slices[i]) + for i 
in range(self.config.pretraining_tp)] + logits = torch.cat(logits, dim=-1) + else: + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/minicpm_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/minicpm_mp.py new file mode 100644 index 00000000000..80abb8bad79 --- /dev/null +++ b/python/llm/src/ipex_llm/transformers/npu_models/minicpm_mp.py @@ -0,0 +1,987 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os +import torch +import time +import argparse + +from ipex_llm.transformers.npu_model import AutoModelForCausalLM +from transformers import AutoTokenizer +from intel_npu_acceleration_library.backend.factory import NNFactory +from typing import Optional, Sequence, List, Union, Any, Tuple +import numpy as np +import math +from intel_npu_acceleration_library.backend.runtime import set_contiguous, record_function +from intel_npu_acceleration_library.backend.runtime import adapt_output_tensor, _model_cache +from collections import deque +from transformers.cache_utils import Cache +from intel_npu_acceleration_library.backend.bindings import lib as backend_lib +import ctypes +from ipex_llm.utils.common import invalidInputError +from typing import Optional, List, Generator +import uuid +from functools import partial +import torch.nn.functional as F +import torch.nn.parallel +import torch.distributed as dist +from filelock import FileLock + +from transformers.utils import logging + +logger = logging.get_logger(__name__) +import gc +from colorama import Fore, Back, Style +import torch.multiprocessing as mp +from transformers.cache_utils import Cache +from transformers.modeling_outputs import BaseModelOutputWithPast +from ipex_llm.transformers.npu_models.mp_models_base import run_model +from ipex_llm.transformers.npu_models.mp_models_base import LLMBaseNNFactory + + +class LowBitLlamaMultiDecoderlayer(LLMBaseNNFactory): + def __init__( + self, + # batch_size: int, + # seq_len: int, + # hidden_size: int, + hidden_shape: Sequence[int], + *shapes, + num_heads: int, + num_key_value_heads: int, + num_layers: int, + cached_cos, + cached_sin, + input_layernorm_weights=None, + post_attn_layernorm_weights=None, + mode: str = "prefill", + dtype: np.dtype = np.int8, + max_seq_len: int = 1024, + transpose_value: bool = False, + profile: bool = False, + device: str = "NPU", + rms_norm_eps, + intermediate_size, + scale_depth, + num_hidden_layers + ): + super().__init__(max_seq_len=max_seq_len, + transpose_value=transpose_value, + dtype=dtype, + profile=profile, + device=device) + self.max_seq_len = max_seq_len + self.intermediate_size = intermediate_size + self.dtype = dtype + self.cached_cos = cached_cos + self.cached_sin = cached_sin + self.batch_size, self.seq_len, self.hidden_size = hidden_shape + self.mode = mode + self.rms_norm_eps = rms_norm_eps + self.transpose_value = transpose_value + self.num_layers = num_layers + + cos = self.constant(self.cached_cos) + self.cos = self.unsqueeze(cos, axis=0) + + sin = self.constant(self.cached_sin) + self.sin = self.unsqueeze(sin, axis=0) + + if mode == "decode": + self.kv_seq_len = self.max_seq_len + 1 + else: + self.kv_seq_len = self.seq_len + + self.num_heads = num_heads + self.num_key_value_heads = num_key_value_heads + + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + + # define input, the order self.parameter matters + input = self.create_input_op((self.batch_size, self.seq_len, self.hidden_size)) + + # Self Attention + if mode == "decode": + attention_mask = self.create_input_op((self.batch_size, 1, 1, self.max_seq_len + 1)) + else: + attention_mask = self.create_input_op((self.batch_size, 1, self.seq_len, self.seq_len)) + + position_ids = self.create_input_op((self.batch_size, self.seq_len)) + past_keys = [] + past_values = [] + if mode == "decode": + for i in range(num_layers): + past_key = self.create_cache_op( + (self.batch_size, self.num_key_value_heads, 
self.max_seq_len, self.head_dim) + ) + if transpose_value: + past_value = self.create_cache_op( + (self.batch_size, self.num_key_value_heads, self.head_dim, self.max_seq_len) + ) + else: + past_value = self.create_cache_op( + (self.batch_size, self.num_key_value_heads, self.max_seq_len, self.head_dim) + ) + past_keys.append(past_key) + past_values.append(past_value) + else: + past_keys = [None] * num_layers + past_values = [None] * num_layers + + if input_layernorm_weights is None: + input_layernorm_weights = [] + post_attn_layernorm_weights = [] + for i in range(num_layers): + input_layernorm_weights.append( + self.create_input_op( + ( + 1, + self.hidden_size, + ) + ) + ) + post_attn_layernorm_weights.append( + self.create_input_op( + ( + 1, + self.hidden_size, + ) + ) + ) + else: + input_layernorm_weights = [self.constant(w) for w in input_layernorm_weights] + post_attn_layernorm_weights = [self.constant(w) for w in post_attn_layernorm_weights] + + hidden_states = input + + curr_key_values = [] + for i in range(num_layers): + hidden_states, new_key_states, new_value_states = self.build_decoder( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + input_layernorm_weight=input_layernorm_weights[i], + post_attention_layernorm_weight=post_attn_layernorm_weights[i], + scale_depth=scale_depth, + num_hidden_layers=num_hidden_layers, + past_key=past_keys[i], + past_value=past_values[i], + ) + curr_key_values.append((new_key_states, new_value_states)) + + # define outputs + hidden_states = self.convert_to_fp16(hidden_states) + + for i in range(num_layers): + new_key_states = self.convert_to_fp16(curr_key_values[i][0]) + new_value_states = self.convert_to_fp16(curr_key_values[i][1]) + + print("start compiling") + self.compile() + + def build_decoder( + self, + hidden_states, + attention_mask, + position_ids, + input_layernorm_weight, + post_attention_layernorm_weight, + scale_depth, + num_hidden_layers, + past_key=None, + past_value=None, + ): + + residual = hidden_states + input_2d = self.reshape(hidden_states, (self.batch_size * self.seq_len, self.hidden_size)) + input_2d = self.layer_norm(input_2d, input_layernorm_weight) + + attn_output, new_key_states, new_value_states = self.attention( + hidden_states=input_2d, + position_ids=position_ids, + attention_mask=attention_mask, + past_key=past_key, + past_value=past_value, + cos=self.cos, + sin=self.sin, + mode=self.mode, + num_heads=self.num_heads, + num_key_value_heads=self.num_key_value_heads, + head_dim=self.head_dim, + seq_len=self.seq_len, + ) + + layer_scale_depth = scale_depth / math.sqrt(num_hidden_layers) + hidden_states = self.eltwise_add(residual, + attn_output * layer_scale_depth) + residual = hidden_states + hidden_states = self.layer_norm(hidden_states, post_attention_layernorm_weight) + hidden_states = self.mlp(hidden_states) + hidden_states = self.eltwise_add(residual, + hidden_states * layer_scale_depth) + hidden_states = self.convert_to_fp16(hidden_states) + + return hidden_states, new_key_states, new_value_states + + +class FusedLlamaLowBitMultiDecoderlayer(torch.nn.Module): + + def __init__( + self, + parameters: List[Tuple[torch.Tensor]], + input_laynorm_weights: List[torch.Tensor], + post_attn_layernorm_weights: List[torch.Tensor], + layer_indexes: List[int], + intra_stages: int, + cached_cos: torch.Tensor, + cached_sin: torch.Tensor, + num_heads: int, + head_dim: int, + num_key_value_heads: int, + rms_norm_eps, + intermediate_size, + scale_depth, + num_hidden_layers, + max_seq_len: 
int = 1024, + transpose_value: bool = False, + do_print: bool = False, + ): + super().__init__() + + self.do_print = do_print + + op_parameters = [] + for w in parameters: + if isinstance(w, tuple): # from QuantizedLinear + op_parameters.append((w[0].numpy(), w[1].numpy())) + else: + op_parameters.append(w.to(torch.float16).numpy()) + self.op_parameters = op_parameters + self.op_id = str(uuid.uuid4()) + self.max_seq_len = max_seq_len + self.transpose_value = transpose_value + if isinstance(parameters[0], tuple): + np_dtype = np.int8 if parameters[0][0].dtype == torch.int8 else np.uint8 + else: # FP16 Linear + np_dtype = np.float16 + + self.intra_stages = intra_stages + self.layer_indexes = layer_indexes + num_layers = len(self.layer_indexes) // intra_stages + self.layer_ranges = [] + for i in range(intra_stages): + if i == intra_stages - 1: + self.layer_ranges.append((i * num_layers, len(self.layer_indexes))) + else: + self.layer_ranges.append((i * num_layers, (i + 1) * num_layers)) + + self.backend_decoders = [] + + for i in range(intra_stages): + start, end = self.layer_ranges[i] + lm_0 = input_laynorm_weights[start:end] + lm_1 = post_attn_layernorm_weights[start:end] + decoder = LowBitLlamaMultiDecoderlayer( + [1, 1, num_heads * head_dim], + input_layernorm_weights=lm_0, + post_attn_layernorm_weights=lm_1, + cached_cos=cached_cos, + cached_sin=cached_sin, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + num_layers=end - start, + max_seq_len=max_seq_len, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + scale_depth=scale_depth, + num_hidden_layers=num_hidden_layers, + mode="decode", + transpose_value=self.transpose_value, + dtype=np_dtype, + ) + self.backend_decoders.append(decoder) + + for i in range(intra_stages): + start, end = self.layer_ranges[i] + self.backend_decoders[i].set_weights(self.op_id, op_parameters[start * 7:end * 7]) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> torch.Tensor: + + inputs = ( + hidden_states.to(torch.float16), + attention_mask, + position_ids, + ) + + for i in range(self.intra_stages): + start, end = self.layer_ranges[i] + self.backend_decoders[i].update_cache(past_key_value, self.layer_indexes[start:end]) + + hidden_states, new_keys, new_values = LowBitLlamaMultiDecoderlayer.run_decoders( + inputs, + decoders=self.backend_decoders) + + if self.do_print: + print("outputs:", hidden_states) + + outputs = (hidden_states,) + outputs += (past_key_value, new_keys, new_values) + return outputs + + def post_forward(self, past_key_value, new_keys, new_values): + key_value_states = [] + for i in range(self.intra_stages): + for j in range(1, len(self.backend_decoders[i].torch_out)): + key_value_states.append(self.backend_decoders[i].torch_out[j]) + + cache_kwargs = { + "max_seq_len": self.max_seq_len, + "transpose": self.transpose_value, + } + for i in range(len(self.layer_indexes)): + key_states, value_states = past_key_value.update( + new_keys[i], + new_values[i], + self.layer_indexes[i], + cache_kwargs, + ) + + for i in range(self.intra_stages): + self.backend_decoders[i].load_cache_async() + + +class FusedLlamaLowBitDecoderlayer(torch.nn.Module): + """LLAMA MLP operation NPU backend.""" + + def __init__( + self, + parameters: List[torch.Tensor], + cached_cos, + cached_sin, + layer_norm_0, + 
layer_norm_1, + num_heads: int, + num_key_value_heads: int, + layer_idx: int, + rms_norm_eps, + intermediate_size, + scale_depth, + num_hidden_layers, + max_seq_len: int = 128, + transpose_value: bool = False, + ): + super().__init__() + self.op_parameters = parameters + self.op_id = str(uuid.uuid4()) + self.layer_idx = layer_idx + self.max_seq_len = max_seq_len + self.transpose_value = transpose_value + # self.rotary_emb = rotary_emb + if isinstance(parameters[0], tuple): # weight, scale from QuantizedLinear + np_dtype = np.int8 if parameters[0][0].dtype == torch.int8 else np.uint8 + else: # FP16 Linear + np_dtype = np.float16 + + self.backend_cls_prefill = partial( + LowBitLlamaMultiDecoderlayer, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + num_layers=1, + cached_cos=cached_cos, + cached_sin=cached_sin, + input_layernorm_weights=None, + post_attn_layernorm_weights=None, + max_seq_len=max_seq_len, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + scale_depth=scale_depth, + num_hidden_layers=num_hidden_layers, + mode="prefill", + transpose_value=self.transpose_value, + dtype=np_dtype, + ) + self.layer_norm_0 = layer_norm_0 + self.layer_norm_1 = layer_norm_1 + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> torch.Tensor: + """Torch module forward method. + + Args: + x (torch.Tensor): Input tensor + + Returns: + torch.Tensor: result + """ + + seq_len = hidden_states.shape[1] + + backend_cls = self.backend_cls_prefill + inputs = (hidden_states.to(torch.float16), attention_mask, position_ids) + inputs += (self.layer_norm_0, self.layer_norm_1) + hidden_states, past_key, past_value = run_model( + inputs, self.op_parameters, backend_cls, self.op_id, replica=2 + ) + cache_kwargs = { + "max_seq_len": self.max_seq_len, + "transpose": self.transpose_value, + } + key_states, value_states = past_key_value.update( + past_key, past_value, self.layer_idx, cache_kwargs + ) + + outputs = (hidden_states,) + outputs += (past_key_value,) + return outputs + + +def run_decode( + model, + rank, + world_size, + port, + layer_start, + layer_end, + intra_stages, + scale_depth, + max_seq_len, + transpose_value_cache, + input_queue, + result_queue, +): + + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = port + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + + print("start init process group, rank: ", rank, "world_size: ", world_size) + + dist.init_process_group() + my_rank = dist.get_rank() + my_size = dist.get_world_size() + logger.info(f"rank: {my_rank}, size: {my_size}") + + num_heads = model.model.layers[layer_start].self_attn.num_heads + num_key_value_heads = model.model.layers[layer_start].self_attn.num_key_value_heads + head_dim = model.model.layers[layer_start].self_attn.head_dim + rms_norm_eps = model.config.rms_norm_eps + intermediate_size = model.config.intermediate_size + num_hidden_layers = model.config.num_hidden_layers + deocderlayers = [] + layer_weights = [] + input_layer_norm_weights = [] + post_attn_layernorm_weights = [] + layer_indexs = range(layer_start, layer_end) + for layer_idx in layer_indexs: + curr_layer = model.model.layers[layer_idx] + attn_layer = curr_layer.self_attn + mlp_layer = curr_layer.mlp + + weights = [ + (attn_layer.q_proj.weight, attn_layer.q_proj.scale), + 
(attn_layer.k_proj.weight, attn_layer.k_proj.scale), + (attn_layer.v_proj.weight, attn_layer.v_proj.scale), + (attn_layer.o_proj.weight, attn_layer.o_proj.scale), + (mlp_layer.gate_proj.weight, mlp_layer.gate_proj.scale), + (mlp_layer.up_proj.weight, mlp_layer.up_proj.scale), + (mlp_layer.down_proj.weight, mlp_layer.down_proj.scale), + ] + + cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) + cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) + layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16) + layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16) + + layer_weights.extend(weights) + input_layer_norm_weights.append(layer_norm_0) + post_attn_layernorm_weights.append(layer_norm_1) + + multi_decoder = FusedLlamaLowBitMultiDecoderlayer( + parameters=layer_weights, + input_laynorm_weights=input_layer_norm_weights, + post_attn_layernorm_weights=post_attn_layernorm_weights, + layer_indexes=layer_indexs, + intra_stages=intra_stages, + cached_cos=cached_cos, + cached_sin=cached_sin, + num_heads=num_heads, + head_dim=head_dim, + num_key_value_heads=num_key_value_heads, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + scale_depth=scale_depth, + num_hidden_layers=num_hidden_layers, + max_seq_len=max_seq_len, + transpose_value=transpose_value_cache, + do_print=False, + ) + + dist.barrier() + + past_key_values = None + + control = torch.empty((), dtype=torch.int) + hidden_states = torch.empty((1, 1, head_dim * num_heads), dtype=torch.float16) + with torch.inference_mode(): + while True: + + dist.broadcast(control, src=0) + if control.item() == -2: + break + elif control.item() == -1: + past_key_values = input_queue.get() + else: + t0 = time.perf_counter() + past_seen_tokens = past_key_values.get_seq_length() + attention_mask = torch.ones([1, past_seen_tokens + 1], dtype=torch.int64) + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + 1, device=hidden_states.device + ) + + position_ids = cache_position.unsqueeze(0) + from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask + + causal_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (hidden_states.shape[0], hidden_states.shape[1]), + hidden_states, + past_seen_tokens, + ) + pad_len = multi_decoder.max_seq_len + 1 - causal_mask.size(-1) + + pad_mask = (0, pad_len) + padded_causal_mask = F.pad( + causal_mask.to(torch.float16), pad_mask, value=torch.finfo(torch.float16).min + ) + padded_causal_mask[:, :, :, -1] = 0.0 + dist.recv(hidden_states, src=rank - 1) + t1 = time.perf_counter() + layer_outputs = multi_decoder( + hidden_states, + attention_mask=padded_causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=False, + use_cache=True, + ) + t2 = time.perf_counter() + hidden_states = layer_outputs[0] + t3 = time.perf_counter() + dist.send(hidden_states, dst=(rank + 1) % world_size) + t4 = time.perf_counter() + past_key_values = layer_outputs[1] + new_keys = layer_outputs[2] + new_values = layer_outputs[3] + multi_decoder.post_forward(past_key_values, new_keys, new_values) + + +class DecodeRunner: + def __init__(self, model, max_seq_len, intra_pp=2, inter_pp=2, transpose_value_cache=True): + self.model = model + self.max_seq_len = max_seq_len + self.transpose_value_cache = transpose_value_cache + world_size = inter_pp + 1 + intra_stages = intra_pp + num_layers = self.model.model.config.num_hidden_layers + scale_depth = self.model.model.config.scale_depth + 
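+ # Descriptive note (added comment): the decode path here is pipeline-parallel. This host
+ # process joins the group as rank 0 and only sends/receives hidden states, while each of the
+ # inter_pp worker processes spawned below (world_size = inter_pp + 1) runs a contiguous slice
+ # of decoder layers via run_decode; within a worker, intra_pp further splits that slice into
+ # intra_stages fused NPU graphs.
+ # scale_depth is MiniCPM's residual-scaling hyperparameter: the fused layers apply
+ # scale_depth / sqrt(num_hidden_layers) to the attention and MLP outputs before each
+ # residual add (see LowBitLlamaMultiDecoderlayer.build_decoder above).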
+ port = "54791" + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = port + os.environ["RANK"] = "0" + os.environ["WORLD_SIZE"] = str(world_size) + + self.input_queues = [] + self.output_queues = [] + self.decoder_processes = [] + + for rank in range(1, world_size): + input_q = mp.Queue() + output_q = mp.Queue() + start_layer = (rank - 1) * (num_layers // (world_size - 1)) + end_layer = (rank) * (num_layers // (world_size - 1)) + if rank == world_size - 1: + end_layer = num_layers + p = mp.Process( + target=run_decode, + args=( + self.model, + rank, + world_size, + port, + start_layer, + end_layer, + intra_stages, + scale_depth, + self.max_seq_len, + self.transpose_value_cache, + input_q, + output_q, + ), + ) + p.daemon = True + p.start() + self.input_queues.append(input_q) + self.output_queues.append(output_q) + self.decoder_processes.append(p) + + dist.init_process_group() + my_rank = dist.get_rank() + self.world_size = dist.get_world_size() + logger.info(f"rank: {my_rank}, size: {self.world_size}") + + dist.barrier() + self.cache_past_key_value = None + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + t0 = time.perf_counter() + + if self.cache_past_key_value != past_key_value: + control = torch.tensor(-1, dtype=torch.int) + dist.broadcast(control, src=0) + for i in range(len(self.decoder_processes)): + self.input_queues[i].put(past_key_value) + + control = torch.tensor(0, dtype=torch.int) + dist.broadcast(control, src=0) + hidden_states = hidden_states.to(torch.float16) + dist.send(hidden_states, dst=1) + past_key_value.expand(self.transpose_value_cache) + dist.recv(hidden_states, src=self.world_size - 1) + t1 = time.perf_counter() + return hidden_states, past_key_value + + def shutdown(self): + control = torch.tensor(-2, dtype=torch.int) + dist.broadcast(control, src=0) + for p in self.decoder_processes: + p.join(3) + for p in self.decoder_processes: + if p.exitcode is None: + p.kill() + + def __del__(self): + self.shutdown() + + +def run_prefill( + model, max_output_len, max_prompt_len, transpose_value_cache, input_queue, result_queue +): + + layer_start = 0 + layer_end = len(model.model.layers) + num_heads = model.model.layers[layer_start].self_attn.num_heads + num_key_value_heads = model.model.layers[layer_start].self_attn.num_key_value_heads + head_dim = model.model.layers[layer_start].self_attn.head_dim + rms_norm_eps = model.config.rms_norm_eps + intermediate_size = model.config.intermediate_size + scale_depth = model.config.scale_depth + num_hidden_layers = model.config.num_hidden_layers + deocderlayers = [] + layer_weights = [] + input_layer_norm_weights = [] + post_attn_layernorm_weights = [] + layer_indexs = range(layer_start, layer_end) + for layer_idx in layer_indexs: + curr_layer = model.model.layers[layer_idx] + attn_layer = curr_layer.self_attn + mlp_layer = curr_layer.mlp + + weights = [ + (attn_layer.q_proj.weight, attn_layer.q_proj.scale), + (attn_layer.k_proj.weight, attn_layer.k_proj.scale), + (attn_layer.v_proj.weight, attn_layer.v_proj.scale), + (attn_layer.o_proj.weight, attn_layer.o_proj.scale), + (mlp_layer.gate_proj.weight, mlp_layer.gate_proj.scale), + (mlp_layer.up_proj.weight, mlp_layer.up_proj.scale), + (mlp_layer.down_proj.weight, mlp_layer.down_proj.scale), + ] + + cached_cos = 
curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) + cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) + + layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16) + layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16) + + new_decoderlayer = FusedLlamaLowBitDecoderlayer( + weights, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + cached_cos=cached_cos, + cached_sin=cached_sin, + layer_norm_0=layer_norm_0, + layer_norm_1=layer_norm_1, + layer_idx=layer_idx, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + scale_depth=scale_depth, + num_hidden_layers=num_hidden_layers, + max_seq_len=max_output_len, + transpose_value=transpose_value_cache, + ) + + layer_weights.extend(weights) + input_layer_norm_weights.append(layer_norm_0) + post_attn_layernorm_weights.append(layer_norm_1) + model.model.layers[layer_idx] = new_decoderlayer + deocderlayers.append(new_decoderlayer) + + print("finish creating all decode layers in prefill") + result_queue.put("loading finish") + + while True: + + result = input_queue.get() + if result == "stop": + break + + hidden_states, position_ids, causal_mask, past_key_values = result + with torch.inference_mode(): + for decoder_layer in deocderlayers: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=False, + use_cache=True, + ) + + hidden_states = layer_outputs[0] + next_decoder_cache = layer_outputs[1] + + result_queue.put((hidden_states, next_decoder_cache)) + + +class PrefillRunner: + def __init__(self, model, max_output_len, max_prompt_len, transpose_value_cache): + self.model = model + self.max_output_len = max_output_len + self.max_prompt_len = max_prompt_len + self.transpose_value_cache = transpose_value_cache + + self.prefill_result_queue = mp.Queue() + self.prefill_input_queue = mp.Queue() + + self.p = mp.Process( + target=run_prefill, + args=( + model, + max_output_len, + max_prompt_len, + transpose_value_cache, + self.prefill_input_queue, + self.prefill_result_queue, + ), + ) + self.p.daemon = True + self.p.start() + output = self.prefill_result_queue.get() + print(Fore.GREEN + f"prefill process output: {output}") + print(Style.RESET_ALL) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + seq_len = hidden_states.size(1) + invalidInputError( + seq_len <= self.max_prompt_len, + ( + f"seq_len: {seq_len} should be less than or equal" + " to max_prompt_len {self.max_prompt_len}" + ), + ) + pad_len = self.max_prompt_len - seq_len + hidden_states = F.pad(hidden_states.to(torch.float16), (0, 0, 0, pad_len), value=0.0) + position_ids = F.pad(position_ids, (0, pad_len), value=0) + attention_mask = F.pad( + attention_mask.to(torch.float16), + (0, pad_len, 0, pad_len), + value=torch.finfo(torch.float16).min, + ) + + args = (hidden_states, position_ids, attention_mask, past_key_value) + self.prefill_input_queue.put(args) + hidden_states, past_key_value = self.prefill_result_queue.get() + past_key_value.shrink(seq_len, self.transpose_value_cache) + hidden_states = hidden_states[:, :seq_len, :] + return hidden_states, past_key_value + + def shutdown(self): + self.prefill_input_queue.put("stop") + self.p.join(3) + if self.p.exitcode is 
None: + self.p.kill() + + def __del__(self): + self.shutdown() + + +from transformers.cache_utils import Cache +from transformers.modeling_outputs import BaseModelOutputWithPast +from transformers.cache_utils import Cache +from transformers.modeling_attn_mask_utils import ( + _prepare_4d_causal_attention_mask, +) + + +def gen_minicpm_fused_model_forward(prefill_runner, decode_runner): + + def minicpm_fused_model_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + t0 = time.perf_counter() + output_attentions = ( + output_attentions if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + invalidInputError(False, + "You cannot specify both decoder_input_ids and " + "decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + invalidInputError(False, + "You have to specify either input_ids or inputs_embeds") + + from ipex_llm.transformers.npu_models.kv import DynamicFusedNormalCache + + past_key_values_length = 0 + if use_cache and not isinstance(past_key_values, DynamicFusedNormalCache): + past_key_values = DynamicFusedNormalCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_seq_length() + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.config.scale_emb + + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + if seq_length == 1: + layers_runner = decode_runner + else: + layers_runner = prefill_runner + + layer_outputs = layers_runner.forward( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + next_decoder_cache = layer_outputs[1] + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # ipex-llm changes start + next_cache = next_decoder_cache if use_cache else None + # ipex-llm changes end + if not return_dict: + return tuple( + 
v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + t1 = time.perf_counter() + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + return minicpm_fused_model_forward diff --git a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py new file mode 100644 index 00000000000..611270f8b12 --- /dev/null +++ b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py @@ -0,0 +1,428 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import torch +from intel_npu_acceleration_library.backend.factory import NNFactory +from typing import List, Union, Any +from intel_npu_acceleration_library.backend.runtime import set_contiguous, record_function +from intel_npu_acceleration_library.backend.runtime import adapt_output_tensor, _model_cache +from collections import deque +from intel_npu_acceleration_library.backend.bindings import lib as backend_lib +from ipex_llm.utils.common import invalidInputError +from transformers.utils import logging +from filelock import FileLock +import ctypes +import math +import numpy as np + +logger = logging.get_logger(__name__) + + +@torch.no_grad() +def run_model( + x: Union[torch.Tensor, List[torch.Tensor]], + weights: List[torch.Tensor], + backend_cls: Any, + op_id: str, + replica: int = 1, +) -> torch.Tensor: + global _model_cache + import time + + t0 = time.perf_counter() + + # Use or not op_id depending on the class used + op_kwargs = {"op_id": op_id} if op_id else {} + + if not isinstance(x, (list, tuple)): + x = [x] + + # Reshape input + input_dtype = x[0].dtype + x_np = [set_contiguous(elem).to(torch.float16).numpy() for elem in x] + op_args = [] + op_args_flatten = [] + for w in weights: + if isinstance(w, tuple): # from QuantizedLinear + op_args.append((set_contiguous(w[0]).numpy(), set_contiguous(w[1]).numpy())) + op_args_flatten.append(op_args[-1][0]) + op_args_flatten.append(op_args[-1][1]) + else: + op_args.append(set_contiguous(w).to(torch.float16).numpy()) + op_args_flatten.append(op_args[-1]) + + shape_dtype_signature = "_".join( + ["_".join(str(dim) for dim in t.shape) + f"_{t.dtype}" for t in x_np + op_args_flatten] + ) + key = f"{backend_cls.func.__name__}_{shape_dtype_signature}" + models = _model_cache.get(key, None) + + input_shapes = [elem.shape for elem in x_np] + if models is None: + _model_cache[key] = deque([backend_cls(*input_shapes) for i in range(replica)]) + elif len(models) < 1: + _model_cache[key].append(backend_cls(*input_shapes)) + else: + _model_cache[key].rotate(1) + + # Get the model + model = _model_cache[key][0] + + with record_function(f"npu_factory_mul_{key}"): + ret = model.run(x_np, *op_args, **op_kwargs) + + if isinstance(ret, list): + results = [adapt_output_tensor(r, r.shape, input_dtype) for r in ret] + else: + results = 
adapt_output_tensor(ret, ret.shape, input_dtype) + + return results + + +class LLMBaseNNFactory(NNFactory): + + def __init__(self, max_seq_len, transpose_value, dtype, profile=False, device="NPU"): + super().__init__(profile, device) + self.cache_parameter_ops = [] + self.input_ops = [] + self.linear_ops = [] + self.kv_cache_c_handle = None + self.kv_cache_torch = [] + self.max_seq_len = max_seq_len + self.transpose_value = transpose_value + self.dtype = dtype + + def attention(self, + *, + hidden_states, + position_ids, + attention_mask, + past_key, + past_value, + cos, + sin, + mode, + num_heads, + num_key_value_heads, + head_dim, + seq_len, + q_bias=None, + k_bias=None, + v_bias=None): + hidden_size = num_heads * head_dim + num_key_value_groups = num_heads // num_key_value_heads + query_states = self.linear( + hidden_states, + num_heads * head_dim, + hidden_size, + bias=False, + wt_dtype=self.dtype, + ) + if q_bias is not None: + query_states = query_states + q_bias + key_states = self.linear( + hidden_states, + num_key_value_heads * head_dim, + hidden_size, + bias=False, + wt_dtype=self.dtype, + ) + if k_bias is not None: + key_states = key_states + k_bias + value_states = self.linear( + hidden_states, + num_key_value_heads * head_dim, + hidden_size, + bias=False, + wt_dtype=self.dtype, + ) + if v_bias is not None: + value_states = value_states + v_bias + + query_states = self.reshape( + query_states, [1, seq_len, num_heads, head_dim] + ) + key_states = self.reshape( + key_states, [1, seq_len, num_key_value_heads, head_dim] + ) + value_states = self.reshape( + value_states, [1, seq_len, num_key_value_heads, head_dim] + ) + + query_states = self.transpose(query_states, [0, 2, 1, 3]) + key_states = self.transpose(key_states, [0, 2, 1, 3]) + if self.transpose_value: + value_states = self.transpose(value_states, [0, 2, 3, 1]) + else: + value_states = self.transpose(value_states, [0, 2, 1, 3]) + + query_states, key_states = self.apply_rotary_pos_emb( + q=query_states, + k=key_states, + cos=cos, + sin=sin, + position_ids=position_ids, + num_heads=num_heads, + seq_len=seq_len, + head_dim=head_dim, + ) + new_key_states = key_states + new_value_states = value_states + + if mode == "decode": + key_states = self.concat(past_key, key_states, axis=-2) + if self.transpose_value: + value_states = self.concat(past_value, value_states, axis=-1) + else: + value_states = self.concat(past_value, value_states, axis=-2) + kv_seq_len = self.max_seq_len + 1 + else: + kv_seq_len = seq_len + + key_states = self.repeat_kv(hidden_states=key_states, + n_rep=num_key_value_groups, + num_key_value_heads=num_key_value_heads, + kv_seq_len=kv_seq_len, + head_dim=head_dim,) + value_states = self.repeat_kv(hidden_states=value_states, + n_rep=num_key_value_groups, + num_key_value_heads=num_key_value_heads, + kv_seq_len=kv_seq_len, + head_dim=head_dim, + transpose=self.transpose_value) + attn_weight = self.matmul(query_states, key_states, False, True) / ( + math.sqrt(head_dim) + ) + attn_weight = self.eltwise_add(attn_weight, attention_mask) + attn_weight = self.convert_to_fp32(attn_weight) + attn_weight = self.softmax(attn_weight, -1) + attn_weight = self.convert_to_fp16(attn_weight) + attn_output = self.matmul(attn_weight, value_states, False, self.transpose_value) + + attn_output = self.transpose(attn_output, [0, 2, 1, 3]) + attn_output = self.reshape(attn_output, [1, seq_len, hidden_size]) + + attn_output = self.linear( + attn_output, hidden_size, hidden_size, bias=False, wt_dtype=self.dtype + ) + + return attn_output, 
new_key_states, new_value_states + + def mlp(self, hidden_states): + mm1 = self.linear( + hidden_states, self.intermediate_size, self.hidden_size, bias=False, wt_dtype=self.dtype + ) + mm2 = self.linear( + hidden_states, self.intermediate_size, self.hidden_size, bias=False, wt_dtype=self.dtype + ) # type: ignore[attr-defined] + mm1 = self.eltwise_mul(self.swish(mm1), mm2) # type: ignore[attr-defined] + hidden_states = self.linear( + mm1, self.hidden_size, self.intermediate_size, bias=False, wt_dtype=self.dtype + ) + return hidden_states + + def layer_norm(self, hidden_states, layernorm_weight): + hidden_states = self.convert_to_fp32(hidden_states) + variance = self.reduce_mean( + self.power(hidden_states, self.constant(np.array([[2]], dtype=np.float32))), + -1, + keep_dims=True, + ) + eps = self.constant(self.rms_norm_eps) + hidden_states = self.eltwise_div(hidden_states, self.sqrt(self.eltwise_add(variance, eps))) + layernorm_weight = self.convert_to_fp32(layernorm_weight) + hidden_states = self.eltwise_mul(layernorm_weight, hidden_states) + hidden_states = self.convert_to_fp16(hidden_states) + return hidden_states + + def rotate_half(self, x, *, num_heads, seq_len, head_dim): + x1 = self.slice( + x, + [0, 0, 0, 0], + [1, num_heads, seq_len, head_dim // 2], + ) + x2 = self.slice( + x, + [0, 0, 0, head_dim // 2], + [1, num_heads, seq_len, head_dim], + ) + return self.concat(self.negative(x2), x1, axis=-1) + + def apply_rotary_pos_emb(self, *, q, k, cos, sin, position_ids, + num_heads, seq_len, head_dim): + position_ids = self.squeeze(position_ids) + cos = self.gather(cos, self.convert_to_int32(position_ids), self.constant(1), 0) + sin = self.gather(sin, self.convert_to_int32(position_ids), self.constant(1), 0) + cos = self.unsqueeze(cos, [1]) + sin = self.unsqueeze(sin, [1]) + + rotate_half_q = self.rotate_half(q, + num_heads=num_heads, + seq_len=seq_len, + head_dim=head_dim) + rotate_half_k = self.rotate_half(k, + num_heads=num_heads, + seq_len=seq_len, + head_dim=head_dim) + + q_embed = self.eltwise_add( + self.eltwise_mul(q, cos), self.eltwise_mul(rotate_half_q, sin) + ) + k_embed = self.eltwise_add( + self.eltwise_mul(k, cos), self.eltwise_mul(rotate_half_k, sin) + ) + + return q_embed, k_embed + + def repeat_kv(self, *, hidden_states, n_rep, num_key_value_heads, + kv_seq_len, head_dim, transpose=False): + if n_rep == 1: + return hidden_states + if not transpose: + hidden_states = self.reshape( + hidden_states, + [1, num_key_value_heads, 1, kv_seq_len, head_dim], + ) + hidden_states = self.broadcast( + hidden_states, + [1, num_key_value_heads, n_rep, kv_seq_len, head_dim], + ) + hidden_states = self.reshape( + hidden_states, + [1, n_rep * num_key_value_heads, kv_seq_len, head_dim], + ) + else: + hidden_states = self.reshape( + hidden_states, + [1, num_key_value_heads, 1, head_dim, kv_seq_len], + ) + hidden_states = self.broadcast( + hidden_states, + [1, num_key_value_heads, n_rep, head_dim, kv_seq_len], + ) + hidden_states = self.reshape( + hidden_states, + [1, n_rep * num_key_value_heads, head_dim, kv_seq_len], + ) + return hidden_states + + def create_cache_op(self, shape): + invalidInputError(len(self.linear_ops) == 0, + "create_cache_op should be called before any linear op") + op = super().parameter(shape) + self.cache_parameter_ops.append(op) + return op + + def create_input_op(self, shape): + invalidInputError(len(self.cache_parameter_ops) == 0, + "create_input_op should be called before any create_cache_op") + invalidInputError(len(self.linear_ops) == 0, + "create_input_op 
should be called before any linear op") + + op = super().parameter(shape) + self.input_ops.append(op) + return op + + def linear(self, *args, **kwargs): + op = super().linear(*args, **kwargs) + self.linear_ops.append(op) + return op + + def parameter(self, shape): + invalidInputError(False, + ("parameter should not be called directly, " + "use create_cache_op or create_input_op instead")) + + def update_cache(self, past_key_value, indexes): + + if self.kv_cache_c_handle is not None: + curr_ptr = self.kv_cache_torch[0].storage().data_ptr() + new_ptr = past_key_value.key_cache[indexes[0]].storage().data_ptr() + if curr_ptr != new_ptr: + backend_lib.destroyParameters(self.kv_cache_c_handle) + self.kv_cache_c_handle = None + self.kv_cache_torch = [] + if self.kv_cache_c_handle is None: + for idx in indexes: + past_key = past_key_value.key_cache[idx] + past_value = past_key_value.value_cache[idx] + invalidInputError( + past_key.dtype == torch.float16, f"past_key dtype is {past_key.dtype}" + ) + new_size = (past_key.size(0), past_key.size(1), self.max_seq_len, past_key.size(3)) + past_key = past_key.as_strided(new_size, past_key.stride(), storage_offset=0) + invalidInputError(past_key.is_contiguous(), "past_key is not contiguous") + past_value = past_value.as_strided(new_size, past_value.stride(), storage_offset=0) + if self.transpose_value: + past_value = past_value.transpose(-1, -2) + invalidInputError(past_value.is_contiguous(), "past_value is not contiguous") + + self.kv_cache_torch.append(past_key) + self.kv_cache_torch.append(past_value) + + layer_kv_cache_np = [p.numpy() for p in self.kv_cache_torch] + invalidInputError(len(self.cache_parameter_ops) == len(layer_kv_cache_np), + (f"kv_cache size does not match graph, " + f"with kv_cache size: {len(layer_kv_cache_np)} and" + f" graph size: {len(self.cache_parameter_ops)}") + ) + self.kv_cache_c_handle = self.create_parameters(layer_kv_cache_np) + self.load_cache_async() + + def load_cache_async(self): + self.load_wt_fn(len(self.input_ops), self._mm, self.kv_cache_c_handle) + + def set_weights(self, op_id, weights): + self.set_weights_async(op_id, weights) + with FileLock(f"decoder_run.lock"): + backend_lib.run(self._mm) + + def set_weights_async(self, op_id, weights): + offset = len(self.input_ops) + len(self.cache_parameter_ops) + invalidInputError(len(weights) == len(self.linear_ops), + (f"weights size does not match graph, " + f"with weights size: {len(weights)} and " + f" graph linear size: {len(self.linear_ops)}")) + self.setWeights(offset, op_id, *weights) + + @staticmethod + def run_decoders(inputs, decoders): + x_np = [elem.to(torch.float16).numpy() for elem in inputs] + + num_decoders = len(decoders) + num_inputs = len(x_np) + + with record_function(f"npu_factory"): + + array_type = ctypes.POINTER(ctypes.c_char) * num_decoders + models_ptr = array_type( + *[decoders[i]._mm for i in range(num_decoders)] + ) + inputs_ptr = (ctypes.c_void_p * num_inputs)( + *[x.ctypes.data_as(ctypes.c_void_p) for x in x_np] + ) + backend_lib.run_decoders(models_ptr, inputs_ptr, num_decoders, num_inputs) + + hidden_states = decoders[-1].torch_out[0] + new_key_states = [] + new_value_states = [] + for i in range(num_decoders): + for j in range(1, len(decoders[i].torch_out)): + if j % 2 == 1: + new_key_states.append(decoders[i].torch_out[j]) + else: + new_value_states.append(decoders[i].torch_out[j]) + return hidden_states, new_key_states, new_value_states diff --git a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py 
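For reference, a minimal sketch of how run_decoders above de-interleaves each decoder's torch_out buffers: slot 0 holds the hidden state and the remaining slots alternate key/value states for every fused layer. The placeholder values here are hypothetical and stand in for the real NPU output tensors.

decoder_outputs = [
    ["hidden_0", "k0", "v0", "k1", "v1"],  # decoder fusing layers 0-1
    ["hidden_1", "k2", "v2", "k3", "v3"],  # decoder fusing layers 2-3
]

hidden_states = decoder_outputs[-1][0]  # only the last stage's hidden state is kept
new_key_states, new_value_states = [], []
for torch_out in decoder_outputs:
    for j in range(1, len(torch_out)):
        # odd slots are keys, even slots (from 2 on) are values
        (new_key_states if j % 2 == 1 else new_value_states).append(torch_out[j])

assert new_key_states == ["k0", "k1", "k2", "k3"]
assert new_value_states == ["v0", "v1", "v2", "v3"]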
b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py new file mode 100644 index 00000000000..9ddad9391cd --- /dev/null +++ b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py @@ -0,0 +1,1137 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import torch +import time + +from typing import Optional, Sequence, List, Union, Any, Tuple +import numpy as np + +from transformers.cache_utils import Cache +from ipex_llm.utils.common import invalidInputError +from typing import Optional, List, Generator +import uuid +from functools import partial +import torch.nn.functional as F +import torch.nn.parallel +import torch.distributed as dist + +from transformers.utils import logging + +logger = logging.get_logger(__name__) +from colorama import Fore, Back, Style +import torch.multiprocessing as mp +from transformers.cache_utils import Cache +from transformers.modeling_outputs import BaseModelOutputWithPast +from ipex_llm.transformers.npu_models.mp_models_base import run_model +from ipex_llm.transformers.npu_models.mp_models_base import LLMBaseNNFactory +from ipex_llm.transformers.npu_models.common import reshape_lm_head_input +from transformers.modeling_outputs import CausalLMOutputWithPast +from torch.nn import CrossEntropyLoss +from transformers.models.qwen2.modeling_qwen2 import Qwen2MLP + + +def split_mlp_down_proj(module: torch.nn.Module): + if isinstance(module, Qwen2MLP) and module.down_proj.in_features == 18944: + new_linear_0 = torch.nn.Linear(0, 0, bias=False) + new_weight_0 = torch.nn.Parameter(module.down_proj.weight[:, :9472], requires_grad=False) + new_linear_0.weight = new_weight_0 + new_linear_0.in_features = new_weight_0.size(1) + new_linear_0.out_features = new_weight_0.size(0) + module.down_proj_0 = new_linear_0 + new_linear_1 = torch.nn.Linear(0, 0, bias=False) + new_weight_1 = torch.nn.Parameter(module.down_proj.weight[:, 9472:], requires_grad=False) + new_linear_1.weight = new_weight_1 + new_linear_1.in_features = new_weight_1.size(1) + new_linear_1.out_features = new_weight_1.size(0) + module.down_proj_1 = new_linear_1 + + del module.down_proj + + +def split_mlp_forward(self, x): + h = self.act_fn(self.gate_proj(x)) * self.up_proj(x) + return self.down_proj_0(h[:, :, :9472]) + self.down_proj_1(h[:, :, 9472:]) + + +class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory): + def __init__( + self, + # batch_size: int, + # seq_len: int, + # hidden_size: int, + hidden_shape: Sequence[int], + *shapes, + num_heads: int, + num_key_value_heads: int, + num_layers: int, + cached_cos, + cached_sin, + input_layernorm_weights=None, + post_attn_layernorm_weights=None, + q_biases=None, + k_biases=None, + v_biases=None, + mode: str = "prefill", + dtype: np.dtype = np.int8, + max_seq_len: int = 1024, + transpose_value: bool = False, + profile: bool = False, + device: str = "NPU", + rms_norm_eps, + intermediate_size, + ): + super().__init__(max_seq_len=max_seq_len, + transpose_value=transpose_value, + dtype=dtype, + profile=profile, 
+ device=device) + self.max_seq_len = max_seq_len + self.intermediate_size = intermediate_size + self.dtype = dtype + self.cached_cos = cached_cos + self.cached_sin = cached_sin + self.batch_size, self.seq_len, self.hidden_size = hidden_shape + self.mode = mode + self.rms_norm_eps = rms_norm_eps + self.transpose_value = transpose_value + self.num_layers = num_layers + + cos = self.constant(self.cached_cos) + self.cos = self.unsqueeze(cos, axis=0) + + sin = self.constant(self.cached_sin) + self.sin = self.unsqueeze(sin, axis=0) + + if mode == "decode": + self.kv_seq_len = self.max_seq_len + 1 + else: + self.kv_seq_len = self.seq_len + + self.num_heads = num_heads + self.num_key_value_heads = num_key_value_heads + + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + + # define input, the order self.parameter matters + input = self.create_input_op((self.batch_size, self.seq_len, self.hidden_size)) + + # Self Attention + if mode == "decode": + attention_mask = self.create_input_op((self.batch_size, 1, 1, self.max_seq_len + 1)) + else: + attention_mask = self.create_input_op((self.batch_size, 1, self.seq_len, self.seq_len)) + + position_ids = self.create_input_op((self.batch_size, self.seq_len)) + past_keys = [] + past_values = [] + if mode == "decode": + for i in range(num_layers): + past_key = self.create_cache_op( + (self.batch_size, self.num_key_value_heads, self.max_seq_len, self.head_dim) + ) + if transpose_value: + past_value = self.create_cache_op( + (self.batch_size, self.num_key_value_heads, self.head_dim, self.max_seq_len) + ) + else: + past_value = self.create_cache_op( + (self.batch_size, self.num_key_value_heads, self.max_seq_len, self.head_dim) + ) + past_keys.append(past_key) + past_values.append(past_value) + else: + past_keys = [None] * num_layers + past_values = [None] * num_layers + + if input_layernorm_weights is None: + input_layernorm_weights = [] + post_attn_layernorm_weights = [] + for i in range(num_layers): + input_layernorm_weights.append( + self.create_input_op( + ( + 1, + self.hidden_size, + ) + ) + ) + post_attn_layernorm_weights.append( + self.create_input_op( + ( + 1, + self.hidden_size, + ) + ) + ) + else: + input_layernorm_weights = [self.constant(w) for w in input_layernorm_weights] + post_attn_layernorm_weights = [self.constant(w) for w in post_attn_layernorm_weights] + + if q_biases is None: + q_biases = [] + k_biases = [] + v_biases = [] + for i in range(num_layers): + q_biases.append(self.create_input_op((self.num_heads * self.head_dim,))) + k_biases.append(self.create_input_op((self.num_key_value_heads * self.head_dim,))) + v_biases.append(self.create_input_op((self.num_key_value_heads * self.head_dim,))) + else: + q_biases = [self.constant(w) for w in q_biases] + k_biases = [self.constant(w) for w in k_biases] + v_biases = [self.constant(w) for w in v_biases] + + hidden_states = input + + curr_key_values = [] + for i in range(num_layers): + hidden_states, new_key_states, new_value_states = self.build_decoder( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + input_layernorm_weight=input_layernorm_weights[i], + post_attention_layernorm_weight=post_attn_layernorm_weights[i], + q_bias=q_biases[i], + k_bias=k_biases[i], + v_bias=v_biases[i], + past_key=past_keys[i], + past_value=past_values[i], + ) + curr_key_values.append((new_key_states, new_value_states)) + + # define outputs + hidden_states = self.convert_to_fp16(hidden_states) + + for 
i in range(num_layers): + new_key_states = self.convert_to_fp16(curr_key_values[i][0]) + new_value_states = self.convert_to_fp16(curr_key_values[i][1]) + + print("start compiling") + self.compile() + print("end compiling") + + def mlp(self, hidden_states, seq_len): + mm1 = self.linear( + hidden_states, self.intermediate_size, self.hidden_size, bias=False, wt_dtype=self.dtype + ) + mm2 = self.linear( + hidden_states, self.intermediate_size, self.hidden_size, bias=False, wt_dtype=self.dtype + ) # type: ignore[attr-defined] + mm1 = self.eltwise_mul(self.swish(mm1), mm2) # type: ignore[attr-defined] + if self.intermediate_size == 18944: + # for qwen2-7b + mm1_0 = self.slice(mm1, begin=[0, 0, 0], end=[1, seq_len, 9472]) + mm1_1 = self.slice(mm1, begin=[0, 0, 9472], end=[1, seq_len, 18944]) + hidden_states_0 = self.linear(mm1_0, self.hidden_size, 9472, + bias=False, wt_dtype=self.dtype) + hidden_states_1 = self.linear(mm1_1, self.hidden_size, 9472, + bias=False, wt_dtype=self.dtype) + hidden_states = hidden_states_0 + hidden_states_1 + else: + hidden_states = self.linear( + mm1, self.hidden_size, self.intermediate_size, bias=False, wt_dtype=self.dtype + ) + return hidden_states + + def build_decoder( + self, + hidden_states, + attention_mask, + position_ids, + input_layernorm_weight, + post_attention_layernorm_weight, + q_bias, + k_bias, + v_bias, + past_key=None, + past_value=None, + ): + + residual = hidden_states + input_2d = self.reshape(hidden_states, (self.batch_size * self.seq_len, self.hidden_size)) + input_2d = self.layer_norm(input_2d, input_layernorm_weight) + attn_output, new_key_states, new_value_states = self.attention( + hidden_states=input_2d, + position_ids=position_ids, + attention_mask=attention_mask, + past_key=past_key, + past_value=past_value, + cos=self.cos, + sin=self.sin, + mode=self.mode, + num_heads=self.num_heads, + num_key_value_heads=self.num_key_value_heads, + head_dim=self.head_dim, + seq_len=self.seq_len, + q_bias=q_bias, + k_bias=k_bias, + v_bias=v_bias, + ) + hidden_states = self.eltwise_add(residual, attn_output) + residual = hidden_states + hidden_states = self.layer_norm(hidden_states, post_attention_layernorm_weight) + hidden_states = self.mlp(hidden_states, self.seq_len) + hidden_states = self.eltwise_add(residual, hidden_states) + hidden_states = self.convert_to_fp16(hidden_states) + + return hidden_states, new_key_states, new_value_states + + +class FusedQwenLowBitMultiDecoderlayer(torch.nn.Module): + + def __init__( + self, + parameters: List[Tuple[torch.Tensor]], + input_laynorm_weights: List[torch.Tensor], + post_attn_layernorm_weights: List[torch.Tensor], + q_biases: List[torch.Tensor], + k_biases: List[torch.Tensor], + v_biases: List[torch.Tensor], + layer_indexes: List[int], + intra_stages: int, + cached_cos: torch.Tensor, + cached_sin: torch.Tensor, + num_heads: int, + head_dim: int, + num_key_value_heads: int, + rms_norm_eps, + intermediate_size, + max_seq_len: int = 1024, + transpose_value: bool = False, + do_print: bool = False, + ): + super().__init__() + + self.do_print = do_print + + op_parameters = [] + for w in parameters: + if isinstance(w, tuple): # from QuantizedLinear + op_parameters.append((w[0].numpy(), w[1].numpy())) + else: + op_parameters.append(w.to(torch.float16).numpy()) + self.op_parameters = op_parameters + self.op_id = str(uuid.uuid4()) + self.max_seq_len = max_seq_len + self.transpose_value = transpose_value + if isinstance(parameters[0], tuple): + np_dtype = np.int8 if parameters[0][0].dtype == torch.int8 else np.uint8 + 
else: # FP16 Linear + np_dtype = np.float16 + + self.intra_stages = intra_stages + self.layer_indexes = layer_indexes + num_layers = len(self.layer_indexes) // intra_stages + self.layer_ranges = [] + for i in range(intra_stages): + if i == intra_stages - 1: + self.layer_ranges.append((i * num_layers, len(self.layer_indexes))) + else: + self.layer_ranges.append((i * num_layers, (i + 1) * num_layers)) + + self.backend_decoders = [] + + for i in range(intra_stages): + start, end = self.layer_ranges[i] + lm_0 = input_laynorm_weights[start:end] + lm_1 = post_attn_layernorm_weights[start:end] + decoder = LowBitQwenMultiDecoderlayer( + [1, 1, num_heads * head_dim], + input_layernorm_weights=lm_0, + post_attn_layernorm_weights=lm_1, + q_biases=q_biases[start:end], + k_biases=k_biases[start:end], + v_biases=v_biases[start:end], + cached_cos=cached_cos, + cached_sin=cached_sin, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + num_layers=end - start, + max_seq_len=max_seq_len, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + mode="decode", + transpose_value=self.transpose_value, + dtype=np_dtype, + ) + self.backend_decoders.append(decoder) + + offset = 0 + for i in range(intra_stages): + start, end = self.layer_ranges[i] + curr_linear_ops = len(self.backend_decoders[i].linear_ops) + curr_parameters = self.op_parameters[offset:offset + curr_linear_ops] + self.backend_decoders[i].set_weights(self.op_id, curr_parameters) + offset = offset + curr_linear_ops + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> torch.Tensor: + + inputs = ( + hidden_states.to(torch.float16), + attention_mask, + position_ids, + ) + + for i in range(self.intra_stages): + start, end = self.layer_ranges[i] + self.backend_decoders[i].update_cache(past_key_value, self.layer_indexes[start:end]) + + hidden_states, new_keys, new_values = LowBitQwenMultiDecoderlayer.run_decoders( + inputs, + decoders=self.backend_decoders) + + if self.do_print: + print("outputs:", hidden_states) + + outputs = (hidden_states,) + outputs += (past_key_value, new_keys, new_values) + return outputs + + def post_forward(self, past_key_value, new_keys, new_values): + key_value_states = [] + for i in range(self.intra_stages): + for j in range(1, len(self.backend_decoders[i].torch_out)): + key_value_states.append(self.backend_decoders[i].torch_out[j]) + + cache_kwargs = { + "max_seq_len": self.max_seq_len, + "transpose": self.transpose_value, + } + for i in range(len(self.layer_indexes)): + key_states, value_states = past_key_value.update( + new_keys[i], + new_values[i], + self.layer_indexes[i], + cache_kwargs, + ) + + for i in range(self.intra_stages): + self.backend_decoders[i].load_cache_async() + + +class FusedQwenLowBitDecoderlayer(torch.nn.Module): + def __init__( + self, + parameters: List[torch.Tensor], + cached_cos, + cached_sin, + layer_norm_0, + layer_norm_1, + q_bias, + k_bias, + v_bias, + num_heads: int, + num_key_value_heads: int, + layer_idx: int, + rms_norm_eps, + intermediate_size, + max_seq_len: int = 128, + transpose_value: bool = False, + ): + super().__init__() + self.op_parameters = parameters + self.op_id = str(uuid.uuid4()) + self.layer_idx = layer_idx + self.max_seq_len = max_seq_len + self.transpose_value = transpose_value + # self.rotary_emb = rotary_emb + if 
isinstance(parameters[0], tuple): # weight, scale from QuantizedLinear + np_dtype = np.int8 if parameters[0][0].dtype == torch.int8 else np.uint8 + else: # FP16 Linear + np_dtype = np.float16 + + self.backend_cls_prefill = partial( + LowBitQwenMultiDecoderlayer, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + num_layers=1, + cached_cos=cached_cos, + cached_sin=cached_sin, + input_layernorm_weights=None, + post_attn_layernorm_weights=None, + max_seq_len=max_seq_len, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + mode="prefill", + transpose_value=self.transpose_value, + dtype=np_dtype, + ) + self.layer_norm_0 = layer_norm_0 + self.layer_norm_1 = layer_norm_1 + self.q_bias = q_bias + self.k_bias = k_bias + self.v_bias = v_bias + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> torch.Tensor: + """Torch module forward method. + + Args: + x (torch.Tensor): Input tensor + + Returns: + torch.Tensor: result + """ + + seq_len = hidden_states.shape[1] + + backend_cls = self.backend_cls_prefill + inputs = (hidden_states.to(torch.float16), attention_mask, position_ids) + inputs += (self.layer_norm_0, self.layer_norm_1) + inputs += (self.q_bias, self.k_bias, self.v_bias) + hidden_states, past_key, past_value = run_model( + inputs, self.op_parameters, backend_cls, self.op_id, replica=2 + ) + cache_kwargs = {"max_seq_len": self.max_seq_len, "transpose": self.transpose_value} + key_states, value_states = past_key_value.update( + past_key, past_value, self.layer_idx, cache_kwargs + ) + + outputs = (hidden_states,) + outputs += (past_key_value,) + return outputs + + +def run_decode( + model, + rank, + world_size, + port, + layer_start, + layer_end, + intra_stages, + max_seq_len, + transpose_value_cache, + input_queue, + result_queue, +): + + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = port + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + + print("start init process group, rank: ", rank, "world_size: ", world_size) + + dist.init_process_group() + my_rank = dist.get_rank() + my_size = dist.get_world_size() + logger.info(f"rank: {my_rank}, size: {my_size}") + + num_heads = model.model.layers[layer_start].self_attn.num_heads + num_key_value_heads = model.model.layers[layer_start].self_attn.num_key_value_heads + head_dim = model.model.layers[layer_start].self_attn.head_dim + rms_norm_eps = model.config.rms_norm_eps + intermediate_size = model.config.intermediate_size + deocderlayers = [] + layer_weights = [] + input_layer_norm_weights = [] + post_attn_layernorm_weights = [] + q_biases = [] + k_biases = [] + v_biases = [] + layer_indexs = range(layer_start, layer_end) + for layer_idx in layer_indexs: + curr_layer = model.model.layers[layer_idx] + attn_layer = curr_layer.self_attn + mlp_layer = curr_layer.mlp + + if model.config.intermediate_size == 8960: + # for qwen2-1.5b + weights = [ + (attn_layer.q_proj.weight, attn_layer.q_proj.scale), + (attn_layer.k_proj.weight, attn_layer.k_proj.scale), + (attn_layer.v_proj.weight, attn_layer.v_proj.scale), + (attn_layer.o_proj.weight, attn_layer.o_proj.scale), + (mlp_layer.gate_proj.weight, mlp_layer.gate_proj.scale), + (mlp_layer.up_proj.weight, mlp_layer.up_proj.scale), + (mlp_layer.down_proj.weight, mlp_layer.down_proj.scale), + ] + elif 
model.config.intermediate_size == 18944: + # for qwen2-7b + weights = [ + (attn_layer.q_proj.weight, attn_layer.q_proj.scale), + (attn_layer.k_proj.weight, attn_layer.k_proj.scale), + (attn_layer.v_proj.weight, attn_layer.v_proj.scale), + (attn_layer.o_proj.weight, attn_layer.o_proj.scale), + (mlp_layer.gate_proj.weight, mlp_layer.gate_proj.scale), + (mlp_layer.up_proj.weight, mlp_layer.up_proj.scale), + (mlp_layer.down_proj_0.weight, mlp_layer.down_proj_0.scale), + (mlp_layer.down_proj_1.weight, mlp_layer.down_proj_1.scale) + ] + + cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) + cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) + layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16) + layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16) + + layer_weights.extend(weights) + input_layer_norm_weights.append(layer_norm_0) + post_attn_layernorm_weights.append(layer_norm_1) + q_biases.append(attn_layer.q_proj.bias.to(torch.float16)) + k_biases.append(attn_layer.k_proj.bias.to(torch.float16)) + v_biases.append(attn_layer.v_proj.bias.to(torch.float16)) + + multi_decoder = FusedQwenLowBitMultiDecoderlayer( + parameters=layer_weights, + input_laynorm_weights=input_layer_norm_weights, + post_attn_layernorm_weights=post_attn_layernorm_weights, + q_biases=q_biases, + k_biases=k_biases, + v_biases=v_biases, + layer_indexes=layer_indexs, + intra_stages=intra_stages, + cached_cos=cached_cos, + cached_sin=cached_sin, + num_heads=num_heads, + head_dim=head_dim, + num_key_value_heads=num_key_value_heads, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + max_seq_len=max_seq_len, + transpose_value=transpose_value_cache, + do_print=False, + ) + + dist.barrier() + + past_key_values = None + + control = torch.empty((), dtype=torch.int) + hidden_states = torch.empty((1, 1, head_dim * num_heads), dtype=torch.float16) + with torch.inference_mode(): + while True: + + dist.broadcast(control, src=0) + if control.item() == -2: + break + elif control.item() == -1: + past_key_values = input_queue.get() + else: + t0 = time.perf_counter() + past_seen_tokens = past_key_values.get_seq_length() + attention_mask = torch.ones([1, past_seen_tokens + 1], dtype=torch.int64) + position_ids = torch.arange( + past_seen_tokens, + 1 + past_seen_tokens, + dtype=torch.long, + device=hidden_states.device, + ) + position_ids = position_ids.unsqueeze(0).view(-1, 1) + + from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask + + causal_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (hidden_states.shape[0], hidden_states.shape[1]), + hidden_states, + past_seen_tokens, + sliding_window=model.model.config.sliding_window, + ) + pad_len = multi_decoder.max_seq_len + 1 - causal_mask.size(-1) + + causal_mask[:, :, :, -1] = torch.finfo(torch.float16).min + pad_mask = (0, pad_len) + padded_causal_mask = F.pad( + causal_mask.to(torch.float16), pad_mask, value=torch.finfo(torch.float16).min + ) + padded_causal_mask[:, :, :, -1] = 0.0 + dist.recv(hidden_states, src=rank - 1) + t1 = time.perf_counter() + layer_outputs = multi_decoder( + hidden_states, + attention_mask=padded_causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=False, + use_cache=True, + ) + t2 = time.perf_counter() + hidden_states = layer_outputs[0] + t3 = time.perf_counter() + dist.send(hidden_states, dst=(rank + 1) % world_size) + t4 = time.perf_counter() + past_key_values = layer_outputs[1] + 
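The control-tensor handshake between rank 0 and the decode workers above is small but central to the loop; the following self-contained sketch (the helper name and return strings are illustrative, not part of the patch) spells out the three cases a worker distinguishes after each broadcast.

import torch

def handle_control(control: torch.Tensor) -> str:
    # -2: shut the worker down; -1: a fresh past_key_values object follows on
    # the input queue; any other value: run one decode step on the received
    # hidden states.
    if control.item() == -2:
        return "shutdown"
    if control.item() == -1:
        return "reload_kv_cache"
    return "decode_step"

assert handle_control(torch.tensor(-2, dtype=torch.int)) == "shutdown"
assert handle_control(torch.tensor(-1, dtype=torch.int)) == "reload_kv_cache"
assert handle_control(torch.tensor(0, dtype=torch.int)) == "decode_step"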
new_keys = layer_outputs[2] + new_values = layer_outputs[3] + multi_decoder.post_forward(past_key_values, new_keys, new_values) + + +class DecodeRunner: + def __init__(self, model, max_seq_len, intra_pp=2, inter_pp=2, transpose_value_cache=True): + self.model = model + self.max_seq_len = max_seq_len + self.transpose_value_cache = transpose_value_cache + world_size = inter_pp + 1 + intra_stages = intra_pp + num_layers = self.model.model.config.num_hidden_layers + + port = "54791" + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = port + os.environ["RANK"] = "0" + os.environ["WORLD_SIZE"] = str(world_size) + + self.input_queues = [] + self.output_queues = [] + self.decoder_processes = [] + + for rank in range(1, world_size): + input_q = mp.Queue() + output_q = mp.Queue() + start_layer = (rank - 1) * (num_layers // (world_size - 1)) + end_layer = (rank) * (num_layers // (world_size - 1)) + if rank == world_size - 1: + end_layer = num_layers + p = mp.Process( + target=run_decode, + args=( + self.model, + rank, + world_size, + port, + start_layer, + end_layer, + intra_stages, + self.max_seq_len, + self.transpose_value_cache, + input_q, + output_q, + ), + ) + p.daemon = True + p.start() + self.input_queues.append(input_q) + self.output_queues.append(output_q) + self.decoder_processes.append(p) + + dist.init_process_group() + my_rank = dist.get_rank() + self.world_size = dist.get_world_size() + logger.info(f"rank: {my_rank}, size: {self.world_size}") + + dist.barrier() + self.cache_past_key_value = None + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + t0 = time.perf_counter() + + if self.cache_past_key_value != past_key_value: + control = torch.tensor(-1, dtype=torch.int) + dist.broadcast(control, src=0) + for i in range(len(self.decoder_processes)): + self.input_queues[i].put(past_key_value) + + control = torch.tensor(0, dtype=torch.int) + dist.broadcast(control, src=0) + hidden_states = hidden_states.to(torch.float16) + dist.send(hidden_states, dst=1) + past_key_value.expand(self.transpose_value_cache) + dist.recv(hidden_states, src=self.world_size - 1) + t1 = time.perf_counter() + return hidden_states, past_key_value + + def shutdown(self): + control = torch.tensor(-2, dtype=torch.int) + dist.broadcast(control, src=0) + for p in self.decoder_processes: + p.join(3) + for p in self.decoder_processes: + if p.exitcode is None: + p.kill() + + def __del__(self): + self.shutdown() + + +def run_prefill( + model, max_output_len, max_prompt_len, transpose_value_cache, input_queue, result_queue +): + + layer_start = 0 + layer_end = len(model.model.layers) + num_heads = model.model.layers[layer_start].self_attn.num_heads + num_key_value_heads = model.model.layers[layer_start].self_attn.num_key_value_heads + head_dim = model.model.layers[layer_start].self_attn.head_dim + rms_norm_eps = model.config.rms_norm_eps + intermediate_size = model.config.intermediate_size + deocderlayers = [] + layer_weights = [] + input_layer_norm_weights = [] + post_attn_layernorm_weights = [] + layer_indexs = range(layer_start, layer_end) + for layer_idx in layer_indexs: + curr_layer = model.model.layers[layer_idx] + attn_layer = curr_layer.self_attn + mlp_layer = curr_layer.mlp + + if model.config.intermediate_size == 8960: + # for qwen2-1.5b + weights = [ + 
(attn_layer.q_proj.weight, attn_layer.q_proj.scale), + (attn_layer.k_proj.weight, attn_layer.k_proj.scale), + (attn_layer.v_proj.weight, attn_layer.v_proj.scale), + (attn_layer.o_proj.weight, attn_layer.o_proj.scale), + (mlp_layer.gate_proj.weight, mlp_layer.gate_proj.scale), + (mlp_layer.up_proj.weight, mlp_layer.up_proj.scale), + (mlp_layer.down_proj.weight, mlp_layer.down_proj.scale), + ] + elif model.config.intermediate_size == 18944: + # for qwen2-7b + weights = [ + (attn_layer.q_proj.weight, attn_layer.q_proj.scale), + (attn_layer.k_proj.weight, attn_layer.k_proj.scale), + (attn_layer.v_proj.weight, attn_layer.v_proj.scale), + (attn_layer.o_proj.weight, attn_layer.o_proj.scale), + (mlp_layer.gate_proj.weight, mlp_layer.gate_proj.scale), + (mlp_layer.up_proj.weight, mlp_layer.up_proj.scale), + (mlp_layer.down_proj_0.weight, mlp_layer.down_proj_0.scale), + (mlp_layer.down_proj_1.weight, mlp_layer.down_proj_1.scale) + ] + + cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) + cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) + + layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16) + layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16) + + new_decoderlayer = FusedQwenLowBitDecoderlayer( + weights, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + cached_cos=cached_cos, + cached_sin=cached_sin, + layer_norm_0=layer_norm_0, + layer_norm_1=layer_norm_1, + q_bias=attn_layer.q_proj.bias.to(torch.float16), + k_bias=attn_layer.k_proj.bias.to(torch.float16), + v_bias=attn_layer.v_proj.bias.to(torch.float16), + layer_idx=layer_idx, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + max_seq_len=max_output_len, + transpose_value=transpose_value_cache, + ) + + layer_weights.extend(weights) + input_layer_norm_weights.append(layer_norm_0) + post_attn_layernorm_weights.append(layer_norm_1) + model.model.layers[layer_idx] = new_decoderlayer + deocderlayers.append(new_decoderlayer) + + print("finish creating all decode layers in prefill") + result_queue.put("loading finish") + + while True: + + result = input_queue.get() + if result == "stop": + break + + hidden_states, position_ids, causal_mask, past_key_values = result + with torch.inference_mode(): + for decoder_layer in deocderlayers: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=False, + use_cache=True, + ) + + hidden_states = layer_outputs[0] + next_decoder_cache = layer_outputs[1] + result_queue.put((hidden_states, next_decoder_cache)) + + +class PrefillRunner: + def __init__(self, model, max_output_len, max_prompt_len, transpose_value_cache): + self.model = model + self.max_output_len = max_output_len + self.max_prompt_len = max_prompt_len + self.transpose_value_cache = transpose_value_cache + + self.prefill_result_queue = mp.Queue() + self.prefill_input_queue = mp.Queue() + + self.p = mp.Process( + target=run_prefill, + args=( + model, + max_output_len, + max_prompt_len, + transpose_value_cache, + self.prefill_input_queue, + self.prefill_result_queue, + ), + ) + self.p.daemon = True + self.p.start() + output = self.prefill_result_queue.get() + print(Fore.GREEN + f"prefill process output: {output}") + print(Style.RESET_ALL) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = 
None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + seq_len = hidden_states.size(1) + invalidInputError( + seq_len <= self.max_prompt_len, + ( + f"seq_len: {seq_len} should be less than or equal" + " to max_prompt_len {self.max_prompt_len}" + ), + ) + self.prefill_input_queue.put((hidden_states, position_ids, attention_mask, past_key_value)) + return self.prefill_result_queue.get() + + def shutdown(self): + self.prefill_input_queue.put("stop") + self.p.join(3) + if self.p.exitcode is None: + self.p.kill() + + def __del__(self): + self.shutdown() + + +def gen_qwen2_fused_model_forward(prefill_runner, decode_runner): + + def qwen2_fused_model_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions if output_attentions is not None else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + invalidInputError(False, + "You cannot specify both decoder_input_ids and " + "decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + invalidInputError(False, + "You have to specify either input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + past_key_values_length = 0 + + from ipex_llm.transformers.npu_models.kv import DynamicFusedNormalCache + + if use_cache and not isinstance(past_key_values, DynamicFusedNormalCache): + past_key_values = DynamicFusedNormalCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_seq_length() + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask + + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + if seq_length == 1: + layers_runner = decode_runner + else: + layers_runner = prefill_runner + layer_outputs = layers_runner.forward( 
+ hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + next_decoder_cache = layer_outputs[1] + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + return qwen2_fused_model_forward + + +def qwen2_casullm_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, +) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None \ + else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + # cache_position=cache_position, + ) + + hidden_states = outputs[0] + # ipex-llm change start + hidden_states = reshape_lm_head_input(hidden_states) + # ipex-llm change end + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/python/llm/src/ipex_llm/transformers/pipeline_parallel.py b/python/llm/src/ipex_llm/transformers/pipeline_parallel.py index 8202b3ee0ee..87167d81573 100644 --- a/python/llm/src/ipex_llm/transformers/pipeline_parallel.py +++ b/python/llm/src/ipex_llm/transformers/pipeline_parallel.py @@ -800,7 +800,7 @@ async def stream_output(self, cur_batch, tokenizer, next_ids): 
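The loss path in qwen2_casullm_forward above uses the standard shifted next-token convention; a minimal standalone sketch with hypothetical shapes:

import torch
from torch.nn import CrossEntropyLoss

vocab_size = 16
logits = torch.randn(1, 5, vocab_size)          # [batch, seq, vocab]
labels = torch.randint(0, vocab_size, (1, 5))   # [batch, seq]

# Logits at position t are scored against the label at position t + 1.
shift_logits = logits[..., :-1, :].contiguous().view(-1, vocab_size)
shift_labels = labels[..., 1:].contiguous().view(-1)
loss = CrossEntropyLoss()(shift_logits, shift_labels)
print(loss.item())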
_stream_tasks.append(self.streamer[request_id].put((remain, printable_text))) await asyncio.gather(*_stream_tasks) - async def process_step(self, tokenizer, result_dict): + async def process_step(self, tokenizer, result_dict, processor=None): cur_batch = None torch.xpu.synchronize(self.device) if self.rank == 0: diff --git a/python/llm/src/ipex_llm/transformers/speculative.py b/python/llm/src/ipex_llm/transformers/speculative.py index 6d2e08423ad..4600e99fefc 100644 --- a/python/llm/src/ipex_llm/transformers/speculative.py +++ b/python/llm/src/ipex_llm/transformers/speculative.py @@ -460,12 +460,16 @@ def _check_and_extend_kv_cache(past_key_values, max_step_draft, kv_alloc_block_l def _crop_past_key_values(self, past_key_values, new_cache_size, _enable_ipex=False): if version.parse(trans_version) >= version.parse("4.36.0"): - from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicNormalCache - if isinstance(past_key_values, (DynamicFp8Cache, DynamicNormalCache)): + from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicNormalCache,\ + DynamicCompressCache + if isinstance(past_key_values, (DynamicFp8Cache, DynamicNormalCache, + DynamicCompressCache)): if hasattr(past_key_values, "_seen_tokens"): past_key_values._seen_tokens -= new_cache_size else: past_key_values.seen_tokens -= new_cache_size + if isinstance(past_key_values, DynamicCompressCache): + past_key_values.real_kv_len -= new_cache_size for i, k in enumerate(past_key_values.key_cache): past_key_values.key_cache[i] = k[:, :, :-new_cache_size, :] @@ -489,12 +493,19 @@ def _crop_past_key_values(self, past_key_values, new_cache_size, _enable_ipex=Fa for k, v in past_key_values ] elif self.config.model_type == "chatglm": - # for chatglm, cache shape is [sl, bs, nh, hn] - past_key_values = [ - (k[:-(new_cache_size), :, :, :], - v[:-(new_cache_size), :, :, :]) - for k, v in past_key_values - ] + if self.config.num_layers == 40 and hasattr(self.config, 'rope_ratio'): + past_key_values = [ + (k[:, :, :-(new_cache_size), :], + v[:, :, :-(new_cache_size), :]) + for k, v in past_key_values + ] + else: + # for chatglm, cache shape is [sl, bs, nh, hn] + past_key_values = [ + (k[:-(new_cache_size), :, :, :], + v[:-(new_cache_size), :, :, :]) + for k, v in past_key_values + ] elif self.config.model_type in ["baichuan", "gptj"]: past_key_values = [ (k[:, :, :-(new_cache_size), :], diff --git a/python/llm/src/ipex_llm/transformers/utils.py b/python/llm/src/ipex_llm/transformers/utils.py index cf8c561232f..5cd706c2e5b 100644 --- a/python/llm/src/ipex_llm/transformers/utils.py +++ b/python/llm/src/ipex_llm/transformers/utils.py @@ -382,3 +382,16 @@ def check_hidden_size(qtype, hidden_size): "required for fq6_k - using fallback quantization fp6.") return ggml_tensor_qtype["fp6"] return qtype + + +# Arc platfrom does not support FP64, +# Disable FP64 in DeepSpeedZeroOptimizer_Stage3's _constant_buffered_norm2 method +# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/zero/stage3.py#L1365 +def _constant_buffered_norm2(self, input, buffer_size=250000000): + norm = None + for part in input.view(-1).split(buffer_size): + if norm is None: + norm = part.data.norm(2)**2.0 + else: + norm += part.data.norm(2)**2.0 + return norm**0.5 diff --git a/python/llm/src/ipex_llm/utils/benchmark_util_4_29.py b/python/llm/src/ipex_llm/utils/benchmark_util_4_29.py index d64631f1f4c..8e74b4507c5 100644 --- a/python/llm/src/ipex_llm/utils/benchmark_util_4_29.py +++ b/python/llm/src/ipex_llm/utils/benchmark_util_4_29.py @@ -2452,7 +2452,7 @@ def 
greedy_search( last_token_time.append(end - st) # stop if we exceed the maximum length - if stopping_criteria(input_ids, scores): + if stopping_criteria(input_ids, scores)[0]: this_peer_finished = True if this_peer_finished and not synced_gpus: diff --git a/python/llm/src/ipex_llm/vllm/cpu/entrypoints/openai/api_server.py b/python/llm/src/ipex_llm/vllm/cpu/entrypoints/openai/api_server.py index 5065f1c1ab1..c9cae077cc2 100644 --- a/python/llm/src/ipex_llm/vllm/cpu/entrypoints/openai/api_server.py +++ b/python/llm/src/ipex_llm/vllm/cpu/entrypoints/openai/api_server.py @@ -114,7 +114,6 @@ async def create_chat_completion(request: ChatCompletionRequest, return StreamingResponse(content=generator, media_type="text/event-stream") else: - invalidInputError(isinstance(generator, ChatCompletionResponse)) return JSONResponse(content=generator.model_dump()) diff --git a/python/llm/src/ipex_llm/vllm/cpu/model_convert.py b/python/llm/src/ipex_llm/vllm/cpu/model_convert.py index ff6515426d8..164cd1d4228 100644 --- a/python/llm/src/ipex_llm/vllm/cpu/model_convert.py +++ b/python/llm/src/ipex_llm/vllm/cpu/model_convert.py @@ -254,8 +254,8 @@ def _ipex_llm_load_model(self) -> None: scheduler_config=self.scheduler_config) return - _model_mlp_convert() - _model_attention_convert() + # _model_mlp_convert() + # _model_attention_convert() self.model = get_model( model_config=self.model_config, diff --git a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py index 85719ccdc84..065652e7162 100644 --- a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py +++ b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py @@ -30,7 +30,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.parallel_utils.communication_op import tensor_model_parallel_gather -from typing import Tuple, Optional +from typing import Tuple, Optional, Union from ipex_llm.utils.common import invalidInputError from vllm.sequence import SamplerOutput @@ -51,8 +51,10 @@ def _Qwen2_sample( sampling_metadata: SamplingMetadata, ) -> Optional[SamplerOutput]: if self.config.tie_word_embeddings: - lm_head_weight = self.model.embed_tokens + # Embedding layer is not optimized to LowBitLinear + lm_head_weight = self.model.embed_tokens.weight else: + # This layer is optimized to LowBitLinear lm_head_weight = self.lm_head next_tokens = self.sampler(lm_head_weight, hidden_states, sampling_metadata) @@ -70,9 +72,15 @@ def _Chatglm_sample( return next_tokens -def _sample_get_logits(self, hidden_states: torch.Tensor, embedding: torch.nn.Module, +def _sample_get_logits(self, hidden_states: torch.Tensor, + embedding: Union[torch.nn.Module, torch.Tensor], embedding_bias: Optional[torch.Tensor]) -> torch.Tensor: - logits = embedding(hidden_states) + # For tie_word_embedding models, the embedding is not optimized as + # the low_bit_linear layer... 
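A brief, standalone illustration (hypothetical sizes) of the dispatch just below: with tie_word_embeddings the sampler receives the raw embedding weight tensor, so logits come from an explicit matmul against its transpose, while the untied case keeps calling the low-bit lm_head module. The Linear here is only a stand-in for that module.

import torch

hidden_states = torch.randn(4, 32)
embedding_weight = torch.randn(100, 32)           # vocab x hidden (tied-embedding case)
lm_head = torch.nn.Linear(32, 100, bias=False)    # stand-in for the optimized lm_head

logits_tied = torch.matmul(hidden_states, embedding_weight.t())
logits_untied = lm_head(hidden_states)
print(logits_tied.shape, logits_untied.shape)     # both: torch.Size([4, 100])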
+ if isinstance(embedding, torch.Tensor): + logits = torch.matmul(hidden_states, embedding.t()) + else: + logits = embedding(hidden_states) if embedding_bias is not None: logits += embedding_bias logits = tensor_model_parallel_gather(logits) @@ -225,8 +233,8 @@ def _ipex_llm_convert(load_in_low_bit): def get_load_function(low_bit): def _ipex_llm_load_model(self) -> None: - _model_mlp_convert() - _model_attention_convert() + # _model_mlp_convert() + # _model_attention_convert() _model_sample_convert() from vllm.utils import measure_device_memory @@ -240,7 +248,16 @@ def _ipex_llm_load_model(self) -> None: parallel_config=self.parallel_config, scheduler_config=self.scheduler_config) from ipex_llm import optimize_model - optimize_model(self.model, low_bit=low_bit, torch_dtype=self.model_config.dtype) + import os + not_convert_last_mlp = os.getenv("IPEX_LLM_NOT_CONVERT_LAST_MLP", None) + is_glm4_model = "glm-4" in self.model_config.model.lower() + if not_convert_last_mlp is not None or is_glm4_model: + # only used to avoid NaN values in the last MLP forward when running glm-4-9b-chat + modules = ["35.mlp", "36.mlp", "37.mlp", "38.mlp", "39.mlp"] + else: + modules = None + optimize_model(self.model, low_bit=low_bit, torch_dtype=self.model_config.dtype, + modules_to_not_convert=modules) self.model = self.model.to(device=self.device_config.device, dtype=self.model_config.dtype) diff --git a/python/llm/test/benchmark/arc-perf-test-batch2.yaml b/python/llm/test/benchmark/arc-perf-test-batch2.yaml deleted file mode 100644 index 70447fd7f59..00000000000 --- a/python/llm/test/benchmark/arc-perf-test-batch2.yaml +++ /dev/null @@ -1,30 +0,0 @@ -repo_id: - - 'meta-llama/Llama-2-7b-chat-hf' - - 'meta-llama/Llama-2-13b-chat-hf' - - 'THUDM/chatglm3-6b-4bit' - - 'baichuan-inc/Baichuan2-7B-Chat' - - 'baichuan-inc/Baichuan2-13B-Chat-4bit' - - 'THUDM/glm-4-9b-chat' - - 'openbmb/MiniCPM-2B-sft-bf16' - - 'Qwen/Qwen-VL-Chat' - #- 'SmerkyG/rwkv-5-world-7b' #this model only fp32 is supported for now, fp16 and bf16 are not supported - - '01-ai/Yi-6B-Chat' - - 'mistralai/Mistral-7B-Instruct-v0.2' - - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5' - - '01-ai/Yi-1.5-6B-Chat' -local_model_hub: '/mnt/disk1/models' -warm_up: 1 -num_trials: 3 -num_beams: 1 # default to greedy search -low_bit: 'sym_int4' # default to use 'sym_int4' (i.e.
symmetric int4) -batch_size: 2 # default to 1 -in_out_pairs: - - '32-32' - - '1024-128' - - '2048-256' -test_api: - - "transformer_int4_fp16_gpu" # on Intel GPU -cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) -exclude: - - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048' -task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' diff --git a/python/llm/test/benchmark/arc-perf-test-batch4.yaml b/python/llm/test/benchmark/arc-perf-test-batch4.yaml deleted file mode 100644 index 3bfd47963a4..00000000000 --- a/python/llm/test/benchmark/arc-perf-test-batch4.yaml +++ /dev/null @@ -1,36 +0,0 @@ -repo_id: - - 'meta-llama/Llama-2-7b-chat-hf' - - 'meta-llama/Llama-2-13b-chat-hf' - - 'THUDM/chatglm3-6b-4bit' - - 'baichuan-inc/Baichuan2-7B-Chat' - - 'baichuan-inc/Baichuan2-13B-Chat-4bit' - - 'THUDM/glm-4-9b-chat' - - 'openbmb/MiniCPM-2B-sft-bf16' - - 'Qwen/Qwen-VL-Chat' - #- 'SmerkyG/rwkv-5-world-7b' #this model only fp32 is supported for now, fp16 and bf16 are not supported - - '01-ai/Yi-6B-Chat' - - 'mistralai/Mistral-7B-Instruct-v0.2' - - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5' - - '01-ai/Yi-1.5-6B-Chat' -local_model_hub: '/mnt/disk1/models' -warm_up: 1 -num_trials: 3 -num_beams: 1 # default to greedy search -low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) -batch_size: 4 # default to 1 -in_out_pairs: - - '32-32' - - '1024-128' - - '2048-256' -test_api: - - "transformer_int4_fp16_gpu" # on Intel GPU -cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) -exclude: - - 'meta-llama/Llama-2-13b-chat-hf:2048' - - 'baichuan-inc/Baichuan2-7B-Chat:2048' - - 'baichuan-inc/Baichuan2-13B-Chat-4bit:1024' - - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048' - - 'Qwen/Qwen-VL-Chat:2048' -# - 'fnlp/moss-moon-003-sft-4bit:1024' -# - 'fnlp/moss-moon-003-sft-4bit:2048' -task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml deleted file mode 100644 index 890b8dbf470..00000000000 --- a/python/llm/test/benchmark/arc-perf-test.yaml +++ /dev/null @@ -1,32 +0,0 @@ -repo_id: - - 'meta-llama/Llama-2-7b-chat-hf' - - 'meta-llama/Llama-2-13b-chat-hf' - - 'THUDM/chatglm3-6b-4bit' - - 'baichuan-inc/Baichuan2-7B-Chat' - - 'baichuan-inc/Baichuan2-13B-Chat-4bit' - - 'THUDM/glm-4-9b-chat' - - 'openbmb/MiniCPM-2B-sft-bf16' - - 'Qwen/Qwen-VL-Chat' - #- 'SmerkyG/rwkv-5-world-7b' #this model only fp32 is supported for now, fp16 and bf16 are not supported - - '01-ai/Yi-6B-Chat' - - 'mistralai/Mistral-7B-Instruct-v0.2' - - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5' - - '01-ai/Yi-1.5-6B-Chat' -local_model_hub: '/mnt/disk1/models' -warm_up: 1 -num_trials: 3 -num_beams: 1 # default to greedy search -low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) -batch_size: 1 # default to 1 -in_out_pairs: - - '32-32' - - '1024-128' - - '2048-256' -test_api: - - "transformer_int4_fp16_gpu" # on Intel GPU -cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) -exclude: -# - 'fnlp/moss-moon-003-sft-4bit:1024' -# - 'fnlp/moss-moon-003-sft-4bit:2048' - - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048' -task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' diff --git a/python/llm/test/benchmark/arc-perf-transformers-436-batch2.yaml b/python/llm/test/benchmark/arc-perf-transformers-436-batch2.yaml new file mode 100644 index 00000000000..42ef79f344c --- /dev/null +++ b/python/llm/test/benchmark/arc-perf-transformers-436-batch2.yaml @@ -0,0 +1,16 @@ +repo_id: + - 'Qwen/Qwen-VL-Chat' +local_model_hub: '/mnt/disk1/models' +warm_up: 1 +num_trials: 3 +num_beams: 1 # default to greedy search +low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) +batch_size: 2 # default to 1 +in_out_pairs: + - '32-32' + - '1024-128' + - '2048-256' +test_api: + - "transformer_int4_fp16_gpu" # on Intel GPU +cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) +task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' diff --git a/python/llm/test/benchmark/arc-perf-transformers-436-batch4.yaml b/python/llm/test/benchmark/arc-perf-transformers-436-batch4.yaml new file mode 100644 index 00000000000..606b9c6cf05 --- /dev/null +++ b/python/llm/test/benchmark/arc-perf-transformers-436-batch4.yaml @@ -0,0 +1,18 @@ +repo_id: + - 'Qwen/Qwen-VL-Chat' +local_model_hub: '/mnt/disk1/models' +warm_up: 1 +num_trials: 3 +num_beams: 1 # default to greedy search +low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) +batch_size: 4 # default to 1 +in_out_pairs: + - '32-32' + - '1024-128' + - '2048-256' +test_api: + - "transformer_int4_fp16_gpu" # on Intel GPU +cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) +exclude: + - 'Qwen/Qwen-VL-Chat:2048' +task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' diff --git a/python/llm/test/benchmark/arc-perf-transformers-436.yaml b/python/llm/test/benchmark/arc-perf-transformers-436.yaml new file mode 100644 index 00000000000..efdf14193a3 --- /dev/null +++ b/python/llm/test/benchmark/arc-perf-transformers-436.yaml @@ -0,0 +1,16 @@ +repo_id: + - 'Qwen/Qwen-VL-Chat' +local_model_hub: '/mnt/disk1/models' +warm_up: 1 +num_trials: 3 +num_beams: 1 # default to greedy search +low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) +batch_size: 1 # default to 1 +in_out_pairs: + - '32-32' + - '1024-128' + - '2048-256' +test_api: + - "transformer_int4_fp16_gpu" # on Intel GPU +cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) +task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' diff --git a/python/llm/test/benchmark/arc-perf-transformers-437-batch2.yaml b/python/llm/test/benchmark/arc-perf-transformers-437-batch2.yaml index d675d506629..9b9ab1f14ae 100644 --- a/python/llm/test/benchmark/arc-perf-transformers-437-batch2.yaml +++ b/python/llm/test/benchmark/arc-perf-transformers-437-batch2.yaml @@ -6,6 +6,18 @@ repo_id: - 'microsoft/phi-3-vision-128k-instruct' - 'Qwen/Qwen2-7B-Instruct' - 'microsoft/Phi-3-mini-128k-instruct' + - 'meta-llama/Llama-2-7b-chat-hf' + - 'meta-llama/Llama-2-13b-chat-hf' + - 'THUDM/chatglm3-6b-4bit' + - 'baichuan-inc/Baichuan2-7B-Chat' + - 'baichuan-inc/Baichuan2-13B-Chat-4bit' + - 'THUDM/glm-4-9b-chat' + - 'openbmb/MiniCPM-2B-sft-bf16' + #- 'SmerkyG/rwkv-5-world-7b' #this model only fp32 is supported for now, fp16 and bf16 are not supported + - '01-ai/Yi-6B-Chat' + - 'mistralai/Mistral-7B-Instruct-v0.2' + - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5' + - '01-ai/Yi-1.5-6B-Chat' local_model_hub: '/mnt/disk1/models' warm_up: 1 num_trials: 3 @@ -19,4 +31,6 @@ in_out_pairs: test_api: - "transformer_int4_fp16_gpu" # on Intel GPU cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) +exclude: + - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048' task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' diff --git a/python/llm/test/benchmark/arc-perf-transformers-437-batch4.yaml b/python/llm/test/benchmark/arc-perf-transformers-437-batch4.yaml index f3d55c83e35..368a8c636b5 100644 --- a/python/llm/test/benchmark/arc-perf-transformers-437-batch4.yaml +++ b/python/llm/test/benchmark/arc-perf-transformers-437-batch4.yaml @@ -6,6 +6,18 @@ repo_id: - 'microsoft/phi-3-vision-128k-instruct' - 'Qwen/Qwen2-7B-Instruct' - 'microsoft/Phi-3-mini-128k-instruct' + - 'meta-llama/Llama-2-7b-chat-hf' + - 'meta-llama/Llama-2-13b-chat-hf' + - 'THUDM/chatglm3-6b-4bit' + - 'baichuan-inc/Baichuan2-7B-Chat' + - 'baichuan-inc/Baichuan2-13B-Chat-4bit' + - 'THUDM/glm-4-9b-chat' + - 'openbmb/MiniCPM-2B-sft-bf16' + #- 'SmerkyG/rwkv-5-world-7b' #this model only fp32 is supported for now, fp16 and bf16 are not supported + - '01-ai/Yi-6B-Chat' + - 'mistralai/Mistral-7B-Instruct-v0.2' + - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5' + - '01-ai/Yi-1.5-6B-Chat' local_model_hub: '/mnt/disk1/models' warm_up: 1 num_trials: 3 @@ -22,4 +34,8 @@ cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu w exclude: - 'Qwen/Qwen1.5-7B-Chat:2048' - 'meta-llama/Meta-Llama-3-8B-Instruct:2048' -task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' \ No newline at end of file + - 'meta-llama/Llama-2-13b-chat-hf:2048' + - 'baichuan-inc/Baichuan2-7B-Chat:2048' + - 'baichuan-inc/Baichuan2-13B-Chat-4bit:1024' + - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048' +task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' diff --git a/python/llm/test/benchmark/arc-perf-transformers-437.yaml b/python/llm/test/benchmark/arc-perf-transformers-437.yaml index 1c775344c43..bca87891f6b 100644 --- a/python/llm/test/benchmark/arc-perf-transformers-437.yaml +++ b/python/llm/test/benchmark/arc-perf-transformers-437.yaml @@ -6,6 +6,18 @@ repo_id: - 
'microsoft/phi-3-vision-128k-instruct' - 'Qwen/Qwen2-7B-Instruct' - 'microsoft/Phi-3-mini-128k-instruct' + - 'meta-llama/Llama-2-7b-chat-hf' + - 'meta-llama/Llama-2-13b-chat-hf' + - 'THUDM/chatglm3-6b-4bit' + - 'baichuan-inc/Baichuan2-7B-Chat' + - 'baichuan-inc/Baichuan2-13B-Chat-4bit' + - 'THUDM/glm-4-9b-chat' + - 'openbmb/MiniCPM-2B-sft-bf16' + #- 'SmerkyG/rwkv-5-world-7b' #this model only fp32 is supported for now, fp16 and bf16 are not supported + - '01-ai/Yi-6B-Chat' + - 'mistralai/Mistral-7B-Instruct-v0.2' + - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5' + - '01-ai/Yi-1.5-6B-Chat' local_model_hub: '/mnt/disk1/models' warm_up: 1 num_trials: 3 @@ -19,4 +31,6 @@ in_out_pairs: test_api: - "transformer_int4_fp16_gpu" # on Intel GPU cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) +exclude: + - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048' task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' diff --git a/python/llm/test/benchmark/concat_csv.py b/python/llm/test/benchmark/concat_csv.py index 908f71f1aed..442eab7cc9a 100644 --- a/python/llm/test/benchmark/concat_csv.py +++ b/python/llm/test/benchmark/concat_csv.py @@ -36,7 +36,7 @@ def main(): merged_df = pd.concat([pd.read_csv(file, index_col=0) for file in csv_files], ignore_index=True) merged_df.reset_index(drop=True, inplace=True) - merged_csv = csv_files[0].replace("_test1", "").replace("_test2", "").replace("_test3", "") + merged_csv = csv_files[0].replace("_test1", "").replace("_test2", "").replace("_test3", "").replace("_test4", "") merged_df.to_csv(merged_csv) if __name__ == "__main__": diff --git a/python/llm/test/benchmark/core-perf-test.yaml b/python/llm/test/benchmark/core-perf-test.yaml index 55f738de54b..2def68c1494 100644 --- a/python/llm/test/benchmark/core-perf-test.yaml +++ b/python/llm/test/benchmark/core-perf-test.yaml @@ -3,7 +3,7 @@ repo_id: - 'THUDM/chatglm3-6b' - 'baichuan-inc/Baichuan2-7B-Chat' - 'internlm/internlm-chat-7b' - - 'Qwen/Qwen-7B-Chat' + # - 'Qwen/Qwen-7B-Chat' # requires transformers < 4.37.0 - 'BAAI/AquilaChat2-7B' - 'meta-llama/Llama-2-7b-chat-hf' - 'WisdomShell/CodeShell-7B' diff --git a/python/llm/test/benchmark/cpu-perf-test.yaml b/python/llm/test/benchmark/cpu-perf-test.yaml index 92b12750dbb..2dc7cb002e9 100644 --- a/python/llm/test/benchmark/cpu-perf-test.yaml +++ b/python/llm/test/benchmark/cpu-perf-test.yaml @@ -5,7 +5,7 @@ repo_id: - 'THUDM/chatglm3-6b' - 'baichuan-inc/Baichuan2-7B-Chat' - 'baichuan-inc/Baichuan2-13B-Chat' - - 'Qwen/Qwen-14B-Chat' + # - 'Qwen/Qwen-14B-Chat' # requires transformers < 4.37.0 local_model_hub: '/mnt/disk1/models' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/1024-128.yaml b/python/llm/test/benchmark/igpu-perf/1024-128.yaml index b0bd5f30c20..759a7566237 100644 --- a/python/llm/test/benchmark/igpu-perf/1024-128.yaml +++ b/python/llm/test/benchmark/igpu-perf/1024-128.yaml @@ -10,9 +10,15 @@ repo_id: - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5' - 'RWKV/v5-Eagle-7B-HF' - '01-ai/Yi-6B-Chat' - - 'Qwen/Qwen-VL-Chat' - 'openbmb/MiniCPM-1B-sft-bf16' - 'openbmb/MiniCPM-2B-sft-bf16' + - 'Qwen/Qwen1.5-7B-Chat' + - 'Qwen/Qwen2-1.5B-Instruct' + - 'Qwen/Qwen2-7B-Instruct' + - 'microsoft/Phi-3-mini-4k-instruct' + - 'microsoft/Phi-3-mini-128k-instruct' + - 'microsoft/phi-3-vision-128k-instruct' + - 'openbmb/MiniCPM-V-2_6' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_437.yaml 
b/python/llm/test/benchmark/igpu-perf/1024-128_436.yaml similarity index 71% rename from python/llm/test/benchmark/igpu-perf/1024-128_437.yaml rename to python/llm/test/benchmark/igpu-perf/1024-128_436.yaml index f191801c7dc..c967f66a7ba 100644 --- a/python/llm/test/benchmark/igpu-perf/1024-128_437.yaml +++ b/python/llm/test/benchmark/igpu-perf/1024-128_436.yaml @@ -1,9 +1,5 @@ repo_id: - - 'Qwen/Qwen1.5-7B-Chat' - - 'Qwen/Qwen2-7B-Instruct' - - 'microsoft/Phi-3-mini-4k-instruct' - - 'microsoft/Phi-3-mini-128k-instruct' - - 'microsoft/phi-3-vision-128k-instruct' + - 'Qwen/Qwen-VL-Chat' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_443.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_443.yaml new file mode 100644 index 00000000000..4667ff34c3a --- /dev/null +++ b/python/llm/test/benchmark/igpu-perf/1024-128_443.yaml @@ -0,0 +1,14 @@ +repo_id: + - 'google/gemma-2-2b-it' + - 'google/gemma-2-9b-it' +local_model_hub: 'path to your local model hub' +warm_up: 1 +num_trials: 3 +num_beams: 1 # default to greedy search +low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) +batch_size: 1 # default to 1 +in_out_pairs: + - '1024-128' +test_api: + - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) +cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml index 39d575680ab..f66172d9a39 100644 --- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml +++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml @@ -9,9 +9,15 @@ repo_id: - 'mistralai/Mistral-7B-Instruct-v0.2' - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5' - '01-ai/Yi-6B-Chat' - - 'Qwen/Qwen-VL-Chat' - 'openbmb/MiniCPM-1B-sft-bf16' - 'openbmb/MiniCPM-2B-sft-bf16' + - 'Qwen/Qwen1.5-7B-Chat' + - 'Qwen/Qwen2-1.5B-Instruct' + - 'Qwen/Qwen2-7B-Instruct' + - 'microsoft/Phi-3-mini-4k-instruct' + - 'microsoft/Phi-3-mini-128k-instruct' + - 'microsoft/phi-3-vision-128k-instruct' + - 'openbmb/MiniCPM-V-2_6' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_437.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_436.yaml similarity index 71% rename from python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_437.yaml rename to python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_436.yaml index f9db9131ca3..c224b65e745 100644 --- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_437.yaml +++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_436.yaml @@ -1,9 +1,5 @@ repo_id: - - 'Qwen/Qwen1.5-7B-Chat' - - 'Qwen/Qwen2-7B-Instruct' - - 'microsoft/Phi-3-mini-4k-instruct' - - 'microsoft/Phi-3-mini-128k-instruct' - - 'microsoft/phi-3-vision-128k-instruct' + - 'Qwen/Qwen-VL-Chat' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_443.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_443.yaml new file mode 100644 index 00000000000..2f4bbd2270d --- /dev/null +++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_443.yaml @@ -0,0 +1,14 @@ +repo_id: + - 'google/gemma-2-2b-it' + - 'google/gemma-2-9b-it' +local_model_hub: 'path to your local model hub' +warm_up: 1 +num_trials: 3 +num_beams: 1 # default to greedy search 
+low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) +batch_size: 1 # default to 1 +in_out_pairs: + - '1024-128' +test_api: + - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer +cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml index 2730e465d47..76c35d4dde7 100644 --- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml +++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml @@ -9,9 +9,14 @@ repo_id: - 'mistralai/Mistral-7B-Instruct-v0.2' - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5' - '01-ai/Yi-6B-Chat' - - 'Qwen/Qwen-VL-Chat' - 'openbmb/MiniCPM-1B-sft-bf16' - 'openbmb/MiniCPM-2B-sft-bf16' + - 'Qwen/Qwen1.5-7B-Chat' + - 'Qwen/Qwen2-1.5B-Instruct' + - 'Qwen/Qwen2-7B-Instruct' + - 'microsoft/Phi-3-mini-4k-instruct' + - 'microsoft/Phi-3-mini-128k-instruct' + - 'microsoft/phi-3-vision-128k-instruct' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_437.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_436.yaml similarity index 71% rename from python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_437.yaml rename to python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_436.yaml index abd17aaa1e2..917e6d0ff3c 100644 --- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_437.yaml +++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_436.yaml @@ -1,9 +1,5 @@ repo_id: - - 'Qwen/Qwen1.5-7B-Chat' - - 'Qwen/Qwen2-7B-Instruct' - - 'microsoft/Phi-3-mini-4k-instruct' - - 'microsoft/Phi-3-mini-128k-instruct' - - 'microsoft/phi-3-vision-128k-instruct' + - 'Qwen/Qwen-VL-Chat' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_443.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_443.yaml new file mode 100644 index 00000000000..8d8e16c5c42 --- /dev/null +++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_443.yaml @@ -0,0 +1,14 @@ +repo_id: + - 'google/gemma-2-2b-it' + - 'google/gemma-2-9b-it' +local_model_hub: 'path to your local model hub' +warm_up: 1 +num_trials: 3 +num_beams: 1 # default to greedy search +low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) +batch_size: 1 # default to 1 +in_out_pairs: + - '1024-128' +test_api: + - "transformer_int4_fp16_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) +cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) diff --git a/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml index c53e6283919..bf5fc1e978b 100644 --- a/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml +++ b/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml @@ -9,9 +9,15 @@ repo_id: - 'mistralai/Mistral-7B-Instruct-v0.2' - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5' - '01-ai/Yi-6B-Chat' - - 'Qwen/Qwen-VL-Chat' - 'openbmb/MiniCPM-1B-sft-bf16' - 'openbmb/MiniCPM-2B-sft-bf16' + - 'Qwen/Qwen1.5-7B-Chat' + - 'Qwen/Qwen2-1.5B-Instruct' + - 'Qwen/Qwen2-7B-Instruct' + - 'microsoft/Phi-3-mini-4k-instruct' + - 'microsoft/Phi-3-mini-128k-instruct' + - 'microsoft/phi-3-vision-128k-instruct' + - 'openbmb/MiniCPM-V-2_6' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_437.yaml b/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_436.yaml similarity index 71% rename from python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_437.yaml rename to python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_436.yaml index fd4fbbfaec1..e9566c13250 100644 --- a/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_437.yaml +++ b/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_436.yaml @@ -1,9 +1,5 @@ repo_id: - - 'Qwen/Qwen1.5-7B-Chat' - - 'Qwen/Qwen2-7B-Instruct' - - 'microsoft/Phi-3-mini-4k-instruct' - - 'microsoft/Phi-3-mini-128k-instruct' - - 'microsoft/phi-3-vision-128k-instruct' + - 'Qwen/Qwen-VL-Chat' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_443.yaml b/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_443.yaml new file mode 100644 index 00000000000..3f8e554d19d --- /dev/null +++ b/python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_443.yaml @@ -0,0 +1,14 @@ +repo_id: + - 'google/gemma-2-2b-it' + - 'google/gemma-2-9b-it' +local_model_hub: 'path to your local model hub' +warm_up: 1 +num_trials: 3 +num_beams: 1 # default to greedy search +low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. 
symmetric int4) +batch_size: 1 # default to 1 +in_out_pairs: + - '2048-256' +test_api: + - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) +cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) diff --git a/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml index 47b9839a789..60202594cba 100644 --- a/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml +++ b/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml @@ -8,9 +8,15 @@ repo_id: - 'mistralai/Mistral-7B-Instruct-v0.2' - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5' - '01-ai/Yi-6B-Chat' - - 'Qwen/Qwen-VL-Chat' - 'openbmb/MiniCPM-1B-sft-bf16' - 'openbmb/MiniCPM-2B-sft-bf16' + - 'Qwen/Qwen1.5-7B-Chat' + - 'Qwen/Qwen2-1.5B-Instruct' + - 'Qwen/Qwen2-7B-Instruct' + - 'microsoft/Phi-3-mini-4k-instruct' + - 'microsoft/Phi-3-mini-128k-instruct' + - 'microsoft/phi-3-vision-128k-instruct' + - 'openbmb/MiniCPM-V-2_6' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_436.yaml b/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_436.yaml new file mode 100644 index 00000000000..6448a358cb5 --- /dev/null +++ b/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_436.yaml @@ -0,0 +1,13 @@ +repo_id: + - 'Qwen/Qwen-VL-Chat' +local_model_hub: 'path to your local model hub' +warm_up: 1 +num_trials: 3 +num_beams: 1 # default to greedy search +low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) +batch_size: 1 # default to 1 +in_out_pairs: + - '3072-384' +test_api: + - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer +cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) diff --git a/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_437.yaml b/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_443.yaml similarity index 71% rename from python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_437.yaml rename to python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_443.yaml index cfd7cc31afa..9e72a9e0a8e 100644 --- a/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_437.yaml +++ b/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16_443.yaml @@ -1,9 +1,6 @@ repo_id: - - 'Qwen/Qwen1.5-7B-Chat' - - 'Qwen/Qwen2-7B-Instruct' - - 'microsoft/Phi-3-mini-4k-instruct' - - 'microsoft/Phi-3-mini-128k-instruct' - - 'microsoft/phi-3-vision-128k-instruct' + - 'google/gemma-2-2b-it' + # - 'google/gemma-2-9b-it' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml index 39115e0231b..e70178744a3 100644 --- a/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml +++ b/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml @@ -9,9 +9,15 @@ repo_id: - 'mistralai/Mistral-7B-Instruct-v0.2' - 'deepseek-ai/deepseek-coder-7b-instruct-v1.5' - '01-ai/Yi-6B-Chat' - - 'Qwen/Qwen-VL-Chat' - 'openbmb/MiniCPM-1B-sft-bf16' - 'openbmb/MiniCPM-2B-sft-bf16' + - 'Qwen/Qwen1.5-7B-Chat' + - 'Qwen/Qwen2-1.5B-Instruct' + - 'Qwen/Qwen2-7B-Instruct' + - 'microsoft/Phi-3-mini-4k-instruct' + - 'microsoft/Phi-3-mini-128k-instruct' + - 'microsoft/phi-3-vision-128k-instruct' + - 'openbmb/MiniCPM-V-2_6' local_model_hub: 'path to your local model hub' warm_up: 3 num_trials: 5 diff 
--git a/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_437.yaml b/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_436.yaml similarity index 71% rename from python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_437.yaml rename to python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_436.yaml index 93fdc926e5f..8faf43aed97 100644 --- a/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_437.yaml +++ b/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_436.yaml @@ -1,9 +1,5 @@ repo_id: - - 'Qwen/Qwen1.5-7B-Chat' - - 'Qwen/Qwen2-7B-Instruct' - - 'microsoft/Phi-3-mini-4k-instruct' - - 'microsoft/Phi-3-mini-128k-instruct' - - 'microsoft/phi-3-vision-128k-instruct' + - 'Qwen/Qwen-VL-Chat' local_model_hub: 'path to your local model hub' warm_up: 3 num_trials: 5 diff --git a/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_443.yaml b/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_443.yaml new file mode 100644 index 00000000000..a02b19b1cf2 --- /dev/null +++ b/python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_443.yaml @@ -0,0 +1,14 @@ +repo_id: + - 'google/gemma-2-2b-it' + - 'google/gemma-2-9b-it' +local_model_hub: 'path to your local model hub' +warm_up: 3 +num_trials: 5 +num_beams: 1 # default to greedy search +low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) +batch_size: 1 # default to 1 +in_out_pairs: + - '32-32' +test_api: + - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows (catch GPU peak memory) +cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api) diff --git a/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16.yaml index 26e128a564c..514037a7380 100644 --- a/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16.yaml +++ b/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16.yaml @@ -10,6 +10,13 @@ repo_id: - '01-ai/Yi-6B-Chat' - 'openbmb/MiniCPM-1B-sft-bf16' - 'openbmb/MiniCPM-2B-sft-bf16' + - 'Qwen/Qwen1.5-7B-Chat' + - 'Qwen/Qwen2-1.5B-Instruct' + - 'Qwen/Qwen2-7B-Instruct' + - 'microsoft/Phi-3-mini-4k-instruct' + - 'microsoft/Phi-3-mini-128k-instruct' + - 'microsoft/phi-3-vision-128k-instruct' + - 'openbmb/MiniCPM-V-2_6' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16_437.yaml b/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16_443.yaml similarity index 71% rename from python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16_437.yaml rename to python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16_443.yaml index 7c2632d3d96..94f3ec43e8f 100644 --- a/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16_437.yaml +++ b/python/llm/test/benchmark/igpu-perf/4096-512_int4_fp16_443.yaml @@ -1,9 +1,6 @@ repo_id: - - 'Qwen/Qwen1.5-7B-Chat' - - 'Qwen/Qwen2-7B-Instruct' - - 'microsoft/Phi-3-mini-4k-instruct' - - 'microsoft/Phi-3-mini-128k-instruct' - - 'microsoft/phi-3-vision-128k-instruct' + - 'google/gemma-2-2b-it' + # - 'google/gemma-2-9b-it' local_model_hub: 'path to your local model hub' warm_up: 1 num_trials: 3 diff --git a/python/llm/test/inference_gpu/test_transformers_api.py b/python/llm/test/inference_gpu/test_transformers_api.py index ae9c6b9bc3e..b29c25997ae 100644 --- a/python/llm/test/inference_gpu/test_transformers_api.py +++ b/python/llm/test/inference_gpu/test_transformers_api.py @@ -36,7 +36,7 @@ (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MPT_7B_ORIGIN_PATH')), # (AutoModelForCausalLM, AutoTokenizer, 
os.environ.get('MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH')), # (AutoModelForCausalLM, AutoTokenizer, os.environ.get('BAICHUAN2_7B_ORIGIN_PATH')), - # (AutoModelForCausalLM, AutoTokenizer, os.environ.get('QWEN_7B_ORIGIN_PATH')), + # (AutoModelForCausalLM, AutoTokenizer, os.environ.get('QWEN_7B_ORIGIN_PATH')), # qwen requires transformers<4.37.0 ]) def test_completion(Model, Tokenizer, model_path, prompt, answer): with torch.inference_mode(): diff --git a/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py b/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py index f45f017ef0b..edb2adf1ec0 100644 --- a/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py +++ b/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py @@ -32,7 +32,7 @@ ("ChatGLM2-6B", AutoModel, AutoTokenizer, os.environ.get('CHATGLM2_6B_ORIGIN_PATH')), ("Mistral-7B-Instruct-v0.1", AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH')), ("Baichuan2-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('BAICHUAN2_7B_ORIGIN_PATH')), - ("Qwen-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('QWEN_7B_ORIGIN_PATH')), + # ("Qwen-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('QWEN_7B_ORIGIN_PATH')), # qwen requires transformers<4.37.0 ] class Test_Optimize_Gpu_Model: diff --git a/python/llm/test/inference_gpu/test_transformers_api_attention.py b/python/llm/test/inference_gpu/test_transformers_api_attention.py index 4db5ba8b531..c18a52bb201 100644 --- a/python/llm/test/inference_gpu/test_transformers_api_attention.py +++ b/python/llm/test/inference_gpu/test_transformers_api_attention.py @@ -34,7 +34,7 @@ ("ChatGLM2-6B", AutoModel, AutoTokenizer, os.environ.get('CHATGLM2_6B_ORIGIN_PATH')), ("Mistral-7B-Instruct-v0.1", AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH')), ("Baichuan2-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('BAICHUAN2_7B_ORIGIN_PATH')), - ("Qwen-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('QWEN_7B_ORIGIN_PATH')), + # ("Qwen-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('QWEN_7B_ORIGIN_PATH')), # qwen requires transformers<4.37.0 ] class Test_Optimize_Gpu_Model: @@ -151,7 +151,7 @@ def Llama2_7B_gpu_model(self, Name, Model, Tokenizer, model_path): # currently only compare the output of the last self-attention layer. layer_norm = "model.layers.31.input_layernorm" self_attn = "model.layers.31.self_attn" - lower_bound = 8e-3 + lower_bound = 2e-2 self.run_optimize_gpu_model(Name, Model, Tokenizer, model_path, self_attn, layer_norm, lower_bound) def Falcon_7B_gpu_model(self, Name, Model, Tokenizer, model_path): @@ -165,7 +165,7 @@ def Chatglm2_gpu_model(self, Name, Model, Tokenizer, model_path): # currently only need to compare the output of one self-attention layer. layer_norm = "transformer.encoder.layers.27.input_layernorm" self_attn = "transformer.encoder.layers.27.self_attention" - lower_bound = 4e-2 + lower_bound = 1e-1 self.run_optimize_gpu_model(Name, Model, Tokenizer, model_path, self_attn, layer_norm, lower_bound) def Mistral_gpu_model(self, Name, Model, Tokenizer, model_path): @@ -182,7 +182,7 @@ def Baichuan_gpu_model(self, Name, Model, Tokenizer, model_path): # currently only need to compare the output of one self-attention layer. 
layer_norm = "model.layers.31.input_layernorm" self_attn = "model.layers.31.self_attn" - lower_bound = 8e-3 + lower_bound = 2e-2 self.run_optimize_gpu_model(Name, Model, Tokenizer, model_path, self_attn, layer_norm, lower_bound) def Qwen_gpu_model(self, Name, Model, Tokenizer, model_path): diff --git a/python/llm/test/inference_gpu/test_transformers_api_mlp.py b/python/llm/test/inference_gpu/test_transformers_api_mlp.py index cf0581a50c0..d46d939a8ef 100644 --- a/python/llm/test/inference_gpu/test_transformers_api_mlp.py +++ b/python/llm/test/inference_gpu/test_transformers_api_mlp.py @@ -27,7 +27,7 @@ PROMPT = "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" TEST_MODEL_LIST = [ - ("Qwen-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('QWEN_7B_ORIGIN_PATH')), + # ("Qwen-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('QWEN_7B_ORIGIN_PATH')), # qwen requires transformers<4.37.0 ("Mistral-7B-Instruct-v0.1", AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH')), ("Llama2-7B", AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA2_7B_ORIGIN_PATH')) ] @@ -134,7 +134,7 @@ def Mistral_7B_Instruct_gpu_model(self, Name, Model, Tokenizer, model_path): # currently only compare the output of the last mlp layer. layer_before_MLP = "model.layers.31.post_attention_layernorm" MLP_layer = "model.layers.31.mlp" - lower_bound = 0 + lower_bound = 1e-3 self.run_optimize_gpu_model(Name, Model, Tokenizer, model_path, MLP_layer, layer_before_MLP, lower_bound) def Llama2_7B_gpu_model(self, Name, Model, Tokenizer, model_path): diff --git a/python/llm/version.txt b/python/llm/version.txt index eb5820cd2d6..6b959d99e8a 100644 --- a/python/llm/version.txt +++ b/python/llm/version.txt @@ -1 +1 @@ -2.1.0.dev0 +2.2.0.dev0