merge

intel-analytics · May 8, 2024 · 8638cea · 8638cea
2 parents 4b04c45 + 11df5f9
commit 8638cea
Show file tree

Hide file tree

Showing 76 changed files with 7,860 additions and 418 deletions.
diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
@@ -375,7 +375,7 @@ jobs:
         shell: bash
         run: |
           python -m pip uninstall datasets -y
-          python -m pip install transformers==4.34.0 datasets peft==0.5.0 accelerate==0.23.0
+          python -m pip install transformers==4.36.0 datasets peft==0.10.0 accelerate==0.23.0
           python -m pip install bitsandbytes scipy
           # Specific oneapi position on arc ut test machines
           if [[ '${{ matrix.pytorch-version }}' == '2.1' ]]; then
@@ -422,4 +422,4 @@ jobs:
         if: ${{ always() }}
         shell: bash
         run: |
-          pip uninstall sentence-transformers -y || true
+          pip uninstall sentence-transformers -y || true
diff --git a/.github/workflows/manually_build_for_testing.yml b/.github/workflows/manually_build_for_testing.yml
@@ -18,6 +18,7 @@ on:
         - ipex-llm-finetune-qlora-cpu
         - ipex-llm-finetune-qlora-xpu
         - ipex-llm-xpu
+        - ipex-llm-cpp-xpu
         - ipex-llm-cpu
         - ipex-llm-serving-xpu
         - ipex-llm-serving-cpu
@@ -150,6 +151,35 @@ jobs:
         sudo docker push 10.239.45.10/arda/${image}:${TAG}
         sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
 
+  ipex-llm-cpp-xpu:
+    if: ${{ github.event.inputs.artifact == 'ipex-llm-cpp-xpu' || github.event.inputs.artifact == 'all' }}
+    runs-on: [self-hosted, Shire]
+
+    steps:
+    - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
+      with:
+        ref: ${{ github.event.inputs.sha }}
+    - name: docker login
+      run: |
+        docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
+    - name: ipex-llm-cpp-xpu
+      run: |
+        echo "##############################################################"
+        echo "####### ipex-llm-cpp-xpu ########"
+        echo "##############################################################"
+        export image=intelanalytics/ipex-llm-cpp-xpu
+        cd docker/llm/cpp/
+        sudo docker build \
+          --no-cache=true \
+          --build-arg http_proxy=${HTTP_PROXY} \
+          --build-arg https_proxy=${HTTPS_PROXY} \
+          --build-arg no_proxy=${NO_PROXY} \
+          -t ${image}:${TAG} -f ./Dockerfile .
+        sudo docker push ${image}:${TAG}
+        sudo docker tag ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
+        sudo docker push 10.239.45.10/arda/${image}:${TAG}
+        sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
+
   ipex-llm-cpu:
     if: ${{ github.event.inputs.artifact == 'ipex-llm-cpu' || github.event.inputs.artifact == 'all' }}
     runs-on: [self-hosted, Shire]

diff --git a/README.md b/README.md
@@ -183,6 +183,8 @@ Over 50 models have been optimized/verified on `ipex-llm`, including *LLaMA/LLaM
 | DeciLM-7B | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/deciLM-7b) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/deciLM-7b) |
 | Deepseek | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/deepseek) |
 | StableLM | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/stablelm) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/stablelm) |
+| CodeGemma | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/codegemma) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/codegemma) |
+| Command-R/cohere | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/cohere) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/cohere) |
 
 ## Get Support
 - Please report a bug or raise a feature request by opening a [Github Issue](https://github.com/intel-analytics/ipex-llm/issues)

diff --git a/docker/llm/README.md b/docker/llm/README.md
@@ -28,7 +28,7 @@ This guide provides step-by-step instructions for installing and using IPEX-LLM
 
 ### 1. Prepare ipex-llm-cpu Docker Image
 
-Run the following command to pull image from dockerhub:
+Run the following command to pull the image:
 ```bash
 docker pull intelanalytics/ipex-llm-cpu:2.1.0-SNAPSHOT
 ```

diff --git a/docker/llm/finetune/qlora/cpu/docker/Dockerfile b/docker/llm/finetune/qlora/cpu/docker/Dockerfile
@@ -20,7 +20,8 @@ RUN echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] h
 
 RUN mkdir -p /ipex_llm/data && mkdir -p /ipex_llm/model && \
     # Install python 3.11.1
-    apt-get update && apt-get install -y curl wget gpg gpg-agent software-properties-common git gcc g++ make libunwind8-dev zlib1g-dev libssl-dev libffi-dev && \
+    apt-get update && \
+    apt-get install -y curl wget gpg gpg-agent software-properties-common git gcc g++ make libunwind8-dev libbz2-dev zlib1g-dev libssl-dev libffi-dev && \
     mkdir -p /opt/python && \
     cd /opt/python && \
     wget https://www.python.org/ftp/python/3.11.1/Python-3.11.1.tar.xz && \
@@ -39,15 +40,16 @@ RUN mkdir -p /ipex_llm/data && mkdir -p /ipex_llm/model && \
     rm -rf /var/lib/apt/lists/* && \
     pip install --upgrade pip && \
     export PIP_DEFAULT_TIMEOUT=100 && \
-    pip install --upgrade torch==2.1.0 && \
+    # install torch CPU version
+    pip install --upgrade torch==2.1.0 --index-url https://download.pytorch.org/whl/cpu && \
     # install CPU ipex-llm
     pip install --pre --upgrade ipex-llm[all] && \
     # install ipex and oneccl
     pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/cpu/intel_extension_for_pytorch-2.1.0%2Bcpu-cp311-cp311-linux_x86_64.whl && \
     pip install oneccl_bind_pt -f https://developer.intel.com/ipex-whl-stable && \
     # install huggingface dependencies
-    pip install datasets transformers==4.35.0 && \
-    pip install fire peft==0.5.0 && \
+    pip install datasets transformers==4.36.0 && \
+    pip install fire peft==0.10.0 && \
     pip install accelerate==0.23.0 && \
     pip install bitsandbytes && \
     # get qlora example code

diff --git a/docker/llm/finetune/qlora/cpu/docker/Dockerfile.k8s b/docker/llm/finetune/qlora/cpu/docker/Dockerfile.k8s
@@ -61,8 +61,8 @@ RUN mkdir -p /ipex_llm/data && mkdir -p /ipex_llm/model && \
     pip install intel_extension_for_pytorch==2.0.100 && \
     pip install oneccl_bind_pt -f https://developer.intel.com/ipex-whl-stable && \
     # install huggingface dependencies
-    pip install datasets transformers==4.35.0 && \
-    pip install fire peft==0.5.0 && \
+    pip install datasets transformers==4.36.0 && \
+    pip install fire peft==0.10.0 && \
     pip install accelerate==0.23.0 && \
     # install basic dependencies
     apt-get update && apt-get install -y curl wget gpg gpg-agent && \

diff --git a/docker/llm/finetune/qlora/xpu/docker/Dockerfile b/docker/llm/finetune/qlora/xpu/docker/Dockerfile
@@ -3,7 +3,7 @@ ARG http_proxy
 ARG https_proxy
 ENV TZ=Asia/Shanghai
 ARG PIP_NO_CACHE_DIR=false
-ENV TRANSFORMERS_COMMIT_ID=95fe0f5
+ENV TRANSFORMERS_COMMIT_ID=1466677
 
 # retrive oneapi repo public key
 RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
@@ -33,7 +33,7 @@ RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-P
     pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ && \
     # install huggingface dependencies
     pip install git+https://github.com/huggingface/transformers.git@${TRANSFORMERS_COMMIT_ID} && \
-    pip install peft==0.5.0 datasets accelerate==0.23.0 && \
+    pip install peft==0.6.0 datasets accelerate==0.23.0 && \
     pip install bitsandbytes scipy && \
     git clone https://github.com/intel-analytics/IPEX-LLM.git && \
     mv IPEX-LLM/python/llm/example/GPU/LLM-Finetuning/common /common && \

diff --git a/docs/readthedocs/source/_templates/sidebar_quicklinks.html b/docs/readthedocs/source/_templates/sidebar_quicklinks.html
@@ -37,6 +37,9 @@
                     <li>
                         <a href="doc/LLM/Quickstart/continue_quickstart.html">Run Coding Copilot (Continue) in VSCode with Intel GPU</a>
                     </li>
+                    <li>
+                        <a href="doc/LLM/Quickstart/dify_quickstart.html">Run Dify on Intel GPU</a>
+                    </li>
                     <li>
                         <a href="doc/LLM/Quickstart/open_webui_with_ollama_quickstart.html">Run Open WebUI with IPEX-LLM on Intel GPU</a>
                     </li>
@@ -58,6 +61,13 @@
                     <li>
                         <a href="doc/LLM/Quickstart/axolotl_quickstart.html">Finetune LLM with Axolotl on Intel GPU</a>
                     </li>
+                    <li>
+                        <a href="doc/LLM/Quickstart/privateGPT_quickstart.html">Run PrivateGPT with IPEX-LLM on Intel GPU</a>
+                     </li>
+                     <li>
+                        <a href="doc/LLM/Quickstart/deepspeed_autotp_fastapi_quickstart.html">Run IPEX-LLM serving on Multiple Intel GPUs
+                            using DeepSpeed AutoTP and FastApi</a>
+                    </li>
                 </ul>
             </li>
             <li>

diff --git a/docs/readthedocs/source/_toc.yml b/docs/readthedocs/source/_toc.yml
@@ -26,13 +26,16 @@ subtrees:
                 - file: doc/LLM/Quickstart/chatchat_quickstart
                 - file: doc/LLM/Quickstart/webui_quickstart
                 - file: doc/LLM/Quickstart/open_webui_with_ollama_quickstart
+                - file: doc/LLM/Quickstart/privateGPT_quickstart
                 - file: doc/LLM/Quickstart/continue_quickstart
+                - file: doc/LLM/Quickstart/dify_quickstart
                 - file: doc/LLM/Quickstart/benchmark_quickstart
                 - file: doc/LLM/Quickstart/llama_cpp_quickstart
                 - file: doc/LLM/Quickstart/ollama_quickstart
                 - file: doc/LLM/Quickstart/llama3_llamacpp_ollama_quickstart
                 - file: doc/LLM/Quickstart/fastchat_quickstart
                 - file: doc/LLM/Quickstart/axolotl_quickstart
+                - file: doc/LLM/Quickstart/deepspeed_autotp_fastapi_quickstart
           - file: doc/LLM/Overview/KeyFeatures/index
             title: "Key Features"
             subtrees:

diff --git a/docs/readthedocs/source/doc/LLM/Quickstart/axolotl_quickstart.md b/docs/readthedocs/source/doc/LLM/Quickstart/axolotl_quickstart.md
@@ -33,7 +33,7 @@ git clone https://github.com/OpenAccess-AI-Collective/axolotl/tree/v0.4.0
 cd axolotl
 # replace requirements.txt
 remove requirements.txt
-wget -O requirements.txt https://github.com/intel-analytics/ipex-llm/blob/main/python/llm/example/GPU/LLM-Finetuning/axolotl/requirements-xpu.txt
+wget -O requirements.txt https://raw.githubusercontent.com/intel-analytics/ipex-llm/main/python/llm/example/GPU/LLM-Finetuning/axolotl/requirements-xpu.txt
 pip install -e .
 pip install transformers==4.36.0
 # to avoid https://github.com/OpenAccess-AI-Collective/axolotl/issues/1544
@@ -92,7 +92,14 @@ Configure oneAPI variables by running the following command:
 
 ```
 
-Configure accelerate to avoid training with CPU
+Configure accelerate to avoid training on the CPU. You can download a `default_config.yaml` that sets `use_cpu: false`.
+
+```cmd
+mkdir -p  ~/.cache/huggingface/accelerate/
+wget -O ~/.cache/huggingface/accelerate/default_config.yaml https://raw.githubusercontent.com/intel-analytics/ipex-llm/main/python/llm/example/GPU/LLM-Finetuning/axolotl/default_config.yaml
+```
+
+Alternatively, you can configure accelerate interactively based on your requirements.
 
 ```cmd
 accelerate config

diff --git a/docs/readthedocs/source/doc/LLM/Quickstart/chatchat_quickstart.md b/docs/readthedocs/source/doc/LLM/Quickstart/chatchat_quickstart.md
@@ -39,17 +39,30 @@ Follow the guide that corresponds to your specific system and device from the li
 #### Step 1: Create Knowledge Base
 
 - Select `Manage Knowledge Base` from the menu on the left, then choose `New Knowledge Base` from the dropdown menu on the right side.
-  <p align="center"><img src="https://llm-assets.readthedocs.io/en/latest/_images/new-kb.png" alt="image1" width="70%" align="center"></p>
+
+  <a href="https://llm-assets.readthedocs.io/en/latest/_images/new-kb.png" target="_blank">
+    <img src="https://llm-assets.readthedocs.io/en/latest/_images/new-kb.png" alt="new-kb" width="100%" align="center">
+  </a>
+
 - Fill in the name of your new knowledge base (example: "test") and press the `Create` button. Adjust any other settings as needed.
-  <p align="center"><img src="https://llm-assets.readthedocs.io/en/latest/_images/create-kb.png" alt="image1" width="70%" align="center"></p>
+
+  <a href="https://llm-assets.readthedocs.io/en/latest/_images/create-kb.png" target="_blank">
+    <img src="https://llm-assets.readthedocs.io/en/latest/_images/create-kb.png" alt="create-kb" width="100%" align="center">
+  </a>
+
 - Upload knowledge files from your computer and allow some time for the upload to complete. Once finished, click on `Add files to Knowledge Base` button to build the vector store. Note: this process may take several minutes.
-  <p align="center"><img src="https://llm-assets.readthedocs.io/en/latest/_images/build-kb.png" alt="image1" width="70%" align="center"></p>
+
+  <a href="https://llm-assets.readthedocs.io/en/latest/_images/build-kb.png" target="_blank">
+    <img src="https://llm-assets.readthedocs.io/en/latest/_images/build-kb.png" alt="build-kb" width="100%" align="center">
+  </a>
 
 #### Step 2: Chat with RAG
 
 You can now click `Dialogue` on the left-side menu to return to the chat UI. Then in `Knowledge base settings` menu, choose the Knowledge Base you just created, e.g, "test". Now you can start chatting.
 
-<p align="center"><img src="https://llm-assets.readthedocs.io/en/latest/_images/rag-menu.png" alt="rag-menu" width="60%" align="center"></p>
+<a href="https://llm-assets.readthedocs.io/en/latest/_images/rag-menu.png" target="_blank">
+  <img src="https://llm-assets.readthedocs.io/en/latest/_images/rag-menu.png" alt="rag-menu" width="100%" align="center">
+</a>
 
 <br/>
 

diff --git a/docs/readthedocs/source/doc/LLM/Quickstart/deepspeed_autotp_fastapi_quickstart.md b/docs/readthedocs/source/doc/LLM/Quickstart/deepspeed_autotp_fastapi_quickstart.md
@@ -0,0 +1,102 @@
+# Run IPEX-LLM serving on Multiple Intel GPUs using DeepSpeed AutoTP and FastAPI
+
+This example demonstrates how to run IPEX-LLM serving on multiple [Intel GPUs](../README.md) by leveraging DeepSpeed AutoTP.
+
+## Requirements
+
+To run this example with IPEX-LLM on Intel GPUs, your machine should meet some recommended requirements; please refer to [here](../README.md#recommended-requirements) for more information. For this particular example, you will need at least two GPUs on your machine.
+
+## Example
+
+### 1. Install
+
+```bash
+conda create -n llm python=3.11
+conda activate llm
+# the command below installs intel_extension_for_pytorch==2.1.10+xpu by default
+pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+pip install oneccl_bind_pt==2.1.100 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+# configures OneAPI environment variables
+source /opt/intel/oneapi/setvars.sh
+pip install git+https://github.com/microsoft/DeepSpeed.git@ed8aed5
+pip install git+https://github.com/intel/intel-extension-for-deepspeed.git@0eb734b
+pip install mpi4py fastapi uvicorn
+conda install -c conda-forge -y gperftools=2.10 # to enable tcmalloc
+```
+
+> **Important**: IPEX 2.1.10+xpu requires version 2024.0 of the Intel® oneAPI Base Toolkit. Please make sure you have installed the correct version.
+
+### 2. Run tensor parallel inference on multiple GPUs
+
+When we run the model in a distributed manner across two GPUs, the memory consumption of each GPU is only half of what it was originally, and the GPUs can work simultaneously during inference computation.
+
+We provide example usage for the `Llama-2-7b-chat-hf` model running on Arc A770.
+
+Run Llama-2-7b-chat-hf on two Intel Arc A770:
+
+```bash
+
+# Before running this script, adjust YOUR_REPO_ID_OR_MODEL_PATH in its last line
+# To change the server port, set the port parameter in its last line
+
+# To avoid GPU OOM, you can adjust the --max-num-seqs and --max-num-batched-tokens parameters in the script below
+bash run_llama2_7b_chat_hf_arc_2_card.sh
+```
+
+If you successfully run the serving, you can get output like this:
+
+```bash
+[0] INFO:     Started server process [120071]
+[0] INFO:     Waiting for application startup.
+[0] INFO:     Application startup complete.
+[0] INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
+```
+
+> **Note**: You can change `NUM_GPUS` to the number of GPUs you have on your machine. You can also specify other low-bit optimizations through `--low-bit`.
+
+### 3. Sample Input and Output
+
+We can use `curl` to test the serving API:
+
+```bash
+# Set http_proxy and https_proxy to null to ensure that requests are not forwarded by a proxy.
+export http_proxy=
+export https_proxy=
+
+curl -X 'POST' \
+  'http://127.0.0.1:8000/generate/' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "prompt": "What is AI?",
+  "n_predict": 32
+}'
+```
+
+And you should get output like this:
+
+```json
+{
+  "generated_text": "What is AI? Artificial intelligence (AI) refers to the development of computer systems able to perform tasks that would normally require human intelligence, such as visual perception, speech",
+  "generate_time": "0.45149803161621094s"
+}
+
+```
+
+**Important**: The first-token latency is much larger than the latency of the remaining tokens. You can use [our benchmark tool](https://github.com/intel-analytics/ipex-llm/blob/main/python/llm/dev/benchmark/README.md) to obtain more details about first-token and rest-token latency.
+
+### 4. Benchmark with wrk
+
+We use [wrk](https://github.com/wg/wrk) to test end-to-end throughput.
+
+You can install it with:
+```bash
+sudo apt install wrk
+```
+
+Please change the test url accordingly.
+
+```bash
+# set -t (threads) and -c (connections) to the concurrency level needed to test full throughput.
+wrk -t1 -c1 -d5m -s ./wrk_script_1024.lua http://127.0.0.1:8000/generate/ --timeout 1m
+```