* Enable single card sync engine
* Enable ipex-llm optimizations for vLLM
* Enable optimizations for lm_head
* Fix ChatGLM multi-reference problem
* Update 0.5.4 api_server
* Add Dockerfile
* Assorted fixes and refinements

Co-authored-by: gc-fu <[email protected]>
Showing 15 changed files with 2,051 additions and 103 deletions.
The new Dockerfile for the `ipex-llm-serving-xpu` image:

```dockerfile
FROM intel/oneapi-basekit:2024.1.1-devel-ubuntu22.04

ARG http_proxy
ARG https_proxy

# Disable pip's cache behavior
ARG PIP_NO_CACHE_DIR=false
ADD ./gradio_web_server.patch /tmp/gradio_web_server.patch
ADD ./oneccl-binding.patch /tmp/oneccl-binding.patch

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
    rm /etc/apt/sources.list.d/intel-graphics.list && \
    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
    chmod 644 /usr/share/keyrings/intel-graphics.gpg && \
    apt-get update && \
    apt-get install -y --no-install-recommends curl wget git libunwind8-dev vim less && \
    # Install Python 3.11 and ipex-llm[xpu]
    ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \
    env DEBIAN_FRONTEND=noninteractive apt-get update && \
    # add-apt-repository requires gnupg, gpg-agent, software-properties-common
    apt-get install -y --no-install-recommends gnupg gpg-agent software-properties-common && \
    # Add Python 3.11 PPA repository
    add-apt-repository ppa:deadsnakes/ppa -y && \
    apt-get install -y --no-install-recommends python3.11 git curl wget && \
    rm /usr/bin/python3 && \
    ln -s /usr/bin/python3.11 /usr/bin/python3 && \
    ln -s /usr/bin/python3 /usr/bin/python && \
    apt-get install -y --no-install-recommends python3-pip python3.11-dev python3-wheel python3.11-distutils && \
    wget https://bootstrap.pypa.io/get-pip.py -O get-pip.py && \
    # Installing FastChat from source requires PEP 660 support
    python3 get-pip.py && \
    rm get-pip.py && \
    pip install --upgrade requests argparse urllib3 && \
    pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ && \
    # Fix Trivy CVE issues
    pip install transformers==4.36.2 && \
    pip install transformers_stream_generator einops tiktoken && \
    # # Install opencl-related repos
    # apt-get update && \
    # apt-get install -y --no-install-recommends intel-opencl-icd=23.35.27191.42-775~22.04 intel-level-zero-gpu=1.3.27191.42-775~22.04 level-zero=1.14.0-744~22.04 && \
    # Install the library needed by chat.py
    pip install --upgrade colorama && \
    # Download the all-in-one benchmark and examples
    git clone https://github.com/intel-analytics/ipex-llm && \
    cp -r ./ipex-llm/python/llm/dev/benchmark/ ./benchmark && \
    cp -r ./ipex-llm/python/llm/example/GPU/HuggingFace/LLM ./examples && \
    # Install vllm dependencies
    pip install --upgrade fastapi && \
    pip install --upgrade "uvicorn[standard]" && \
    # Download vLLM-Serving
    cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving

# Install serving dependencies
# Installing ipex-llm[serving] alone would only update the ipex_llm source code without updating
# bigdl-core-xe, which would lead to problems
RUN apt-get update && \
    apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev && \
    mkdir -p /llm/neo && \
    cd /llm/neo && \
    wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15136.4/intel-igc-core_1.0.15136.4_amd64.deb && \
    wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15136.4/intel-igc-opencl_1.0.15136.4_amd64.deb && \
    wget https://github.com/intel/compute-runtime/releases/download/23.35.27191.9/intel-level-zero-gpu-dbgsym_1.3.27191.9_amd64.ddeb && \
    wget https://github.com/intel/compute-runtime/releases/download/23.35.27191.9/intel-level-zero-gpu_1.3.27191.9_amd64.deb && \
    wget https://github.com/intel/compute-runtime/releases/download/23.35.27191.9/intel-opencl-icd-dbgsym_23.35.27191.9_amd64.ddeb && \
    wget https://github.com/intel/compute-runtime/releases/download/23.35.27191.9/intel-opencl-icd_23.35.27191.9_amd64.deb && \
    wget https://github.com/intel/compute-runtime/releases/download/23.35.27191.9/libigdgmm12_22.3.11.ci17747749_amd64.deb && \
    dpkg -i *.deb && \
    pip install --pre --upgrade ipex-llm[xpu,serving] && \
    pip install transformers==4.37.0 gradio==4.19.2 && \
    # Use ipex-vllm-mainline
    git clone -b vllm_202411_0807 https://github.com/xiangyuT/ipex-llm.git /llm/ipex-llm && \
    cp /llm/ipex-llm/python/llm/src/ipex_llm/transformers/convert.py /usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/convert.py && \
    cp /llm/ipex-llm/python/llm/src/ipex_llm/transformers/low_bit_linear.py /usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/low_bit_linear.py && \
    rm -rf /usr/local/lib/python3.11/dist-packages/ipex_llm/vllm && \
    cp -r /llm/ipex-llm/python/llm/src/ipex_llm/vllm /usr/local/lib/python3.11/dist-packages/ipex_llm/ && \
    # Install IPEX 2.1.30
    python -m pip install torch==2.1.0.post2 torchvision==0.16.0.post2 torchaudio==2.1.0.post2 intel-extension-for-pytorch==2.1.30.post0 oneccl_bind_pt==2.1.300+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ && \
    python -m pip install setuptools==69.5.1 numpy==1.26.4 && \
    # Install vLLM-v2 dependencies
    git clone -b xiangyu_test_202411_0806 https://github.com/analytics-zoo/vllm.git /llm/vllm && \
    pip install -r /llm/vllm/requirements-common.txt && \
    pip install -r /llm/vllm/requirements-xpu.txt && \
    pip install --no-deps xformers && \
    cd /llm/vllm && \
    VLLM_TARGET_DEVICE=xpu python setup.py install && \
    pip install outlines==0.0.34 --no-deps && \
    pip install interegular cloudpickle diskcache joblib lark nest-asyncio numba scipy && \
    # For Qwen-series model support
    pip install transformers_stream_generator einops tiktoken && \
    # For pipeline serving support
    pip install mpi4py fastapi uvicorn openai && \
    # For the gradio web UI
    pip install gradio && \
    # Install the internal oneccl
    cd /tmp/ && \
    pip install --upgrade setuptools wheel twine && \
    pip install "setuptools<70.0.0" && \
    git clone https://github.com/intel/torch-ccl -b v2.1.300+xpu && \
    cd torch-ccl && \
    patch -p1 < /tmp/oneccl-binding.patch && \
    git submodule sync && \
    git submodule update --init --recursive && \
    USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py install sdist bdist_wheel && \
    mv /tmp/torch-ccl/dist/oneccl_bind_pt-2.1.300+xpu-cp311-cp311-linux_x86_64.whl /tmp/ && \
    cd /tmp/ && \
    wget https://sourceforge.net/projects/oneccl-wks/files/oneccl_wks_installer_2024.0.0.2.sh && \
    bash oneccl_wks_installer_2024.0.0.2.sh && \
    pip uninstall -y oneccl_bind_pt && \
    pip install /tmp/oneccl_bind_pt-2.1.300+xpu-cp311-cp311-linux_x86_64.whl && \
    rm /tmp/oneccl_bind_pt-2.1.300+xpu-cp311-cp311-linux_x86_64.whl && \
    patch /usr/local/lib/python3.11/dist-packages/fastchat/serve/gradio_web_server.py < /tmp/gradio_web_server.patch && \
    pip install -r /llm/vllm/requirements-common.txt && \
    pip install ray

COPY ./vllm_online_benchmark.py /llm/
COPY ./vllm_offline_inference.py /llm/
COPY ./payload-1024.lua /llm/
COPY ./start-vllm-service.sh /llm/
COPY ./benchmark_vllm_throughput.py /llm/
COPY ./start-fastchat-service.sh /llm/
COPY ./start-pp_serving-service.sh /llm/
COPY ./start-lightweight_serving-service.sh /llm/

WORKDIR /llm/
```
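The `ADD` and `COPY` instructions above assume that the patches and helper scripts sit next to the Dockerfile in the build context. A quick sanity check before building (file names taken from the Dockerfile itself):

```bash
# All of these must exist in the build context, or the build will fail at the ADD/COPY steps
ls gradio_web_server.patch oneccl-binding.patch \
   vllm_online_benchmark.py vllm_offline_inference.py payload-1024.lua \
   start-vllm-service.sh benchmark_vllm_throughput.py \
   start-fastchat-service.sh start-pp_serving-service.sh \
   start-lightweight_serving-service.sh
```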
## Build/Use IPEX-LLM-serving xpu image

### Build Image
```bash
docker build \
  --build-arg http_proxy=.. \
  --build-arg https_proxy=.. \
  --build-arg no_proxy=.. \
  --rm --no-cache -t intelanalytics/ipex-llm-serving-xpu:2024.1.1 .
```
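The proxy build args are optional placeholders for your own proxy settings. Once the build finishes, a quick check confirms the image and tag exist:

```bash
# List the freshly built image; the tag should match the -t argument above
docker images intelanalytics/ipex-llm-serving-xpu
```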

### Use the image for XPU serving

To map the `xpu` into the container, you need to specify `--device=/dev/dri` when booting the container.

An example could be:
```bash
#!/bin/bash
export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:2.1.0-SNAPSHOT

sudo docker run -itd \
        --net=host \
        --device=/dev/dri \
        --name=CONTAINER_NAME \
        --shm-size="16g" \
        $DOCKER_IMAGE
```

After the container is booted, you can get into the container through `docker exec`.
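For example, using the container name chosen above (`CONTAINER_NAME` is just a placeholder):

```bash
# Open an interactive shell inside the running container
sudo docker exec -it CONTAINER_NAME bash
cd /llm
```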
To verify that the device is successfully mapped into the container, run `sycl-ls` to check the result. On a machine with an Arc A770, sample output looks like:

```bash
root@arda-arc12:/# sycl-ls
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device 1.2 [2023.16.7.0.21_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i9-13900K 3.0 [2023.16.7.0.21_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics 3.0 [23.17.26241.33]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26241]
```

Currently, the image provides four ways to serve models with `IPEX-LLM` as the backend: a lightweight serving engine, a pipeline-parallel serving engine, the FastChat serving engine, and the vLLM serving engine.

#### Lightweight serving engine

To run lightweight serving on a single Intel GPU using `IPEX-LLM` as the backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Lightweight-Serving).

For convenience, we have included a script `/llm/start-lightweight_serving-service.sh` in the image.
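A minimal way to launch it from inside the container (the model and port are configured inside the script itself, so you will likely want to edit it first):

```bash
# Inside the container
cd /llm
bash start-lightweight_serving-service.sh
```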

#### Pipeline parallel serving engine

To run pipeline-parallel serving using `IPEX-LLM` as the backend, you can refer to this [readme](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Pipeline-Parallel-FastAPI).

For convenience, we have included a script `/llm/start-pp_serving-service.sh` in the image.

#### FastChat serving engine

To run model serving with FastChat using `IPEX-LLM` as the backend, you can refer to this [quickstart](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/fastchat_quickstart.html).

For convenience, we have included a script `/llm/start-fastchat-service.sh` in the image.

You can modify this script to use FastChat with either the `ipex_llm_worker` or the `vllm_worker`.
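As a hedged sketch of what the script automates, starting FastChat by hand generally involves three processes. The module names below follow the FastChat/IPEX-LLM quickstart, and the model path and low-bit format are placeholders to substitute for your setup:

```bash
# 1. Start the FastChat controller
python3 -m fastchat.serve.controller &

# 2. Start an IPEX-LLM worker on the Intel GPU (model path and low-bit format are placeholders)
python3 -m ipex_llm.serving.fastchat.ipex_llm_worker \
    --model-path /llm/models/YOUR_MODEL \
    --low-bit sym_int4 \
    --trust-remote-code \
    --device xpu &

# 3. Expose an OpenAI-compatible API server on port 8000
python3 -m fastchat.serve.openai_api_server --host 0.0.0.0 --port 8000
```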

#### vLLM serving engine

To run the vLLM engine using `IPEX-LLM` as the backend, you can refer to this [document](https://github.com/intel-analytics/ipex-llm/blob/main/python/llm/example/GPU/vLLM-Serving/README.md).

We have included multiple example files in `/llm/`:
1. `vllm_offline_inference.py`: used for the vLLM offline inference example
2. `benchmark_vllm_throughput.py`: used for benchmarking throughput
3. `payload-1024.lua`: used for testing requests per second using 1k-128 requests
4. `start-vllm-service.sh`: used as a template for starting the vLLM service (a hedged sketch of a typical startup command follows below)
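The exact contents of `start-vllm-service.sh` are defined in the image; as a rough sketch, launching the IPEX-LLM vLLM OpenAI-compatible server usually looks something like the following. The model path and served model name are placeholders, and the entrypoint and flag names may differ slightly between versions, so treat the script in the image as the source of truth:

```bash
#!/bin/bash
# Placeholder model path and served model name; adjust to your setup
MODEL_PATH="/llm/models/YOUR_MODEL"
SERVED_NAME="YOUR_MODEL"

python3 -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
    --model "$MODEL_PATH" \
    --served-model-name "$SERVED_NAME" \
    --port 8000 \
    --device xpu \
    --dtype float16 \
    --load-in-low-bit sym_int4 \
    --gpu-memory-utilization 0.85
```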

##### Online benchmark through api_server

We can benchmark the api_server to get an estimate of TPS (transactions per second). To do so, you need to start the service first according to the instructions in this [section](https://github.com/intel-analytics/ipex-llm/blob/main/python/llm/example/GPU/vLLM-Serving/README.md#service).

###### Online benchmark through benchmark_util

After starting the vLLM service, send requests through `vllm_online_benchmark.py`:
```bash
python vllm_online_benchmark.py $model_name $max_seqs
```

It will output something like this:
```bash
model_name: Qwen1.5-14B-Chat
max_seq: 12
Warm Up: 100%|█████████████████████████████████████████████████████| 24/24 [01:36<00:00, 4.03s/req]
Benchmarking: 100%|████████████████████████████████████████████████| 60/60 [04:03<00:00, 4.05s/req]
Total time for 60 requests with 12 concurrent requests: xxx seconds.
Average response time: xxx
Token throughput: xxx

Average first token latency: xxx milliseconds.
P90 first token latency: xxx milliseconds.
P95 first token latency: xxx milliseconds.

Average next token latency: xxx milliseconds.
P90 next token latency: xxx milliseconds.
P95 next token latency: xxx milliseconds.
```

###### Online benchmark through wrk
In the container, do the following:
1. Modify `/llm/payload-1024.lua` so that the "model" attribute is correct (see the `sed` sketch after the `wrk` command below). By default, we use a prompt that is roughly 1024 tokens long; you can change it if needed.
2. Start the benchmark with `wrk` using the script below:

```bash
cd /llm
# You can change -t and -c to control the concurrency.
# By default, we use 12 connections to benchmark the service.
wrk -t12 -c12 -d15m -s payload-1024.lua http://localhost:8000/v1/completions --timeout 1h
```
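For step 1, the exact structure of `payload-1024.lua` is defined in the image; assuming its request body embeds a JSON `"model"` field, a one-liner along these lines could point it at your served model name (the name below is a placeholder):

```bash
# Replace whatever model name is currently in the payload with your served model name
sed -i 's/"model": *"[^"]*"/"model": "YOUR_SERVED_MODEL_NAME"/' /llm/payload-1024.lua
```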

#### Offline benchmark through benchmark_vllm_throughput.py

We have included the benchmark_throughput script provided by `vllm` in our image as `/llm/benchmark_vllm_throughput.py`. To use it, you will need to download the test dataset:

```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```

The full example looks like this:
```bash
cd /llm/

wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

export MODEL="YOUR_MODEL"

# You can change --load-in-low-bit to any of [sym_int4, fp8, fp16]
python3 /llm/benchmark_vllm_throughput.py \
    --backend vllm \
    --dataset /llm/ShareGPT_V3_unfiltered_cleaned_split.json \
    --model $MODEL \
    --num-prompts 1000 \
    --seed 42 \
    --trust-remote-code \
    --enforce-eager \
    --dtype float16 \
    --device xpu \
    --load-in-low-bit sym_int4 \
    --gpu-memory-utilization 0.85
```

> Note: you can adjust `--load-in-low-bit` to use other low-bit quantization formats.

You can also sweep the `--gpu-memory-utilization` rate with the following script to find the best-performing setting:

```bash
#!/bin/bash

# Define the log directory
LOG_DIR="YOUR_LOG_DIR"
# Check if the log directory exists; if not, create it
if [ ! -d "$LOG_DIR" ]; then
    mkdir -p "$LOG_DIR"
fi

# Define an array of model paths
MODELS=(
    "YOUR TESTED MODELS"
)

# Define an array of utilization rates
UTIL_RATES=(0.85 0.90 0.95)

# Loop over each model
for MODEL in "${MODELS[@]}"; do
    # Loop over each utilization rate
    for RATE in "${UTIL_RATES[@]}"; do
        # Extract a simple model name from the path for easier identification
        MODEL_NAME=$(basename "$MODEL")

        # Define the log file name based on the model and rate
        LOG_FILE="$LOG_DIR/${MODEL_NAME}_utilization_${RATE}.log"

        # Execute the command and redirect output to the log file
        # Sometimes you might need to set --max-model-len if memory is not enough
        # --load-in-low-bit accepts [sym_int4, fp8, fp16]
        python3 /llm/benchmark_vllm_throughput.py \
            --backend vllm \
            --dataset /llm/ShareGPT_V3_unfiltered_cleaned_split.json \
            --model $MODEL \
            --num-prompts 1000 \
            --seed 42 \
            --trust-remote-code \
            --enforce-eager \
            --dtype float16 \
            --load-in-low-bit sym_int4 \
            --device xpu \
            --gpu-memory-utilization $RATE &> "$LOG_FILE"
    done
done

# Inform the user that the script has completed its execution
echo "All benchmarks have been executed and logged."
```
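Once the sweep finishes, one quick way to compare the runs is to pull the summary line out of each log, assuming the vLLM throughput script prints a line containing "Throughput" (the upstream script typically does):

```bash
# Print the throughput summary line from each log, prefixed with the log file name
grep -H "Throughput" "$LOG_DIR"/*.log
```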