Update oneccl used (intel-analytics#11647)
* Add internal oneccl

* fix

* fix

* add oneccl
gc-fu authored Jul 26, 2024
1 parent a4d30a8 commit 86fc049
Showing 6 changed files with 259 additions and 3 deletions.
33 changes: 32 additions & 1 deletion docker/llm/serving/xpu/docker/Dockerfile
@@ -1,16 +1,38 @@
FROM intelanalytics/ipex-llm-serving-xpu:latest as build

ARG http_proxy
ARG https_proxy

ADD ./oneccl-binding.patch /tmp/oneccl-binding.patch

RUN cd /tmp/ && \
pip install --upgrade setuptools wheel twine && \
pip install "setuptools<70.0.0" && \
git clone https://github.com/intel/torch-ccl -b v2.1.100+xpu && \
cd torch-ccl && \
patch -p1 < /tmp/oneccl-binding.patch && \
git submodule sync && \
git submodule update --init --recursive && \
COMPUTE_BACKEND=dpcpp python setup.py sdist bdist_wheel && \
mv /tmp/torch-ccl/dist/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl /tmp/


FROM intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT

ARG http_proxy
ARG https_proxy

# Disable pip's cache behavior
ARG PIP_NO_CACHE_DIR=false
COPY --from=build /tmp/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl /tmp/
ADD ./gradio_web_server.patch /tmp/gradio_web_server.patch

# Install Serving Dependencies
# Install ipex-llm[serving] only will update ipex_llm source code without updating
# bigdl-core-xe, which will lead to problems
RUN apt-get update && \
apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev && \
apt-get install -y intel-opencl-icd intel-level-zero-gpu=1.3.26241.33-647~22.04 level-zero level-zero-dev --allow-downgrades && \
pip install --pre --upgrade ipex-llm[xpu,serving] && \
pip install transformers==4.37.0 gradio==4.19.2 && \
# Install vLLM-v2 dependencies
@@ -24,7 +46,16 @@ RUN apt-get update && \
pip install transformers_stream_generator einops tiktoken && \
# For pipeline serving support
pip install mpi4py fastapi uvicorn openai && \
pip install gradio # for gradio web UI
# for gradio web UI
pip install gradio && \
# Install internal oneccl && \
cd /tmp/ && \
wget https://sourceforge.net/projects/oneccl-wks/files/oneccl_wks_installer_2024.0.0.2.sh && \
bash oneccl_wks_installer_2024.0.0.2.sh && \
pip uninstall -y oneccl_bind_pt && \
pip install /tmp/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl && \
rm /tmp/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl && \
patch /usr/local/lib/python3.11/dist-packages/fastchat/serve/gradio_web_server.py < /tmp/gradio_web_server.patch

COPY ./vllm_online_benchmark.py /llm/
COPY ./vllm_offline_inference.py /llm/
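The two-stage build above first rebuilds the torch-ccl (oneccl_bind_pt) wheel against oneccl-binding.patch, then installs that wheel together with the internal oneCCL runtime (the oneccl_wks installer) into the serving image. A quick way to confirm the wiring inside the finished container is a short Python sanity check; this is only a sketch, assuming the default Python 3.11 environment the Dockerfile installs into:

```python
# Sketch of a post-build sanity check (run inside the container).
import torch
import intel_extension_for_pytorch as ipex   # registers the XPU ("xpu") device with torch
import oneccl_bindings_for_pytorch           # module shipped by the rebuilt oneccl_bind_pt wheel

print("torch:", torch.__version__)
print("ipex:", ipex.__version__)
print("xpu available:", torch.xpu.is_available(),
      "| device count:", torch.xpu.device_count())
```

If the imports succeed and an XPU device is reported, the patched binding and the GPU stack are in place; distributed runs additionally need the environment from /opt/intel/1ccl-wks/setvars.sh, which the start scripts below source.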
208 changes: 208 additions & 0 deletions docker/llm/serving/xpu/docker/gradio_web_server.patch
@@ -0,0 +1,208 @@
--- gradio_web_server.py 2024-06-20 14:21:48.013518726 +0800
+++ gradio_web_server_new.py 2024-06-20 14:23:09.822830709 +0800
@@ -9,8 +9,10 @@
import json
import os
import random
+import pandas as pd
import time
import uuid
+import numpy as np

import gradio as gr
import requests
@@ -241,7 +243,7 @@
ip = get_ip(request)
logger.info(f"clear_history. ip: {ip}")
state = None
- return (state, [], "", None) + (disable_btn,) * 5
+ return (state, [], "", None, "", "", "", "") + (disable_btn,) * 5


def get_ip(request: gr.Request):
@@ -354,6 +356,18 @@
return None


+def handle_latency_metrics(first_token_time, next_token_time):
+ # next token time is a numpy array...
+ # first token time might be None
+ first_token_latency = "None"
+ next_token_latency = "None"
+ if first_token_time is not None:
+ first_token_latency = str(first_token_time * 1000) + " ms"
+ if next_token_time.size > 0:
+ next_token_latency = str(np.mean(next_token_time) * 1000) + " ms"
+ return first_token_latency, next_token_latency
+
+
def bot_response(
state,
temperature,
@@ -372,7 +386,7 @@
if state.skip_next:
# This generate call is skipped due to invalid inputs
state.skip_next = False
- yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5
return

if apply_rate_limit:
@@ -381,7 +395,7 @@
error_msg = RATE_LIMIT_MSG + "\n\n" + ret["reason"]
logger.info(f"rate limit reached. ip: {ip}. error_msg: {ret['reason']}")
state.conv.update_last_message(error_msg)
- yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5
return

conv, model_name = state.conv, state.model_name
@@ -404,6 +418,10 @@
yield (
state,
state.to_gradio_chatbot(),
+ "None",
+ "None",
+ "None",
+ "None",
disable_btn,
disable_btn,
disable_btn,
@@ -444,18 +462,32 @@
)

conv.update_last_message("▌")
- yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
+ # We probably need to change this method
+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (disable_btn,) * 5
+ prompt_tokens = 0
+ generated_tokens = 0
+ first_token_latency = None
+ next_token_latencies = np.array([])
+ start_time = time.time()

try:
for i, data in enumerate(stream_iter):
if data["error_code"] == 0:
+ prompt_tokens = data["usage"]["prompt_tokens"]
+ generated_tokens = data["usage"]["completion_tokens"]
output = data["text"].strip()
conv.update_last_message(output + "▌")
- yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
+ if first_token_latency is None:
+ first_token_latency = time.time() - start_time
+ else:
+ next_token_latencies = np.append(next_token_latencies, time.time() - start_time)
+ start_time = time.time()
+ first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies)
+ yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (disable_btn,) * 5
else:
output = data["text"] + f"\n\n(error_code: {data['error_code']})"
conv.update_last_message(output)
- yield (state, state.to_gradio_chatbot()) + (
+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
disable_btn,
disable_btn,
disable_btn,
@@ -465,13 +497,14 @@
return
output = data["text"].strip()
conv.update_last_message(output)
- yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5
+ first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies)
+ yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (enable_btn,) * 5
except requests.exceptions.RequestException as e:
conv.update_last_message(
f"{SERVER_ERROR_MSG}\n\n"
f"(error_code: {ErrorCode.GRADIO_REQUEST_ERROR}, {e})"
)
- yield (state, state.to_gradio_chatbot()) + (
+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
disable_btn,
disable_btn,
disable_btn,
@@ -484,7 +517,7 @@
f"{SERVER_ERROR_MSG}\n\n"
f"(error_code: {ErrorCode.GRADIO_STREAM_UNKNOWN_ERROR}, {e})"
)
- yield (state, state.to_gradio_chatbot()) + (
+ yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
disable_btn,
disable_btn,
disable_btn,
@@ -646,7 +679,8 @@
)

notice_markdown = f"""
-# 🏔️ Chat with Open Large Language Models
+# 🏔️ ChatBot based Xeon-W & Arc GPUs
+### Deployed with IPEX-LLM
{promotion}
"""

@@ -691,6 +725,26 @@
regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
clear_btn = gr.Button(value="🗑️ Clear history", interactive=False)

+ with gr.Row():
+ with gr.Column():
+ gr.Markdown("### Performance Metrics")
+ prompt_token = gr.Textbox(
+ label="Prompt token length:",
+ interactive=False,
+ )
+ next_token = gr.Textbox(
+ label="Generated token length:",
+ interactive=False,
+ )
+ first_token_latency = gr.Textbox(
+ interactive=False,
+ label="First token Latency:",
+ )
+ next_token_latency = gr.Textbox(
+ interactive=False,
+ label="Next token Latency:",
+ )
+
with gr.Accordion("Parameters", open=False) as parameter_row:
temperature = gr.Slider(
minimum=0.0,
@@ -743,9 +797,9 @@
).then(
bot_response,
[state, temperature, top_p, max_output_tokens],
- [state, chatbot] + btn_list,
+ [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
)
- clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox] + btn_list)
+ clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list)

model_selector.change(
clear_history, None, [state, chatbot, textbox, imagebox] + btn_list
@@ -758,7 +812,7 @@
).then(
bot_response,
[state, temperature, top_p, max_output_tokens],
- [state, chatbot] + btn_list,
+ [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
)
send_btn.click(
add_text,
@@ -767,7 +821,7 @@
).then(
bot_response,
[state, temperature, top_p, max_output_tokens],
- [state, chatbot] + btn_list,
+ [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
)

return [state, model_selector]
@@ -775,7 +829,7 @@

def build_demo(models):
with gr.Blocks(
- title="Chat with Open Large Language Models",
+ title="ChatBot based Xeon-W & Arc GPUs",
theme=gr.themes.Default(),
css=block_css,
) as demo:
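The core of this patch is the handle_latency_metrics helper plus the extra yield outputs that feed the new "Performance Metrics" textboxes: first-token latency is the wall time from the start of streaming to the first chunk, and next-token latency is the mean gap between subsequent chunks. Below is a standalone sketch of that logic (formatting tweaked slightly from the patch, which uses str(); the timing values are made up for illustration):

```python
import numpy as np

def handle_latency_metrics(first_token_time, next_token_time):
    # first_token_time may be None; next_token_time is a numpy array of per-token gaps
    first_token_latency = "None"
    next_token_latency = "None"
    if first_token_time is not None:
        first_token_latency = f"{first_token_time * 1000:.1f} ms"
    if next_token_time.size > 0:
        next_token_latency = f"{np.mean(next_token_time) * 1000:.1f} ms"
    return first_token_latency, next_token_latency

# Example: 120 ms to the first token, then three ~30 ms decode steps.
print(handle_latency_metrics(0.120, np.array([0.031, 0.029, 0.030])))
# -> ('120.0 ms', '30.0 ms')
```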
14 changes: 14 additions & 0 deletions docker/llm/serving/xpu/docker/oneccl-binding.patch
@@ -0,0 +1,14 @@
diff --git a/src/gpu/dpcpp_ccl.cpp b/src/gpu/dpcpp_ccl.cpp
index 3bd8087..c5b5ce3 100644
--- a/src/gpu/dpcpp_ccl.cpp
+++ b/src/gpu/dpcpp_ccl.cpp
@@ -689,7 +689,8 @@ c10::intrusive_ptr<ProcessGroupCCL::AsyncWorkCCL> XPUCCLStubs::allreduce_(std::v
stream,
attr), stream.get_native());
});
- // printf("Use One CCL allreduce.\n");
+ stream.get_native().wait();
+ // printf("Use One CCL allreduce.\n");
return ret_evt;
},
c10d::OpType::ALLREDUCE);
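This one-line change makes the XPU allreduce in torch-ccl wait on the native SYCL stream before returning, so the collective is blocking from the caller's point of view. A hedged sketch of the call path it affects — a torch.distributed program using the "ccl" backend on XPU, with placeholder rendezvous values that mpirun or torchrun would normally provide:

```python
import os
import torch
import torch.distributed as dist
import intel_extension_for_pytorch  # noqa: F401 - XPU device support
import oneccl_bindings_for_pytorch  # noqa: F401 - registers the "ccl" backend

# Placeholder rendezvous settings; real launches get these from mpirun/torchrun.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
rank = int(os.environ.get("RANK", "0"))
world_size = int(os.environ.get("WORLD_SIZE", "1"))

dist.init_process_group(backend="ccl", rank=rank, world_size=world_size)
t = torch.ones(4, device=f"xpu:{rank}")
dist.all_reduce(t)   # with the patch, this returns only after the native stream has finished
print(rank, t.cpu())
dist.destroy_process_group()
```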
@@ -1,4 +1,5 @@
cd /llm/lightweight_serving
model_path="/llm/models/Llama-2-7b-chat-hf"
low_bit="sym_int4"
source /opt/intel/1ccl-wks/setvars.sh
python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit
3 changes: 2 additions & 1 deletion docker/llm/serving/xpu/docker/start-pp_serving-service.sh
@@ -6,7 +6,8 @@ export OMP_NUM_THREADS=32
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
basekit_root=/opt/intel/oneapi
source $basekit_root/setvars.sh --force
source $basekit_root/ccl/latest/env/vars.sh --force
# source $basekit_root/ccl/latest/env/vars.sh --force
source /opt/intel/1ccl-wks/setvars.sh

export USE_XETLA=OFF
if [[ $KERNEL_VERSION != *"6.5"* ]]; then
3 changes: 2 additions & 1 deletion docker/llm/serving/xpu/docker/start-vllm-service.sh
@@ -2,7 +2,8 @@
model="YOUR_MODEL_PATH"
served_model_name="YOUR_MODEL_NAME"


source /opt/intel/1ccl-wks/setvars.sh

python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
--served-model-name $served_model_name \
--port 8000 \
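Once start-vllm-service.sh is running, the container exposes an OpenAI-compatible API on port 8000. A hypothetical client-side check, assuming the server is reachable at localhost and served_model_name was set to "llama-2-7b-chat" (both placeholders):

```python
import requests

payload = {
    "model": "llama-2-7b-chat",   # must match --served-model-name
    "prompt": "What is oneCCL?",
    "max_tokens": 64,
    "temperature": 0.0,
}
resp = requests.post("http://localhost:8000/v1/completions", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```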
